lamindb 0.76.8__py3-none-any.whl → 0.76.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +113 -113
- lamindb/_artifact.py +1205 -1205
- lamindb/_can_validate.py +579 -579
- lamindb/_collection.py +389 -387
- lamindb/_curate.py +1601 -1601
- lamindb/_feature.py +155 -155
- lamindb/_feature_set.py +242 -242
- lamindb/_filter.py +23 -23
- lamindb/_finish.py +256 -256
- lamindb/_from_values.py +382 -382
- lamindb/_is_versioned.py +40 -40
- lamindb/_parents.py +476 -476
- lamindb/_query_manager.py +125 -125
- lamindb/_query_set.py +362 -362
- lamindb/_record.py +649 -649
- lamindb/_run.py +57 -57
- lamindb/_save.py +308 -308
- lamindb/_storage.py +14 -14
- lamindb/_transform.py +127 -127
- lamindb/_ulabel.py +56 -56
- lamindb/_utils.py +9 -9
- lamindb/_view.py +72 -72
- lamindb/core/__init__.py +94 -94
- lamindb/core/_context.py +574 -574
- lamindb/core/_data.py +438 -438
- lamindb/core/_feature_manager.py +867 -867
- lamindb/core/_label_manager.py +253 -253
- lamindb/core/_mapped_collection.py +631 -597
- lamindb/core/_settings.py +187 -187
- lamindb/core/_sync_git.py +138 -138
- lamindb/core/_track_environment.py +27 -27
- lamindb/core/datasets/__init__.py +59 -59
- lamindb/core/datasets/_core.py +581 -571
- lamindb/core/datasets/_fake.py +36 -36
- lamindb/core/exceptions.py +90 -90
- lamindb/core/fields.py +12 -12
- lamindb/core/loaders.py +164 -164
- lamindb/core/schema.py +56 -56
- lamindb/core/storage/__init__.py +25 -25
- lamindb/core/storage/_anndata_accessor.py +740 -740
- lamindb/core/storage/_anndata_sizes.py +41 -41
- lamindb/core/storage/_backed_access.py +98 -98
- lamindb/core/storage/_tiledbsoma.py +204 -204
- lamindb/core/storage/_valid_suffixes.py +21 -21
- lamindb/core/storage/_zarr.py +110 -110
- lamindb/core/storage/objects.py +62 -62
- lamindb/core/storage/paths.py +172 -172
- lamindb/core/subsettings/__init__.py +12 -12
- lamindb/core/subsettings/_creation_settings.py +38 -38
- lamindb/core/subsettings/_transform_settings.py +21 -21
- lamindb/core/types.py +19 -19
- lamindb/core/versioning.py +158 -158
- lamindb/integrations/__init__.py +12 -12
- lamindb/integrations/_vitessce.py +107 -107
- lamindb/setup/__init__.py +14 -14
- lamindb/setup/core/__init__.py +4 -4
- {lamindb-0.76.8.dist-info → lamindb-0.76.9.dist-info}/LICENSE +201 -201
- {lamindb-0.76.8.dist-info → lamindb-0.76.9.dist-info}/METADATA +4 -4
- lamindb-0.76.9.dist-info/RECORD +60 -0
- {lamindb-0.76.8.dist-info → lamindb-0.76.9.dist-info}/WHEEL +1 -1
- lamindb-0.76.8.dist-info/RECORD +0 -60
lamindb/_parents.py
CHANGED
@@ -1,476 +1,476 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
import builtins
|
4
|
-
from typing import TYPE_CHECKING, Literal
|
5
|
-
|
6
|
-
import lamindb_setup as ln_setup
|
7
|
-
from lamin_utils import logger
|
8
|
-
from lnschema_core import Artifact, Collection, Record, Run, Transform
|
9
|
-
from lnschema_core.models import HasParents, format_field_value
|
10
|
-
|
11
|
-
from lamindb._utils import attach_func_to_class_method
|
12
|
-
|
13
|
-
from ._record import get_name_field
|
14
|
-
|
15
|
-
if TYPE_CHECKING:
|
16
|
-
from lnschema_core.types import StrField
|
17
|
-
|
18
|
-
# Colors used when rendering graphs with graphviz in this module.
LAMIN_GREEN_LIGHTER = "#10b981"  # fill of the node for the record being viewed
LAMIN_GREEN_DARKER = "#065f46"  # node outline color
GREEN_FILL = "honeydew"  # default node fill
# Emoji per transform type; lookups in this module fall back to "💫".
TRANSFORM_EMOJIS = {"notebook": "📔", "app": "🖥️", "pipeline": "🧩"}
# Truthy when `builtins.__IPYTHON__` is set, False otherwise.
is_run_from_ipython = getattr(builtins, "__IPYTHON__", False)
|
23
|
-
|
24
|
-
|
25
|
-
def _transform_emoji(transform: Transform):
    """Return the emoji for a transform.

    Falls back to the pipeline emoji when `transform` is `None` and to "💫"
    when the transform's type has no entry in `TRANSFORM_EMOJIS`.
    """
    if transform is None:
        return TRANSFORM_EMOJIS["pipeline"]
    return TRANSFORM_EMOJIS.get(transform.type, "💫")
|
30
|
-
|
31
|
-
|
32
|
-
def _view(u):
    """Show the graphviz graph `u` in a way that fits the current environment.

    Returns:
        `u` itself outside of IPython, the result of `u.view()` in a terminal
        IPython shell, and `None` otherwise (after displaying the graph inline
        or after logging that the graphviz executable is missing).
    """
    from graphviz.backend import ExecutableNotFound

    try:
        if not is_run_from_ipython:
            return u

        from IPython import get_ipython
        from IPython.display import display

        shell_name = get_ipython().__class__.__name__
        # a terminal IPython shell cannot render inline, use the external viewer
        if shell_name == "TerminalInteractiveShell":
            return u.view()
        # build the mime bundle ourselves so that a rendering exception is
        # raised here and not just printed by display()
        display(u._repr_mimebundle_(), raw=True)
    except (FileNotFoundError, RuntimeError, ExecutableNotFound):  # pragma: no cover
        logger.error(
            "please install the graphviz executable on your system:\n - Ubuntu: `sudo"
            " apt-get install graphviz`\n - Windows:"
            " https://graphviz.org/download/#windows\n - Mac: `brew install graphviz`"
        )
|
55
|
-
|
56
|
-
|
57
|
-
def view_parents(
    self,
    field: StrField | None = None,
    with_children: bool = False,
    distance: int = 5,
):
    """View the parents of this record as a graph.

    Args:
        field: Field used to label nodes; defaults to the result of
            `get_name_field(self)`. A non-string value is resolved through
            its `.field.name`.
        with_children: Passed through to `_view_parents`.
        distance: Passed through to `_view_parents`.
    """
    label_field = get_name_field(self) if field is None else field
    if not isinstance(label_field, str):
        label_field = label_field.field.name

    return _view_parents(
        record=self,
        field=label_field,
        with_children=with_children,
        distance=distance,
    )
|
71
|
-
|
72
|
-
|
73
|
-
def view_lineage(data: Artifact | Collection, with_children: bool = True) -> None:
    """Graph of data flow.

    Args:
        data: The artifact or collection whose lineage is drawn.
        with_children: Whether to also include the runs collected by
            `_get_all_child_runs`.

    Notes:
        For more info, see use cases: :doc:`docs:data-flow`.

    Examples:
        >>> collection.view_lineage()
        >>> artifact.view_lineage()
    """
    import graphviz

    df_values = _get_all_parent_runs(data)
    if with_children:
        df_values += _get_all_child_runs(data)
    df_edges = _df_edges_from_runs(df_values)

    data_label = _record_label(data)

    def add_node(
        record: Run | Artifact | Collection,
        node_id: str,
        node_label: str,
        u: graphviz.Digraph,
    ):
        # runs are drawn grey, data records with the default green fill
        if isinstance(record, Run):
            fillcolor = "gainsboro"
        else:
            fillcolor = GREEN_FILL
        u.node(
            node_id,
            label=node_label,
            shape="box",
            style="rounded,filled",
            fillcolor=fillcolor,
        )

    u = graphviz.Digraph(
        f"{data._meta.model_name}_{data.uid}",
        node_attr={
            "fillcolor": GREEN_FILL,
            "color": LAMIN_GREEN_DARKER,
            "fontname": "Helvetica",
            "fontsize": "10",
        },
        edge_attr={"arrowsize": "0.5"},
    )

    # `x in series` tests membership in the index, not in the values, so the
    # check has to run against an explicit collection of source node ids
    source_ids = set(df_edges["source"])
    for _, row in df_edges.iterrows():
        add_node(row["source_record"], row["source"], row["source_label"], u)
        # a target that is also a source gets its node when it is visited as a source
        if row["target"] not in source_ids:
            add_node(row["target_record"], row["target"], row["target_label"], u)

        u.edge(row["source"], row["target"], color="dimgrey")
    # label the searched file
    u.node(
        f"{data._meta.model_name}_{data.uid}",
        label=data_label,
        style="rounded,filled",
        fillcolor=LAMIN_GREEN_LIGHTER,
        shape="box",
    )

    _view(u)
|
137
|
-
|
138
|
-
|
139
|
-
def _view_parents(
    record: Record,
    field: str,
    with_children: bool = False,
    distance: int = 100,
    attr_name: Literal["parents", "predecessors"] = "parents",
):
    """Graph of parents.

    Args:
        record: Record the graph is centered on.
        field: Field used to query relatives and to label nodes.
        with_children: Also add the edges obtained with `children=True`.
        distance: Passed through to `_df_edges_from_parents`.
        attr_name: Relation attribute that `record` must have.

    Raises:
        NotImplementedError: If `record` has no attribute `attr_name`.
    """
    if not hasattr(record, attr_name):
        raise NotImplementedError(
            f"Parents view is not supported for {record.__class__.__name__}!"
        )
    import graphviz
    import pandas as pd

    shared_kwargs = {
        "record": record,
        "field": field,
        "distance": distance,
        "attr_name": attr_name,
    }
    df_edges = _df_edges_from_parents(**shared_kwargs)
    if with_children:
        child_edges = _df_edges_from_parents(children=True, **shared_kwargs)
        if child_edges is not None:
            if df_edges is None:
                df_edges = child_edges
            else:
                df_edges = pd.concat([df_edges, child_edges]).drop_duplicates()

    record_label = _record_label(record, field)

    u = graphviz.Digraph(
        record.uid,
        node_attr={
            "color": LAMIN_GREEN_DARKER,
            "fillcolor": GREEN_FILL,
            "shape": "box",
            "style": "rounded,filled",
            "fontname": "Helvetica",
            "fontsize": "10",
        },
        edge_attr={"arrowsize": "0.5"},
    )
    # transforms get their full label, everything else the field label plus emoji
    if record.__class__.__name__ == "Transform":
        focus_label = _record_label(record)
    else:
        focus_label = _add_emoji(record, record_label)
    u.node(record.uid, label=focus_label, fillcolor=LAMIN_GREEN_LIGHTER)

    if df_edges is not None:
        for _, edge in df_edges.iterrows():
            u.node(edge["source"], label=edge["source_label"])
            u.node(edge["target"], label=edge["target_label"])
            u.edge(edge["source"], edge["target"], color="dimgrey")

    _view(u)
|
206
|
-
|
207
|
-
|
208
|
-
def _get_parents(
|
209
|
-
record: Record,
|
210
|
-
field: str,
|
211
|
-
distance: int,
|
212
|
-
children: bool = False,
|
213
|
-
attr_name: Literal["parents", "predecessors"] = "parents",
|
214
|
-
):
|
215
|
-
"""Recursively get parent records within a distance."""
|
216
|
-
if children:
|
217
|
-
key = attr_name
|
218
|
-
else:
|
219
|
-
key = "children" if attr_name == "parents" else "successors" # type: ignore
|
220
|
-
model = record.__class__
|
221
|
-
condition = f"{key}__{field}"
|
222
|
-
results = model.filter(**{condition: record.__getattribute__(field)}).all()
|
223
|
-
if distance < 2:
|
224
|
-
return results
|
225
|
-
|
226
|
-
d = 2
|
227
|
-
while d < distance:
|
228
|
-
condition = f"{key}__{condition}"
|
229
|
-
records = model.filter(**{condition: record.__getattribute__(field)})
|
230
|
-
|
231
|
-
try:
|
232
|
-
if not records.exists():
|
233
|
-
return results
|
234
|
-
|
235
|
-
results = results | records.all()
|
236
|
-
d += 1
|
237
|
-
except Exception:
|
238
|
-
# For OperationalError:
|
239
|
-
# SQLite does not support joins containing more than 64 tables
|
240
|
-
return results
|
241
|
-
return results
|
242
|
-
|
243
|
-
|
244
|
-
def _df_edges_from_parents(
    record: Record,
    field: str,
    distance: int,
    children: bool = False,
    attr_name: Literal["parents", "predecessors"] = "parents",
):
    """Construct a DataFrame of edges as the input of graphviz.Digraph.

    Returns:
        `None` if the queried frame has no `<key>__id` column, otherwise a
        frame with the columns `target`, `source`, `source_record`,
        `target_record`, `source_label` and `target_label`, where `source`
        and `target` hold record uids.
    """
    if attr_name == "parents":
        forward, backward = "parents", "children"
    else:
        forward, backward = "predecessors", "successors"
    id_column = f"{backward if children else forward}__id"

    relatives = _get_parents(
        record=record,
        field=field,
        distance=distance,
        children=children,
        attr_name=attr_name,
    )
    manager = record.__class__.objects
    queryset = relatives | manager.filter(id=record.id)
    df = queryset.distinct().df(include=[id_column])
    if id_column not in df.columns:
        return None

    # one row per (record, related id) pair; the frame index becomes "target"
    edges = df[[id_column]].explode(id_column)
    edges.index.name = "target"
    edges = (
        edges.reset_index()
        .dropna(axis=0)
        .rename(columns={id_column: "source"})
        .drop_duplicates()
    )

    def fetch(record_id):
        return manager.get(id=record_id)

    edges["source_record"] = edges["source"].apply(fetch)
    edges["target_record"] = edges["target"].apply(fetch)

    if record.__class__.__name__ == "Transform":
        make_label = _record_label
    else:

        def make_label(rec):
            return _record_label(rec, field)

    edges["source_label"] = edges["source_record"].apply(make_label)
    edges["target_label"] = edges["target_record"].apply(make_label)
    # from here on, nodes are identified by uid instead of id
    edges["source"] = edges["source_record"].apply(lambda rec: rec.uid)
    edges["target"] = edges["target_record"].apply(lambda rec: rec.uid)
    return edges
|
293
|
-
|
294
|
-
|
295
|
-
def _record_label(record: Record, field: str | None = None):
    """Build the graphviz HTML-like label for a record.

    Ampersands in names are escaped as `&amp;` because graphviz parses the
    label as HTML-like markup.

    Args:
        record: The record to label. Artifacts, collections, runs and
            transforms get dedicated layouts.
        field: Attribute whose value is shown for any other record type;
            unused for the four dedicated layouts.

    Returns:
        The label string, wrapped in `<...>`.
    """
    if isinstance(record, Artifact):
        # fall back to the key when there is no description
        name = record.key if record.description is None else record.description
        if name is not None:
            name = name.replace("&", "&amp;")

        return (
            rf'<📄 {name}<BR/><FONT COLOR="GREY" POINT-SIZE="10"'
            rf' FACE="Monospace">uid={record.uid}<BR/>suffix={record.suffix}</FONT>>'
        )
    elif isinstance(record, Collection):
        name = record.name.replace("&", "&amp;")
        return (
            rf'<🍱 {name}<BR/><FONT COLOR="GREY" POINT-SIZE="10"'
            rf' FACE="Monospace">uid={record.uid}<BR/>version={record.version}</FONT>>'
        )
    elif isinstance(record, Run):
        name = record.transform.name.replace("&", "&amp;")
        # prefer the user's name, fall back to the handle
        user_display = (
            record.created_by.handle
            if record.created_by.name is None
            else record.created_by.name
        )
        return (
            rf'<{TRANSFORM_EMOJIS.get(str(record.transform.type), "💫")} {name}<BR/><FONT COLOR="GREY" POINT-SIZE="10"'
            rf' FACE="Monospace">uid={record.transform.uid}<BR/>type={record.transform.type},'
            rf" user={user_display}<BR/>run={format_field_value(record.started_at)}</FONT>>"
        )
    elif isinstance(record, Transform):
        name = record.name.replace("&", "&amp;")
        return (
            rf'<{TRANSFORM_EMOJIS.get(str(record.type), "💫")} {name}<BR/><FONT COLOR="GREY" POINT-SIZE="10"'
            rf' FACE="Monospace">uid={record.uid}<BR/>type={record.type},'
            rf" user={record.created_by.name}<BR/>updated_at={format_field_value(record.updated_at)}</FONT>>"
        )
    else:
        name = record.__getattribute__(field)
        return (
            rf'<{name}<BR/><FONT COLOR="GREY" POINT-SIZE="10"'
            rf' FACE="Monospace">uid={record.uid}</FONT>>'
        )
|
337
|
-
|
338
|
-
|
339
|
-
def _add_emoji(record: Record, label: str):
    """Prefix `label` with the transform emoji of `record`.

    Transforms and runs get the emoji of their transform type ("💫" for an
    unknown type); any other record gets an empty prefix, so the result then
    starts with a single space.
    """
    class_name = record.__class__.__name__
    if class_name == "Transform":
        transform_type = record.type
    elif class_name == "Run":
        transform_type = record.transform.type
    else:
        return f" {label}"
    emoji = TRANSFORM_EMOJIS.get(transform_type, "💫")
    return f"{emoji} {label}"
|
347
|
-
|
348
|
-
|
349
|
-
def _get_all_parent_runs(data: Artifact | Collection) -> list:
    """Get all input file/collection runs recursively.

    Starting from `data.run`, records `(inputs, run)` and `(run, outputs)`
    pairs for every run and then continues with the runs of those inputs.

    Returns:
        A list of 2-tuples in which one side is a run and the other side the
        list of its inputs or outputs.
    """
    model_name = data._meta.model_name
    input_attr = f"input_{model_name}s"
    output_attr = f"output_{model_name}s"
    # for artifacts, collections are part of the lineage as well
    with_collections = model_name == "artifact"

    pairs = []
    frontier = [] if data.run is None else [data.run]
    while frontier:
        upstream = []
        for run in frontier:
            run_inputs = (
                run.__getattribute__(input_attr)
                .all()
                .filter(visibility__in=[0, 1])
                .list()
            )
            if with_collections:
                run_inputs.extend(
                    run.input_collections.all().filter(visibility__in=[0, 1]).list()
                )
            pairs.append((run_inputs, run))

            run_outputs = (
                run.__getattribute__(output_attr)
                .all()
                .filter(visibility__in=[0, 1])
                .list()
            )
            if with_collections:
                run_outputs.extend(
                    run.output_collections.all().filter(visibility__in=[0, 1]).list()
                )
            pairs.append((run, run_outputs))

            upstream.extend(run_inputs)
        # next round: the runs that created this round's inputs
        frontier = [rec.run for rec in upstream if rec.run is not None]
    return pairs
|
383
|
-
|
384
|
-
|
385
|
-
def _get_all_child_runs(data: Artifact | Collection) -> list:
    """Get all output file/collection runs recursively.

    Starting from the runs of the outputs of `data.run`, records
    `(inputs, run)` and `(run, outputs)` pairs for every run and continues
    with the runs that take those outputs as inputs, until a round yields no
    run that was not seen before.

    Returns:
        A list of 2-tuples in which one side is a run and the other side the
        list of its inputs or outputs.
    """
    model_name = data._meta.model_name
    input_attr = f"input_{model_name}s"
    output_attr = f"output_{model_name}s"
    # for artifacts, also include collections in the lineage
    with_collections = model_name == "artifact"

    seen: set[Run] = set()
    pairs = []

    runs = set()
    if data.run is not None:
        runs = {rec.run for rec in data.run.__getattribute__(output_attr).all()}
        if with_collections:
            runs.update(
                rec.run
                for rec in data.run.output_collections.all()
                .filter(visibility__in=[0, 1])
                .all()
            )

    while runs - seen:
        seen |= runs
        next_runs: set[Run] = set()
        for run in runs:
            run_inputs = (
                run.__getattribute__(input_attr)
                .all()
                .filter(visibility__in=[0, 1])
                .list()
            )
            if with_collections:
                run_inputs.extend(
                    run.input_collections.all().filter(visibility__in=[0, 1]).list()
                )
            pairs.append((run_inputs, run))

            run_outputs = (
                run.__getattribute__(output_attr)
                .all()
                .filter(visibility__in=[0, 1])
                .list()
            )
            if with_collections:
                run_outputs.extend(
                    run.output_collections.all().filter(visibility__in=[0, 1]).list()
                )
            pairs.append((run, run_outputs))

            # runs that consume any of this run's outputs
            output_uids = [rec.uid for rec in run_outputs]
            next_runs.update(
                Run.filter(**{f"{input_attr}__uid__in": output_uids}).list()
            )
            if with_collections:
                next_runs.update(
                    Run.filter(input_collections__uid__in=output_uids).list()
                )
        runs = next_runs
    return pairs
|
446
|
-
|
447
|
-
|
448
|
-
def _df_edges_from_runs(df_values: list):
    """Turn (source, target) pairs into a DataFrame of graph edges.

    Each side of a pair may be a single record or a list of records; lists
    are exploded into one row per record, then duplicate rows and rows with
    missing values are dropped.

    Returns:
        A frame with the columns `source_record`, `target_record`, `source`,
        `target`, `source_label` and `target_label`, where `source` and
        `target` are node ids of the form `<model_name>_<uid>`.
    """
    import pandas as pd

    edges = (
        pd.DataFrame(df_values, columns=["source_record", "target_record"])
        .explode("source_record")
        .explode("target_record")
        .drop_duplicates()
        .dropna()
    )
    # two passes keep the column order: ids first, then labels
    for side in ("source", "target"):
        edges[side] = [
            f"{rec._meta.model_name}_{rec.uid}" for rec in edges[f"{side}_record"]
        ]
    for side in ("source", "target"):
        edges[f"{side}_label"] = edges[f"{side}_record"].apply(_record_label)
    return edges
|
460
|
-
|
461
|
-
|
462
|
-
# Names of the functions in this module that are attached to `HasParents` below.
METHOD_NAMES = [
    "view_parents",
]

if ln_setup._TESTING:  # type: ignore
    from inspect import signature

    # Signatures of the `HasParents` attributes, read before the attach loop
    # below runs — presumably so tests can compare them with the functions
    # defined in this module; confirm against the test suite.
    SIGS = {
        name: signature(getattr(HasParents, name))
        for name in METHOD_NAMES
        if not name.startswith("__")
    }

# `globals()` gives the helper access to the module-level functions by name.
for name in METHOD_NAMES:
    attach_func_to_class_method(name, HasParents, globals())
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import builtins
|
4
|
+
from typing import TYPE_CHECKING, Literal
|
5
|
+
|
6
|
+
import lamindb_setup as ln_setup
|
7
|
+
from lamin_utils import logger
|
8
|
+
from lnschema_core import Artifact, Collection, Record, Run, Transform
|
9
|
+
from lnschema_core.models import HasParents, format_field_value
|
10
|
+
|
11
|
+
from lamindb._utils import attach_func_to_class_method
|
12
|
+
|
13
|
+
from ._record import get_name_field
|
14
|
+
|
15
|
+
if TYPE_CHECKING:
|
16
|
+
from lnschema_core.types import StrField
|
17
|
+
|
18
|
+
# Colors used when rendering graphs with graphviz in this module.
LAMIN_GREEN_LIGHTER = "#10b981"  # fill of the node for the record being viewed
LAMIN_GREEN_DARKER = "#065f46"  # node outline color
GREEN_FILL = "honeydew"  # default node fill
# Emoji per transform type; lookups in this module fall back to "💫".
TRANSFORM_EMOJIS = {"notebook": "📔", "app": "🖥️", "pipeline": "🧩"}
# Truthy when `builtins.__IPYTHON__` is set, False otherwise.
is_run_from_ipython = getattr(builtins, "__IPYTHON__", False)
|
23
|
+
|
24
|
+
|
25
|
+
def _transform_emoji(transform: Transform):
    """Return the emoji for a transform.

    Falls back to the pipeline emoji when `transform` is `None` and to "💫"
    when the transform's type has no entry in `TRANSFORM_EMOJIS`.
    """
    if transform is None:
        return TRANSFORM_EMOJIS["pipeline"]
    return TRANSFORM_EMOJIS.get(transform.type, "💫")
|
30
|
+
|
31
|
+
|
32
|
+
def _view(u):
    """Show the graphviz graph `u` in a way that fits the current environment.

    Returns:
        `u` itself outside of IPython, the result of `u.view()` in a terminal
        IPython shell, and `None` otherwise (after displaying the graph inline
        or after logging that the graphviz executable is missing).
    """
    from graphviz.backend import ExecutableNotFound

    try:
        if not is_run_from_ipython:
            return u

        from IPython import get_ipython
        from IPython.display import display

        shell_name = get_ipython().__class__.__name__
        # a terminal IPython shell cannot render inline, use the external viewer
        if shell_name == "TerminalInteractiveShell":
            return u.view()
        # build the mime bundle ourselves so that a rendering exception is
        # raised here and not just printed by display()
        display(u._repr_mimebundle_(), raw=True)
    except (FileNotFoundError, RuntimeError, ExecutableNotFound):  # pragma: no cover
        logger.error(
            "please install the graphviz executable on your system:\n - Ubuntu: `sudo"
            " apt-get install graphviz`\n - Windows:"
            " https://graphviz.org/download/#windows\n - Mac: `brew install graphviz`"
        )
|
55
|
+
|
56
|
+
|
57
|
+
def view_parents(
    self,
    field: StrField | None = None,
    with_children: bool = False,
    distance: int = 5,
):
    """View the parents of this record as a graph.

    Args:
        field: Field used to label nodes; defaults to the result of
            `get_name_field(self)`. A non-string value is resolved through
            its `.field.name`.
        with_children: Passed through to `_view_parents`.
        distance: Passed through to `_view_parents`.
    """
    label_field = get_name_field(self) if field is None else field
    if not isinstance(label_field, str):
        label_field = label_field.field.name

    return _view_parents(
        record=self,
        field=label_field,
        with_children=with_children,
        distance=distance,
    )
|
71
|
+
|
72
|
+
|
73
|
+
def view_lineage(data: Artifact | Collection, with_children: bool = True) -> None:
    """Graph of data flow.

    Args:
        data: The artifact or collection whose lineage is drawn.
        with_children: Whether to also include the runs collected by
            `_get_all_child_runs`.

    Notes:
        For more info, see use cases: :doc:`docs:data-flow`.

    Examples:
        >>> collection.view_lineage()
        >>> artifact.view_lineage()
    """
    import graphviz

    df_values = _get_all_parent_runs(data)
    if with_children:
        df_values += _get_all_child_runs(data)
    df_edges = _df_edges_from_runs(df_values)

    data_label = _record_label(data)

    def add_node(
        record: Run | Artifact | Collection,
        node_id: str,
        node_label: str,
        u: graphviz.Digraph,
    ):
        # runs are drawn grey, data records with the default green fill
        if isinstance(record, Run):
            fillcolor = "gainsboro"
        else:
            fillcolor = GREEN_FILL
        u.node(
            node_id,
            label=node_label,
            shape="box",
            style="rounded,filled",
            fillcolor=fillcolor,
        )

    u = graphviz.Digraph(
        f"{data._meta.model_name}_{data.uid}",
        node_attr={
            "fillcolor": GREEN_FILL,
            "color": LAMIN_GREEN_DARKER,
            "fontname": "Helvetica",
            "fontsize": "10",
        },
        edge_attr={"arrowsize": "0.5"},
    )

    # `x in series` tests membership in the index, not in the values, so the
    # check has to run against an explicit collection of source node ids
    source_ids = set(df_edges["source"])
    for _, row in df_edges.iterrows():
        add_node(row["source_record"], row["source"], row["source_label"], u)
        # a target that is also a source gets its node when it is visited as a source
        if row["target"] not in source_ids:
            add_node(row["target_record"], row["target"], row["target_label"], u)

        u.edge(row["source"], row["target"], color="dimgrey")
    # label the searched file
    u.node(
        f"{data._meta.model_name}_{data.uid}",
        label=data_label,
        style="rounded,filled",
        fillcolor=LAMIN_GREEN_LIGHTER,
        shape="box",
    )

    _view(u)
|
137
|
+
|
138
|
+
|
139
|
+
def _view_parents(
    record: Record,
    field: str,
    with_children: bool = False,
    distance: int = 100,
    attr_name: Literal["parents", "predecessors"] = "parents",
):
    """Graph of parents.

    Args:
        record: Record the graph is centered on.
        field: Field used to query relatives and to label nodes.
        with_children: Also add the edges obtained with `children=True`.
        distance: Passed through to `_df_edges_from_parents`.
        attr_name: Relation attribute that `record` must have.

    Raises:
        NotImplementedError: If `record` has no attribute `attr_name`.
    """
    if not hasattr(record, attr_name):
        raise NotImplementedError(
            f"Parents view is not supported for {record.__class__.__name__}!"
        )
    import graphviz
    import pandas as pd

    shared_kwargs = {
        "record": record,
        "field": field,
        "distance": distance,
        "attr_name": attr_name,
    }
    df_edges = _df_edges_from_parents(**shared_kwargs)
    if with_children:
        child_edges = _df_edges_from_parents(children=True, **shared_kwargs)
        if child_edges is not None:
            if df_edges is None:
                df_edges = child_edges
            else:
                df_edges = pd.concat([df_edges, child_edges]).drop_duplicates()

    record_label = _record_label(record, field)

    u = graphviz.Digraph(
        record.uid,
        node_attr={
            "color": LAMIN_GREEN_DARKER,
            "fillcolor": GREEN_FILL,
            "shape": "box",
            "style": "rounded,filled",
            "fontname": "Helvetica",
            "fontsize": "10",
        },
        edge_attr={"arrowsize": "0.5"},
    )
    # transforms get their full label, everything else the field label plus emoji
    if record.__class__.__name__ == "Transform":
        focus_label = _record_label(record)
    else:
        focus_label = _add_emoji(record, record_label)
    u.node(record.uid, label=focus_label, fillcolor=LAMIN_GREEN_LIGHTER)

    if df_edges is not None:
        for _, edge in df_edges.iterrows():
            u.node(edge["source"], label=edge["source_label"])
            u.node(edge["target"], label=edge["target_label"])
            u.edge(edge["source"], edge["target"], color="dimgrey")

    _view(u)
|
206
|
+
|
207
|
+
|
208
|
+
def _get_parents(
|
209
|
+
record: Record,
|
210
|
+
field: str,
|
211
|
+
distance: int,
|
212
|
+
children: bool = False,
|
213
|
+
attr_name: Literal["parents", "predecessors"] = "parents",
|
214
|
+
):
|
215
|
+
"""Recursively get parent records within a distance."""
|
216
|
+
if children:
|
217
|
+
key = attr_name
|
218
|
+
else:
|
219
|
+
key = "children" if attr_name == "parents" else "successors" # type: ignore
|
220
|
+
model = record.__class__
|
221
|
+
condition = f"{key}__{field}"
|
222
|
+
results = model.filter(**{condition: record.__getattribute__(field)}).all()
|
223
|
+
if distance < 2:
|
224
|
+
return results
|
225
|
+
|
226
|
+
d = 2
|
227
|
+
while d < distance:
|
228
|
+
condition = f"{key}__{condition}"
|
229
|
+
records = model.filter(**{condition: record.__getattribute__(field)})
|
230
|
+
|
231
|
+
try:
|
232
|
+
if not records.exists():
|
233
|
+
return results
|
234
|
+
|
235
|
+
results = results | records.all()
|
236
|
+
d += 1
|
237
|
+
except Exception:
|
238
|
+
# For OperationalError:
|
239
|
+
# SQLite does not support joins containing more than 64 tables
|
240
|
+
return results
|
241
|
+
return results
|
242
|
+
|
243
|
+
|
244
|
+
def _df_edges_from_parents(
    record: Record,
    field: str,
    distance: int,
    children: bool = False,
    attr_name: Literal["parents", "predecessors"] = "parents",
):
    """Construct a DataFrame of edges as the input of graphviz.Digraph.

    Returns:
        `None` if the queried frame has no `<key>__id` column, otherwise a
        frame with the columns `target`, `source`, `source_record`,
        `target_record`, `source_label` and `target_label`, where `source`
        and `target` hold record uids.
    """
    if attr_name == "parents":
        forward, backward = "parents", "children"
    else:
        forward, backward = "predecessors", "successors"
    id_column = f"{backward if children else forward}__id"

    relatives = _get_parents(
        record=record,
        field=field,
        distance=distance,
        children=children,
        attr_name=attr_name,
    )
    manager = record.__class__.objects
    queryset = relatives | manager.filter(id=record.id)
    df = queryset.distinct().df(include=[id_column])
    if id_column not in df.columns:
        return None

    # one row per (record, related id) pair; the frame index becomes "target"
    edges = df[[id_column]].explode(id_column)
    edges.index.name = "target"
    edges = (
        edges.reset_index()
        .dropna(axis=0)
        .rename(columns={id_column: "source"})
        .drop_duplicates()
    )

    def fetch(record_id):
        return manager.get(id=record_id)

    edges["source_record"] = edges["source"].apply(fetch)
    edges["target_record"] = edges["target"].apply(fetch)

    if record.__class__.__name__ == "Transform":
        make_label = _record_label
    else:

        def make_label(rec):
            return _record_label(rec, field)

    edges["source_label"] = edges["source_record"].apply(make_label)
    edges["target_label"] = edges["target_record"].apply(make_label)
    # from here on, nodes are identified by uid instead of id
    edges["source"] = edges["source_record"].apply(lambda rec: rec.uid)
    edges["target"] = edges["target_record"].apply(lambda rec: rec.uid)
    return edges
|
293
|
+
|
294
|
+
|
295
|
+
def _record_label(record: Record, field: str | None = None):
    """Build the graphviz HTML-like label for a record.

    Ampersands in names are escaped as `&amp;` because graphviz parses the
    label as HTML-like markup.

    Args:
        record: The record to label. Artifacts, collections, runs and
            transforms get dedicated layouts.
        field: Attribute whose value is shown for any other record type;
            unused for the four dedicated layouts.

    Returns:
        The label string, wrapped in `<...>`.
    """
    if isinstance(record, Artifact):
        # fall back to the key when there is no description
        name = record.key if record.description is None else record.description
        if name is not None:
            name = name.replace("&", "&amp;")

        return (
            rf'<📄 {name}<BR/><FONT COLOR="GREY" POINT-SIZE="10"'
            rf' FACE="Monospace">uid={record.uid}<BR/>suffix={record.suffix}</FONT>>'
        )
    elif isinstance(record, Collection):
        name = record.name.replace("&", "&amp;")
        return (
            rf'<🍱 {name}<BR/><FONT COLOR="GREY" POINT-SIZE="10"'
            rf' FACE="Monospace">uid={record.uid}<BR/>version={record.version}</FONT>>'
        )
    elif isinstance(record, Run):
        name = record.transform.name.replace("&", "&amp;")
        # prefer the user's name, fall back to the handle
        user_display = (
            record.created_by.handle
            if record.created_by.name is None
            else record.created_by.name
        )
        return (
            rf'<{TRANSFORM_EMOJIS.get(str(record.transform.type), "💫")} {name}<BR/><FONT COLOR="GREY" POINT-SIZE="10"'
            rf' FACE="Monospace">uid={record.transform.uid}<BR/>type={record.transform.type},'
            rf" user={user_display}<BR/>run={format_field_value(record.started_at)}</FONT>>"
        )
    elif isinstance(record, Transform):
        name = record.name.replace("&", "&amp;")
        return (
            rf'<{TRANSFORM_EMOJIS.get(str(record.type), "💫")} {name}<BR/><FONT COLOR="GREY" POINT-SIZE="10"'
            rf' FACE="Monospace">uid={record.uid}<BR/>type={record.type},'
            rf" user={record.created_by.name}<BR/>updated_at={format_field_value(record.updated_at)}</FONT>>"
        )
    else:
        name = record.__getattribute__(field)
        return (
            rf'<{name}<BR/><FONT COLOR="GREY" POINT-SIZE="10"'
            rf' FACE="Monospace">uid={record.uid}</FONT>>'
        )
|
337
|
+
|
338
|
+
|
339
|
+
def _add_emoji(record: Record, label: str):
    """Prefix ``label`` with the emoji of the record's transform type.

    The record is classified by its class name. A ``Transform`` is looked up
    in ``TRANSFORM_EMOJIS`` by its own ``type``, a ``Run`` by the ``type`` of
    its transform; types missing from the table get "💫". Any other record
    gets an empty prefix, so the result is a single space followed by
    ``label``.
    """
    class_name = record.__class__.__name__
    if class_name == "Transform":
        transform_type = record.type
    elif class_name == "Run":
        transform_type = record.transform.type
    else:
        # No emoji for other records; the separating space is still emitted.
        return f" {label}"
    return f'{TRANSFORM_EMOJIS.get(transform_type, "💫")} {label}'
|
347
|
+
|
348
|
+
|
349
|
+
def _get_all_parent_runs(data: Artifact | Collection) -> list:
    """Collect the upstream lineage of ``data``, one generation at a time.

    Starting from ``data.run``, every visited run contributes two entries to
    the result: ``(inputs, run)`` and ``(run, outputs)``. The runs attached to
    those inputs form the next generation and are handled the same way.

    Only records whose ``visibility`` is 0 or 1 are collected. When ``data``
    is an artifact, each run's input and output collections are collected in
    addition to its input and output artifacts.

    Each run is visited at most once. A run shared by several inputs is
    therefore queried a single time, and a lineage that loops back onto
    itself cannot keep the traversal running forever.

    Args:
        data: Artifact or collection whose ancestry is collected.

    Returns:
        A list of 2-tuples in visiting order; each tuple pairs a run with a
        list of records (inputs first, or outputs second). Empty when
        ``data.run`` is None.
    """
    name = data._meta.model_name
    with_collections = name == "artifact"
    run_inputs_outputs = []
    seen_runs = set()

    runs = [data.run] if data.run is not None else []
    while runs:
        inputs = []
        for r in runs:
            # Skip runs already expanded in this or an earlier generation.
            if r in seen_runs:
                continue
            seen_runs.add(r)

            inputs_run = (
                getattr(r, f"input_{name}s")
                .all()
                .filter(visibility__in=[0, 1])
                .list()
            )
            if with_collections:
                inputs_run += (
                    r.input_collections.all().filter(visibility__in=[0, 1]).list()
                )
            run_inputs_outputs.append((inputs_run, r))

            outputs_run = (
                getattr(r, f"output_{name}s")
                .all()
                .filter(visibility__in=[0, 1])
                .list()
            )
            if with_collections:
                outputs_run += (
                    r.output_collections.all().filter(visibility__in=[0, 1]).list()
                )
            run_inputs_outputs.append((r, outputs_run))

            inputs += inputs_run
        # Next generation: the runs attached to everything consumed above.
        runs = [f.run for f in inputs if f.run is not None]
    return run_inputs_outputs
|
383
|
+
|
384
|
+
|
385
|
+
def _get_all_child_runs(data: Artifact | Collection) -> list:
    """Collect the downstream lineage of ``data``, one generation at a time.

    The first generation is the set of runs attached to the outputs of
    ``data.run`` (plus, for artifacts, to its visible output collections).
    For every run of a generation, ``(inputs, run)`` and ``(run, outputs)``
    are appended to the result, and the runs that take any of those outputs
    as input form the next generation. The walk stops once a generation
    contains no run that has not been seen before.

    Inside the loop only records with ``visibility`` 0 or 1 are collected.

    Returns:
        A list of 2-tuples, each pairing a run with a list of records.
        Empty when ``data.run`` is None.
    """
    name = data._meta.model_name
    is_artifact = name == "artifact"
    creator = data.run
    edges = []
    visited: set[Run] = set()

    frontier: set[Run] = set()
    if creator is not None:
        frontier = {rec.run for rec in getattr(creator, f"output_{name}s").all()}
        if is_artifact:
            visible_collections = (
                creator.output_collections.all().filter(visibility__in=[0, 1]).all()
            )
            frontier |= {rec.run for rec in visible_collections}

    while frontier - visited:
        visited |= frontier
        next_frontier: set[Run] = set()
        for run in frontier:
            consumed = (
                getattr(run, f"input_{name}s")
                .all()
                .filter(visibility__in=[0, 1])
                .list()
            )
            if is_artifact:
                consumed += (
                    run.input_collections.all().filter(visibility__in=[0, 1]).list()
                )
            edges.append((consumed, run))

            produced = (
                getattr(run, f"output_{name}s")
                .all()
                .filter(visibility__in=[0, 1])
                .list()
            )
            if is_artifact:
                produced += (
                    run.output_collections.all().filter(visibility__in=[0, 1]).list()
                )
            edges.append((run, produced))

            # Runs consuming anything produced here belong to the next generation.
            produced_uids = [rec.uid for rec in produced]
            next_frontier.update(
                Run.filter(**{f"input_{name}s__uid__in": produced_uids}).list()
            )
            if is_artifact:
                # For artifacts, collections are part of the lineage as well.
                next_frontier.update(
                    Run.filter(input_collections__uid__in=produced_uids).list()
                )
        frontier = next_frontier
    return edges
|
446
|
+
|
447
|
+
|
448
|
+
def _df_edges_from_runs(df_values: list):
    """Turn ``(source, target)`` lineage pairs into a table of graph edges.

    Either side of a pair may be a list of records; both sides are exploded
    so that each row links exactly one source record to one target record.
    Duplicate rows and rows with a missing side are dropped.

    Returns:
        A DataFrame with the columns ``source_record``, ``target_record``,
        ``source``, ``target`` (node ids of the form ``<model_name>_<uid>``),
        ``source_label`` and ``target_label`` (from ``_record_label``).
    """
    import pandas as pd

    def node_id(rec):
        return f"{rec._meta.model_name}_{rec.uid}"

    edges = (
        pd.DataFrame(df_values, columns=["source_record", "target_record"])
        .explode("source_record")
        .explode("target_record")
        .drop_duplicates()
        .dropna()
    )
    edges["source"] = [node_id(rec) for rec in edges["source_record"]]
    edges["target"] = [node_id(rec) for rec in edges["target_record"]]
    edges["source_label"] = edges["source_record"].apply(_record_label)
    edges["target_label"] = edges["target_record"].apply(_record_label)
    return edges
|
460
|
+
|
461
|
+
|
462
|
+
# Names handed to ``attach_func_to_class_method`` at the bottom of this module.
METHOD_NAMES = ["view_parents"]

if ln_setup._TESTING:  # type: ignore
    from inspect import signature

    # Signatures of the HasParents attributes, captured before the attachment
    # loop below runs. Dunder names are left out.
    SIGS = {
        method_name: signature(getattr(HasParents, method_name))
        for method_name in METHOD_NAMES
        if not method_name.startswith("__")
    }

# Pass each listed name, the HasParents class and this module's globals to
# the attachment helper.
for name in METHOD_NAMES:
    attach_func_to_class_method(name, HasParents, globals())
|