lamindb 0.69.8__py3-none-any.whl → 0.69.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +1 -1
- lamindb/_annotate.py +461 -126
- lamindb/_artifact.py +50 -16
- lamindb/_can_validate.py +13 -18
- lamindb/_collection.py +35 -43
- lamindb/_feature_set.py +20 -8
- lamindb/_from_values.py +21 -16
- lamindb/_registry.py +7 -2
- lamindb/core/__init__.py +16 -4
- lamindb/core/_data.py +5 -16
- lamindb/core/_feature_manager.py +74 -25
- lamindb/core/_label_manager.py +1 -1
- lamindb/core/_mapped_collection.py +106 -52
- lamindb/core/datasets/_core.py +41 -1
- lamindb/core/storage/_backed_access.py +8 -4
- lamindb/core/storage/file.py +12 -0
- lamindb/core/storage/object.py +19 -0
- lamindb/core/types.py +0 -2
- lamindb/setup/core/__init__.py +3 -14
- {lamindb-0.69.8.dist-info → lamindb-0.69.10.dist-info}/METADATA +7 -8
- {lamindb-0.69.8.dist-info → lamindb-0.69.10.dist-info}/RECORD +23 -23
- {lamindb-0.69.8.dist-info → lamindb-0.69.10.dist-info}/LICENSE +0 -0
- {lamindb-0.69.8.dist-info → lamindb-0.69.10.dist-info}/WHEEL +0 -0
lamindb/_artifact.py
CHANGED
@@ -22,7 +22,6 @@ from lamindb_setup.core.upath import (
|
|
22
22
|
from lnschema_core import Artifact, Run, Storage
|
23
23
|
from lnschema_core.models import IsTree
|
24
24
|
from lnschema_core.types import (
|
25
|
-
DataLike,
|
26
25
|
VisibilityChoice,
|
27
26
|
)
|
28
27
|
|
@@ -52,9 +51,11 @@ from .core._data import (
|
|
52
51
|
save_feature_sets,
|
53
52
|
)
|
54
53
|
from .core.storage.file import AUTO_KEY_PREFIX
|
54
|
+
from .core.storage.object import _mudata_is_installed
|
55
55
|
|
56
56
|
if TYPE_CHECKING:
|
57
57
|
from lamindb_setup.core.types import UPathStr
|
58
|
+
from mudata import MuData
|
58
59
|
|
59
60
|
from lamindb.core.storage._backed_access import AnnDataAccessor, BackedAccessor
|
60
61
|
|
@@ -114,7 +115,7 @@ def process_pathlike(
|
|
114
115
|
|
115
116
|
def process_data(
|
116
117
|
provisional_uid: str,
|
117
|
-
data: UPathStr |
|
118
|
+
data: UPathStr | pd.DataFrame | AnnData,
|
118
119
|
format: str | None,
|
119
120
|
key: str | None,
|
120
121
|
default_storage: Storage,
|
@@ -123,6 +124,13 @@ def process_data(
|
|
123
124
|
) -> tuple[Any, Path | UPath, str, Storage, bool]:
|
124
125
|
"""Serialize a data object that's provided as file or in memory."""
|
125
126
|
# if not overwritten, data gets stored in default storage
|
127
|
+
if _mudata_is_installed():
|
128
|
+
from mudata import MuData
|
129
|
+
|
130
|
+
data_types = (pd.DataFrame, AnnData, MuData)
|
131
|
+
else:
|
132
|
+
data_types = (pd.DataFrame, AnnData) # type:ignore
|
133
|
+
|
126
134
|
if isinstance(data, (str, Path, UPath)): # UPathStr, spelled out
|
127
135
|
access_token = (
|
128
136
|
default_storage._access_token
|
@@ -138,7 +146,7 @@ def process_data(
|
|
138
146
|
)
|
139
147
|
suffix = extract_suffix_from_path(path)
|
140
148
|
memory_rep = None
|
141
|
-
elif isinstance(data,
|
149
|
+
elif isinstance(data, data_types):
|
142
150
|
storage = default_storage
|
143
151
|
memory_rep = data
|
144
152
|
if key is not None:
|
@@ -297,7 +305,7 @@ def get_relative_path_to_directory(
|
|
297
305
|
|
298
306
|
def get_artifact_kwargs_from_data(
|
299
307
|
*,
|
300
|
-
data: Path | UPath | str | pd.DataFrame | AnnData,
|
308
|
+
data: Path | UPath | str | pd.DataFrame | AnnData | MuData,
|
301
309
|
key: str | None,
|
302
310
|
run: Run | None,
|
303
311
|
format: str | None,
|
@@ -427,22 +435,20 @@ def log_storage_hint(
|
|
427
435
|
logger.hint(hint)
|
428
436
|
|
429
437
|
|
430
|
-
def data_is_anndata(data:
|
438
|
+
def data_is_anndata(data: AnnData | UPathStr):
|
431
439
|
if isinstance(data, AnnData):
|
432
440
|
return True
|
433
441
|
if isinstance(data, (str, Path, UPath)):
|
434
442
|
return Path(data).suffix in {".h5ad", ".zrad"}
|
435
|
-
return False
|
443
|
+
return False
|
436
444
|
|
437
445
|
|
438
|
-
def data_is_mudata(data:
|
439
|
-
|
446
|
+
def data_is_mudata(data: MuData | UPathStr):
|
447
|
+
if _mudata_is_installed():
|
440
448
|
from mudata import MuData
|
441
|
-
except ModuleNotFoundError:
|
442
|
-
return False
|
443
449
|
|
444
|
-
|
445
|
-
|
450
|
+
if isinstance(data, MuData):
|
451
|
+
return True
|
446
452
|
if isinstance(data, (str, Path, UPath)):
|
447
453
|
return Path(data).suffix in {".h5mu"}
|
448
454
|
return False
|
@@ -456,6 +462,9 @@ def _check_accessor_artifact(data: Any, accessor: str | None = None):
|
|
456
462
|
elif data_is_anndata(data):
|
457
463
|
logger.warning("data is an AnnData, please use .from_anndata()")
|
458
464
|
accessor = "AnnData"
|
465
|
+
elif data_is_mudata(data):
|
466
|
+
logger.warning("data is a MuData, please use .from_mudata()")
|
467
|
+
accessor = "MuData"
|
459
468
|
else:
|
460
469
|
raise TypeError("data has to be a string, Path, UPath")
|
461
470
|
return accessor
|
@@ -620,6 +629,32 @@ def from_anndata(
|
|
620
629
|
return artifact
|
621
630
|
|
622
631
|
|
632
|
+
@classmethod # type: ignore
|
633
|
+
@doc_args(Artifact.from_mudata.__doc__)
|
634
|
+
def from_mudata(
|
635
|
+
cls,
|
636
|
+
mdata: MuData,
|
637
|
+
key: str | None = None,
|
638
|
+
description: str | None = None,
|
639
|
+
run: Run | None = None,
|
640
|
+
version: str | None = None,
|
641
|
+
is_new_version_of: Artifact | None = None,
|
642
|
+
**kwargs,
|
643
|
+
) -> Artifact:
|
644
|
+
"""{}."""
|
645
|
+
artifact = Artifact(
|
646
|
+
data=mdata,
|
647
|
+
key=key,
|
648
|
+
run=run,
|
649
|
+
description=description,
|
650
|
+
version=version,
|
651
|
+
is_new_version_of=is_new_version_of,
|
652
|
+
accessor="MuData",
|
653
|
+
**kwargs,
|
654
|
+
)
|
655
|
+
return artifact
|
656
|
+
|
657
|
+
|
623
658
|
@classmethod # type: ignore
|
624
659
|
@doc_args(Artifact.from_dir.__doc__)
|
625
660
|
def from_dir(
|
@@ -725,7 +760,7 @@ def from_dir(
|
|
725
760
|
# docstring handled through attach_func_to_class_method
|
726
761
|
def replace(
|
727
762
|
self,
|
728
|
-
data: UPathStr
|
763
|
+
data: UPathStr,
|
729
764
|
run: Run | None = None,
|
730
765
|
format: str | None = None,
|
731
766
|
) -> None:
|
@@ -808,9 +843,7 @@ def backed(self, is_run_input: bool | None = None) -> AnnDataAccessor | BackedAc
|
|
808
843
|
|
809
844
|
|
810
845
|
# docstring handled through attach_func_to_class_method
|
811
|
-
def load(
|
812
|
-
self, is_run_input: bool | None = None, stream: bool = False, **kwargs
|
813
|
-
) -> DataLike:
|
846
|
+
def load(self, is_run_input: bool | None = None, stream: bool = False, **kwargs) -> Any:
|
814
847
|
_track_run_input(self, is_run_input)
|
815
848
|
if hasattr(self, "_memory_rep") and self._memory_rep is not None:
|
816
849
|
return self._memory_rep
|
@@ -963,6 +996,7 @@ METHOD_NAMES = [
|
|
963
996
|
"__init__",
|
964
997
|
"from_anndata",
|
965
998
|
"from_df",
|
999
|
+
"from_mudata",
|
966
1000
|
"backed",
|
967
1001
|
"stage",
|
968
1002
|
"load",
|
lamindb/_can_validate.py
CHANGED
@@ -29,7 +29,7 @@ def inspect(
|
|
29
29
|
field: str | StrField | None = None,
|
30
30
|
*,
|
31
31
|
mute: bool = False,
|
32
|
-
|
32
|
+
organism: str | Registry | None = None,
|
33
33
|
) -> InspectResult:
|
34
34
|
"""{}."""
|
35
35
|
return _inspect(
|
@@ -37,7 +37,7 @@ def inspect(
|
|
37
37
|
values=values,
|
38
38
|
field=field,
|
39
39
|
mute=mute,
|
40
|
-
|
40
|
+
organism=organism,
|
41
41
|
)
|
42
42
|
|
43
43
|
|
@@ -49,10 +49,10 @@ def validate(
|
|
49
49
|
field: str | StrField | None = None,
|
50
50
|
*,
|
51
51
|
mute: bool = False,
|
52
|
-
|
52
|
+
organism: str | Registry | None = None,
|
53
53
|
) -> np.ndarray:
|
54
54
|
"""{}."""
|
55
|
-
return _validate(cls=cls, values=values, field=field, mute=mute,
|
55
|
+
return _validate(cls=cls, values=values, field=field, mute=mute, organism=organism)
|
56
56
|
|
57
57
|
|
58
58
|
def _inspect(
|
@@ -62,7 +62,7 @@ def _inspect(
|
|
62
62
|
*,
|
63
63
|
mute: bool = False,
|
64
64
|
using_key: str | None = None,
|
65
|
-
|
65
|
+
organism: str | Registry | None = None,
|
66
66
|
) -> pd.DataFrame | dict[str, list[str]]:
|
67
67
|
"""{}."""
|
68
68
|
from lamin_utils._inspect import inspect
|
@@ -77,20 +77,17 @@ def _inspect(
|
|
77
77
|
|
78
78
|
# inspect in the DB
|
79
79
|
result_db = inspect(
|
80
|
-
df=_filter_query_based_on_organism(
|
81
|
-
queryset=queryset, organism=kwargs.get("organism")
|
82
|
-
),
|
80
|
+
df=_filter_query_based_on_organism(queryset=queryset, organism=organism),
|
83
81
|
identifiers=values,
|
84
82
|
field=field,
|
85
83
|
mute=mute,
|
86
|
-
**kwargs,
|
87
84
|
)
|
88
85
|
nonval = set(result_db.non_validated).difference(result_db.synonyms_mapper.keys())
|
89
86
|
|
90
87
|
if len(nonval) > 0 and orm.__get_schema_name__() == "bionty":
|
91
88
|
try:
|
92
|
-
bionty_result = orm.public(organism=
|
93
|
-
values=nonval, field=field, mute=True
|
89
|
+
bionty_result = orm.public(organism=organism).inspect(
|
90
|
+
values=nonval, field=field, mute=True
|
94
91
|
)
|
95
92
|
bionty_validated = bionty_result.validated
|
96
93
|
bionty_mapper = bionty_result.synonyms_mapper
|
@@ -146,7 +143,7 @@ def _validate(
|
|
146
143
|
*,
|
147
144
|
mute: bool = False,
|
148
145
|
using_key: str | None = None,
|
149
|
-
|
146
|
+
organism: str | Registry | None = None,
|
150
147
|
) -> np.ndarray:
|
151
148
|
"""{}."""
|
152
149
|
from lamin_utils._inspect import validate
|
@@ -161,7 +158,7 @@ def _validate(
|
|
161
158
|
field_values = pd.Series(
|
162
159
|
_filter_query_based_on_organism(
|
163
160
|
queryset=queryset,
|
164
|
-
organism=
|
161
|
+
organism=organism,
|
165
162
|
values_list_field=field,
|
166
163
|
),
|
167
164
|
dtype="object",
|
@@ -173,7 +170,6 @@ def _validate(
|
|
173
170
|
case_sensitive=True,
|
174
171
|
mute=mute,
|
175
172
|
field=field,
|
176
|
-
**kwargs,
|
177
173
|
)
|
178
174
|
if return_str and len(result) == 1:
|
179
175
|
return result[0]
|
@@ -195,7 +191,7 @@ def standardize(
|
|
195
191
|
public_aware: bool = True,
|
196
192
|
keep: Literal["first", "last", False] = "first",
|
197
193
|
synonyms_field: str = "synonyms",
|
198
|
-
|
194
|
+
organism: str | Registry | None = None,
|
199
195
|
) -> list[str] | dict[str, str]:
|
200
196
|
"""{}."""
|
201
197
|
return _standardize(
|
@@ -209,7 +205,7 @@ def standardize(
|
|
209
205
|
public_aware=public_aware,
|
210
206
|
keep=keep,
|
211
207
|
synonyms_field=synonyms_field,
|
212
|
-
|
208
|
+
organism=organism,
|
213
209
|
)
|
214
210
|
|
215
211
|
|
@@ -258,7 +254,7 @@ def _standardize(
|
|
258
254
|
keep: Literal["first", "last", False] = "first",
|
259
255
|
synonyms_field: str = "synonyms",
|
260
256
|
using_key: str | None = None,
|
261
|
-
|
257
|
+
organism: str | Registry | None = None,
|
262
258
|
) -> list[str] | dict[str, str]:
|
263
259
|
"""{}."""
|
264
260
|
from lamin_utils._standardize import standardize as map_synonyms
|
@@ -274,7 +270,6 @@ def _standardize(
|
|
274
270
|
queryset = _queryset(cls, using_key)
|
275
271
|
orm = queryset.model
|
276
272
|
|
277
|
-
organism = kwargs.get("organism")
|
278
273
|
if _has_organism_field(orm):
|
279
274
|
# here, we can safely import lnschema_bionty
|
280
275
|
from lnschema_bionty._bionty import create_or_get_organism_record
|
lamindb/_collection.py
CHANGED
@@ -16,7 +16,7 @@ from lamin_utils import logger
|
|
16
16
|
from lamindb_setup.core._docs import doc_args
|
17
17
|
from lamindb_setup.core.hashing import hash_set
|
18
18
|
from lnschema_core.models import Collection, CollectionArtifact, FeatureSet
|
19
|
-
from lnschema_core.types import
|
19
|
+
from lnschema_core.types import VisibilityChoice
|
20
20
|
|
21
21
|
from lamindb._utils import attach_func_to_class_method
|
22
22
|
from lamindb.core._data import _track_run_input
|
@@ -40,17 +40,6 @@ if TYPE_CHECKING:
|
|
40
40
|
from ._query_set import QuerySet
|
41
41
|
|
42
42
|
|
43
|
-
def _check_accessor_collection(data: Any, accessor: str | None = None):
|
44
|
-
if accessor is None and isinstance(data, (AnnData, pd.DataFrame)):
|
45
|
-
if isinstance(data, pd.DataFrame):
|
46
|
-
logger.warning("data is a DataFrame, please use .from_df()")
|
47
|
-
accessor = "DataFrame"
|
48
|
-
elif data_is_anndata(data):
|
49
|
-
logger.warning("data is an AnnData, please use .from_anndata()")
|
50
|
-
accessor = "AnnData"
|
51
|
-
return accessor
|
52
|
-
|
53
|
-
|
54
43
|
def __init__(
|
55
44
|
collection: Collection,
|
56
45
|
*args,
|
@@ -61,11 +50,11 @@ def __init__(
|
|
61
50
|
return None
|
62
51
|
# now we proceed with the user-facing constructor
|
63
52
|
if len(args) > 1:
|
64
|
-
raise ValueError("Only one non-keyword arg allowed:
|
65
|
-
|
66
|
-
kwargs.pop("
|
53
|
+
raise ValueError("Only one non-keyword arg allowed: artifacts")
|
54
|
+
artifacts: Artifact | Iterable[Artifact] = (
|
55
|
+
kwargs.pop("artifacts") if len(args) == 0 else args[0]
|
67
56
|
)
|
68
|
-
meta:
|
57
|
+
meta: Artifact | None = kwargs.pop("meta") if "meta" in kwargs else None
|
69
58
|
name: str | None = kwargs.pop("name") if "name" in kwargs else None
|
70
59
|
description: str | None = (
|
71
60
|
kwargs.pop("description") if "description" in kwargs else None
|
@@ -87,14 +76,10 @@ def __init__(
|
|
87
76
|
feature_sets: dict[str, FeatureSet] = (
|
88
77
|
kwargs.pop("feature_sets") if "feature_sets" in kwargs else {}
|
89
78
|
)
|
90
|
-
accessor = kwargs.pop("accessor") if "accessor" in kwargs else None
|
91
|
-
if not isinstance(data, (Artifact, Iterable)):
|
92
|
-
accessor = _check_accessor_collection(data=data, accessor=accessor)
|
93
79
|
if not len(kwargs) == 0:
|
94
80
|
raise ValueError(
|
95
|
-
f"Only
|
81
|
+
f"Only artifacts, name, run, description, reference, reference_type, visibility can be passed, you passed: {kwargs}"
|
96
82
|
)
|
97
|
-
|
98
83
|
if is_new_version_of is None:
|
99
84
|
provisional_uid = init_uid(version=version, n_full_id=20)
|
100
85
|
else:
|
@@ -104,13 +89,13 @@ def __init__(
|
|
104
89
|
if name is None:
|
105
90
|
name = is_new_version_of.name
|
106
91
|
run = get_run(run)
|
107
|
-
if isinstance(
|
108
|
-
|
92
|
+
if isinstance(artifacts, Artifact):
|
93
|
+
artifacts = [artifacts]
|
109
94
|
else:
|
110
|
-
if not hasattr(
|
95
|
+
if not hasattr(artifacts, "__getitem__"):
|
111
96
|
raise ValueError("Artifact or List[Artifact] is allowed.")
|
112
|
-
assert isinstance(
|
113
|
-
hash, feature_sets = from_artifacts(
|
97
|
+
assert isinstance(artifacts[0], Artifact) # type: ignore
|
98
|
+
hash, feature_sets = from_artifacts(artifacts) # type: ignore
|
114
99
|
if meta is not None:
|
115
100
|
if not isinstance(meta, Artifact):
|
116
101
|
raise ValueError("meta has to be an Artifact")
|
@@ -153,12 +138,12 @@ def __init__(
|
|
153
138
|
visibility=visibility,
|
154
139
|
**kwargs,
|
155
140
|
)
|
156
|
-
collection._artifacts =
|
141
|
+
collection._artifacts = artifacts
|
157
142
|
collection._feature_sets = feature_sets
|
158
143
|
# register provenance
|
159
144
|
if is_new_version_of is not None:
|
160
145
|
_track_run_input(is_new_version_of, run=run)
|
161
|
-
_track_run_input(
|
146
|
+
_track_run_input(artifacts, run=run)
|
162
147
|
|
163
148
|
|
164
149
|
# internal function, not exposed to user
|
@@ -224,7 +209,9 @@ def from_artifacts(artifacts: Iterable[Artifact]) -> tuple[str, dict[str, str]]:
|
|
224
209
|
# docstring handled through attach_func_to_class_method
|
225
210
|
def mapped(
|
226
211
|
self,
|
227
|
-
|
212
|
+
layers_keys: str | list[str] | None = None,
|
213
|
+
obs_keys: str | list[str] | None = None,
|
214
|
+
obsm_keys: str | list[str] | None = None,
|
228
215
|
join: Literal["inner", "outer"] | None = "inner",
|
229
216
|
encode_labels: bool | list[str] = True,
|
230
217
|
unknown_label: str | dict[str, str] | None = None,
|
@@ -245,7 +232,9 @@ def mapped(
|
|
245
232
|
path_list.append(artifact.path)
|
246
233
|
ds = MappedCollection(
|
247
234
|
path_list,
|
248
|
-
|
235
|
+
layers_keys,
|
236
|
+
obs_keys,
|
237
|
+
obsm_keys,
|
249
238
|
join,
|
250
239
|
encode_labels,
|
251
240
|
unknown_label,
|
@@ -273,7 +262,7 @@ def load(
|
|
273
262
|
join: Literal["inner", "outer"] = "outer",
|
274
263
|
is_run_input: bool | None = None,
|
275
264
|
**kwargs,
|
276
|
-
) ->
|
265
|
+
) -> Any:
|
277
266
|
# cannot call _track_run_input here, see comment further down
|
278
267
|
all_artifacts = self.artifacts.all()
|
279
268
|
suffixes = [artifact.suffix for artifact in all_artifacts]
|
@@ -321,7 +310,7 @@ def delete(self, permanent: bool | None = None) -> None:
|
|
321
310
|
|
322
311
|
|
323
312
|
# docstring handled through attach_func_to_class_method
|
324
|
-
def save(self,
|
313
|
+
def save(self, transfer_labels: bool = False, using: str | None = None) -> None:
|
325
314
|
if self.artifact is not None:
|
326
315
|
self.artifact.save()
|
327
316
|
# we don't need to save feature sets again
|
@@ -330,18 +319,21 @@ def save(self, *args, **kwargs) -> None:
|
|
330
319
|
# we don't allow updating the collection of artifacts
|
331
320
|
# if users want to update the set of artifacts, they
|
332
321
|
# have to create a new collection
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
# we need ignore_conflicts=True so that this won't error if links already exist
|
343
|
-
CollectionArtifact.objects.bulk_create(links, ignore_conflicts=True)
|
322
|
+
links = [
|
323
|
+
CollectionArtifact(collection_id=self.id, artifact_id=artifact.id)
|
324
|
+
for artifact in self._artifacts
|
325
|
+
]
|
326
|
+
# the below seems to preserve the order of the list in the
|
327
|
+
# auto-incrementing integer primary
|
328
|
+
# merely using .unordered_artifacts.set(*...) doesn't achieve this
|
329
|
+
# we need ignore_conflicts=True so that this won't error if links already exist
|
330
|
+
CollectionArtifact.objects.bulk_create(links, ignore_conflicts=True)
|
344
331
|
save_feature_set_links(self)
|
332
|
+
if using is not None:
|
333
|
+
logger.warning("using argument is ignored")
|
334
|
+
if transfer_labels:
|
335
|
+
for artifact in self._artifacts:
|
336
|
+
self.labels.add_from(artifact)
|
345
337
|
|
346
338
|
|
347
339
|
# docstring handled through attach_func_to_class_method
|
lamindb/_feature_set.py
CHANGED
@@ -162,7 +162,9 @@ def from_values(
|
|
162
162
|
field: FieldAttr = Feature.name,
|
163
163
|
type: str | None = None,
|
164
164
|
name: str | None = None,
|
165
|
-
|
165
|
+
mute: bool = False,
|
166
|
+
organism: Registry | str | None = None,
|
167
|
+
public_source: Registry | None = None,
|
166
168
|
) -> FeatureSet | None:
|
167
169
|
"""{}."""
|
168
170
|
if not isinstance(field, FieldAttr):
|
@@ -175,13 +177,18 @@ def from_values(
|
|
175
177
|
if registry != Feature and type is None:
|
176
178
|
type = NUMBER_TYPE
|
177
179
|
logger.debug("setting feature set to 'number'")
|
178
|
-
validated = registry.validate(values, field=field, organism=
|
180
|
+
validated = registry.validate(values, field=field, mute=mute, organism=organism)
|
179
181
|
if validated.sum() == 0:
|
180
|
-
if
|
182
|
+
if mute is True:
|
181
183
|
logger.warning("no validated features, skip creating feature set")
|
182
184
|
return None
|
183
185
|
validated_values = np.array(values)[validated]
|
184
|
-
validated_features = registry.from_values(
|
186
|
+
validated_features = registry.from_values(
|
187
|
+
validated_values,
|
188
|
+
field=field,
|
189
|
+
organism=organism,
|
190
|
+
public_source=public_source,
|
191
|
+
)
|
185
192
|
feature_set = FeatureSet(
|
186
193
|
features=validated_features,
|
187
194
|
name=name,
|
@@ -197,13 +204,15 @@ def from_df(
|
|
197
204
|
df: pd.DataFrame,
|
198
205
|
field: FieldAttr = Feature.name,
|
199
206
|
name: str | None = None,
|
200
|
-
|
207
|
+
mute: bool = False,
|
208
|
+
organism: Registry | str | None = None,
|
209
|
+
public_source: Registry | None = None,
|
201
210
|
) -> FeatureSet | None:
|
202
211
|
"""{}."""
|
203
212
|
registry = field.field.model
|
204
|
-
validated = registry.validate(df.columns, field=field,
|
213
|
+
validated = registry.validate(df.columns, field=field, mute=mute, organism=organism)
|
205
214
|
if validated.sum() == 0:
|
206
|
-
if
|
215
|
+
if mute is True:
|
207
216
|
logger.warning("no validated features, skip creating feature set")
|
208
217
|
return None
|
209
218
|
if registry == Feature:
|
@@ -215,7 +224,10 @@ def from_df(
|
|
215
224
|
raise ValueError(f"data types are heterogeneous: {set(dtypes)}")
|
216
225
|
type = convert_numpy_dtype_to_lamin_feature_type(dtypes[0])
|
217
226
|
validated_features = registry.from_values(
|
218
|
-
df.columns[validated],
|
227
|
+
df.columns[validated],
|
228
|
+
field=field,
|
229
|
+
organism=organism,
|
230
|
+
public_source=public_source,
|
219
231
|
)
|
220
232
|
feature_set = FeatureSet(
|
221
233
|
features=validated_features,
|
lamindb/_from_values.py
CHANGED
@@ -19,19 +19,26 @@ def get_or_create_records(
|
|
19
19
|
field: StrField,
|
20
20
|
*,
|
21
21
|
from_public: bool = False,
|
22
|
-
|
22
|
+
organism: Registry | str | None = None,
|
23
|
+
public_source: Registry | None = None,
|
23
24
|
) -> list[Registry]:
|
24
25
|
"""Get or create records from iterables."""
|
25
26
|
upon_create_search_names = settings.upon_create_search_names
|
26
|
-
settings.upon_create_search_names = False
|
27
27
|
feature: Feature = None
|
28
|
+
organism = _get_organism_record(field, organism)
|
29
|
+
kwargs: dict = {}
|
30
|
+
if organism is not None:
|
31
|
+
kwargs["organism"] = organism
|
32
|
+
if public_source is not None:
|
33
|
+
kwargs["public_source"] = public_source
|
34
|
+
settings.upon_create_search_names = False
|
28
35
|
try:
|
29
36
|
Registry = field.field.model
|
30
37
|
iterable_idx = index_iterable(iterable)
|
31
38
|
|
32
39
|
# returns existing records & non-existing values
|
33
40
|
records, nonexist_values, msg = get_existing_records(
|
34
|
-
iterable_idx=iterable_idx, field=field, kwargs
|
41
|
+
iterable_idx=iterable_idx, field=field, **kwargs
|
35
42
|
)
|
36
43
|
|
37
44
|
# new records to be created based on new values
|
@@ -78,26 +85,14 @@ def get_or_create_records(
|
|
78
85
|
def get_existing_records(
|
79
86
|
iterable_idx: pd.Index,
|
80
87
|
field: StrField,
|
81
|
-
kwargs
|
88
|
+
**kwargs,
|
82
89
|
):
|
83
|
-
if kwargs is None:
|
84
|
-
kwargs = {}
|
85
90
|
model = field.field.model
|
86
91
|
condition: dict = {} if len(kwargs) == 0 else kwargs.copy()
|
87
92
|
# existing records matching is agnostic to the bionty source
|
88
93
|
if "public_source" in condition:
|
89
94
|
condition.pop("public_source")
|
90
95
|
|
91
|
-
if _has_organism_field(model):
|
92
|
-
from lnschema_bionty._bionty import create_or_get_organism_record
|
93
|
-
|
94
|
-
organism_record = create_or_get_organism_record(
|
95
|
-
organism=kwargs.get("organism"), orm=model
|
96
|
-
)
|
97
|
-
if organism_record is not None:
|
98
|
-
kwargs.update({"organism": organism_record})
|
99
|
-
condition.update({"organism": organism_record})
|
100
|
-
|
101
96
|
# standardize based on the DB reference
|
102
97
|
# log synonyms mapped terms
|
103
98
|
result = model.inspect(
|
@@ -322,3 +317,13 @@ def _has_organism_field(orm: Registry) -> bool:
|
|
322
317
|
return True
|
323
318
|
except FieldDoesNotExist:
|
324
319
|
return False
|
320
|
+
|
321
|
+
|
322
|
+
def _get_organism_record(field: StrField, organism: str | Registry) -> Registry:
|
323
|
+
model = field.field.model
|
324
|
+
if _has_organism_field(model):
|
325
|
+
from lnschema_bionty._bionty import create_or_get_organism_record
|
326
|
+
|
327
|
+
organism_record = create_or_get_organism_record(organism=organism, orm=model)
|
328
|
+
if organism_record is not None:
|
329
|
+
return organism_record
|
lamindb/_registry.py
CHANGED
@@ -129,7 +129,11 @@ def __init__(orm: Registry, *args, **kwargs):
|
|
129
129
|
@classmethod # type:ignore
|
130
130
|
@doc_args(Registry.from_values.__doc__)
|
131
131
|
def from_values(
|
132
|
-
cls,
|
132
|
+
cls,
|
133
|
+
values: ListLike,
|
134
|
+
field: StrField | None = None,
|
135
|
+
organism: Registry | str | None = None,
|
136
|
+
public_source: Registry | None = None,
|
133
137
|
) -> list[Registry]:
|
134
138
|
"""{}."""
|
135
139
|
from_public = True if cls.__module__.startswith("lnschema_bionty.") else False
|
@@ -138,7 +142,8 @@ def from_values(
|
|
138
142
|
iterable=values,
|
139
143
|
field=getattr(cls, field_str),
|
140
144
|
from_public=from_public,
|
141
|
-
|
145
|
+
organism=organism,
|
146
|
+
public_source=public_source,
|
142
147
|
)
|
143
148
|
|
144
149
|
|
lamindb/core/__init__.py
CHANGED
@@ -14,14 +14,21 @@ Registries:
|
|
14
14
|
LabelManager
|
15
15
|
IsTree
|
16
16
|
IsVersioned
|
17
|
-
DataFrameAnnotator
|
18
|
-
AnnDataAnnotator
|
19
|
-
AnnotateLookup
|
20
17
|
CanValidate
|
21
18
|
HasParents
|
22
19
|
InspectResult
|
23
20
|
fields
|
24
21
|
|
22
|
+
Annotators:
|
23
|
+
|
24
|
+
.. autosummary::
|
25
|
+
:toctree: .
|
26
|
+
|
27
|
+
DataFrameAnnotator
|
28
|
+
AnnDataAnnotator
|
29
|
+
MuDataAnnotator
|
30
|
+
AnnotateLookup
|
31
|
+
|
25
32
|
Classes:
|
26
33
|
|
27
34
|
.. autosummary::
|
@@ -53,7 +60,12 @@ from lnschema_core.models import (
|
|
53
60
|
Registry,
|
54
61
|
)
|
55
62
|
|
56
|
-
from lamindb._annotate import
|
63
|
+
from lamindb._annotate import (
|
64
|
+
AnnDataAnnotator,
|
65
|
+
AnnotateLookup,
|
66
|
+
DataFrameAnnotator,
|
67
|
+
MuDataAnnotator,
|
68
|
+
)
|
57
69
|
from lamindb._query_manager import QueryManager
|
58
70
|
from lamindb._query_set import QuerySet, RecordsList
|
59
71
|
from lamindb.core._feature_manager import FeatureManager
|
lamindb/core/_data.py
CHANGED
@@ -109,17 +109,7 @@ def describe(self: Data):
|
|
109
109
|
else:
|
110
110
|
direct_fields.append(f.name)
|
111
111
|
|
112
|
-
#
|
113
|
-
# display line by line the foreign key fields
|
114
|
-
from lamindb._parents import _transform_emoji
|
115
|
-
|
116
|
-
emojis = {
|
117
|
-
"storage": "🗃️",
|
118
|
-
"created_by": "👤",
|
119
|
-
"transform": _transform_emoji(self.transform),
|
120
|
-
"run": "👣",
|
121
|
-
"artifact": "📄",
|
122
|
-
}
|
112
|
+
# provenance
|
123
113
|
if len(foreign_key_fields) > 0: # always True for Artifact and Collection
|
124
114
|
record_msg = f"{colors.green(model_name)}{__repr__(self, include_foreign_keys=False).lstrip(model_name)}"
|
125
115
|
msg += f"{record_msg}\n\n"
|
@@ -127,17 +117,16 @@ def describe(self: Data):
|
|
127
117
|
msg += f"{colors.green('Provenance')}:\n "
|
128
118
|
related_msg = "".join(
|
129
119
|
[
|
130
|
-
f"
|
131
|
-
for
|
132
|
-
if self.__getattribute__(
|
120
|
+
f"📎 {field}: {self.__getattribute__(field)}\n "
|
121
|
+
for field in foreign_key_fields
|
122
|
+
if self.__getattribute__(field) is not None
|
133
123
|
]
|
134
124
|
)
|
135
125
|
msg += related_msg
|
136
126
|
# input of
|
137
|
-
# can only access many-to-many once record is saved
|
138
127
|
if self.id is not None and self.input_of.exists():
|
139
128
|
values = [format_field_value(i.started_at) for i in self.input_of.all()]
|
140
|
-
msg += f"
|
129
|
+
msg += f"📎 input_of ({colors.italic('core.Run')}): {values}\n "
|
141
130
|
msg = msg.rstrip(" ") # do not use removesuffix as we need to remove 2 or 4 spaces
|
142
131
|
msg += print_features(self)
|
143
132
|
msg += print_labels(self)
|