lamindb 0.74.3__py3-none-any.whl → 0.75.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +1 -1
- lamindb/_artifact.py +85 -43
- lamindb/_can_validate.py +55 -20
- lamindb/_collection.py +36 -28
- lamindb/_curate.py +55 -44
- lamindb/_feature_set.py +5 -5
- lamindb/_filter.py +3 -3
- lamindb/_finish.py +29 -23
- lamindb/_from_values.py +41 -60
- lamindb/_is_versioned.py +1 -1
- lamindb/_parents.py +38 -13
- lamindb/_record.py +19 -20
- lamindb/_save.py +2 -2
- lamindb/_transform.py +27 -16
- lamindb/core/_data.py +14 -16
- lamindb/core/_feature_manager.py +34 -44
- lamindb/core/_label_manager.py +17 -19
- lamindb/core/_mapped_collection.py +1 -1
- lamindb/core/_run_context.py +6 -8
- lamindb/core/datasets/_core.py +7 -7
- lamindb/core/exceptions.py +11 -0
- lamindb/core/storage/__init__.py +1 -0
- lamindb/core/storage/_anndata_accessor.py +735 -0
- lamindb/core/storage/_backed_access.py +77 -747
- lamindb/core/storage/paths.py +9 -14
- lamindb/core/types.py +3 -0
- lamindb/core/versioning.py +1 -1
- lamindb/integrations/__init__.py +1 -0
- {lamindb-0.74.3.dist-info → lamindb-0.75.0.dist-info}/METADATA +5 -5
- lamindb-0.75.0.dist-info/RECORD +58 -0
- lamindb-0.74.3.dist-info/RECORD +0 -57
- {lamindb-0.74.3.dist-info → lamindb-0.75.0.dist-info}/LICENSE +0 -0
- {lamindb-0.74.3.dist-info → lamindb-0.75.0.dist-info}/WHEEL +0 -0
lamindb/_curate.py
CHANGED
@@ -9,7 +9,6 @@ from lamin_utils import colors, logger
|
|
9
9
|
from lamindb_setup.core._docs import doc_args
|
10
10
|
from lnschema_core import (
|
11
11
|
Artifact,
|
12
|
-
Collection,
|
13
12
|
Feature,
|
14
13
|
Record,
|
15
14
|
Run,
|
@@ -92,6 +91,7 @@ class DataFrameCurator:
|
|
92
91
|
using: The reference instance containing registries to validate against.
|
93
92
|
verbosity: The verbosity level.
|
94
93
|
organism: The organism name.
|
94
|
+
sources: A dictionary mapping column names to Source records.
|
95
95
|
|
96
96
|
Examples:
|
97
97
|
>>> import bionty as bt
|
@@ -109,6 +109,7 @@ class DataFrameCurator:
|
|
109
109
|
using: str | None = None,
|
110
110
|
verbosity: str = "hint",
|
111
111
|
organism: str | None = None,
|
112
|
+
sources: dict[str, Record] | None = None,
|
112
113
|
) -> None:
|
113
114
|
from lamindb.core._settings import settings
|
114
115
|
|
@@ -121,6 +122,9 @@ class DataFrameCurator:
|
|
121
122
|
self._collection = None
|
122
123
|
self._validated = False
|
123
124
|
self._kwargs = {"organism": organism} if organism else {}
|
125
|
+
if sources is None:
|
126
|
+
sources = {}
|
127
|
+
self._sources = sources
|
124
128
|
self._save_columns()
|
125
129
|
|
126
130
|
@property
|
@@ -158,6 +162,7 @@ class DataFrameCurator:
|
|
158
162
|
save_function="add_new_from_columns",
|
159
163
|
using=self._using,
|
160
164
|
validated_only=False,
|
165
|
+
source=self._sources.get("columns"),
|
161
166
|
**kwargs,
|
162
167
|
)
|
163
168
|
|
@@ -172,6 +177,7 @@ class DataFrameCurator:
|
|
172
177
|
using=self._using,
|
173
178
|
validated_only=validated_only,
|
174
179
|
df=self._df, # Get the Feature type from df
|
180
|
+
source=self._sources.get("columns"),
|
175
181
|
**kwargs,
|
176
182
|
)
|
177
183
|
|
@@ -222,6 +228,7 @@ class DataFrameCurator:
|
|
222
228
|
key=categorical,
|
223
229
|
using=self._using,
|
224
230
|
validated_only=validated_only,
|
231
|
+
sources=self._sources.get(categorical),
|
225
232
|
**kwargs,
|
226
233
|
)
|
227
234
|
|
@@ -242,6 +249,7 @@ class DataFrameCurator:
|
|
242
249
|
self._df,
|
243
250
|
fields=self.fields,
|
244
251
|
using=self._using,
|
252
|
+
sources=self._sources,
|
245
253
|
**self._kwargs,
|
246
254
|
)
|
247
255
|
return self._validated
|
@@ -283,41 +291,6 @@ class DataFrameCurator:
|
|
283
291
|
|
284
292
|
return self._artifact
|
285
293
|
|
286
|
-
def save_collection(
|
287
|
-
self,
|
288
|
-
artifact: Artifact | Iterable[Artifact],
|
289
|
-
name: str,
|
290
|
-
description: str | None = None,
|
291
|
-
reference: str | None = None,
|
292
|
-
reference_type: str | None = None,
|
293
|
-
) -> Collection:
|
294
|
-
"""Save a collection from artifact/artifacts.
|
295
|
-
|
296
|
-
Args:
|
297
|
-
artifact: One or several saved Artifacts.
|
298
|
-
name: Title of the publication.
|
299
|
-
description: Description of the publication.
|
300
|
-
reference: Accession number (e.g. GSE#, E-MTAB#, etc.).
|
301
|
-
reference_type: Source type (e.g. GEO, ArrayExpress, SRA, etc.).
|
302
|
-
"""
|
303
|
-
collection = Collection(
|
304
|
-
artifact,
|
305
|
-
name=name,
|
306
|
-
description=description,
|
307
|
-
reference=reference,
|
308
|
-
reference_type=reference_type,
|
309
|
-
)
|
310
|
-
slug = ln_setup.settings.instance.slug
|
311
|
-
if collection._state.adding:
|
312
|
-
collection.save()
|
313
|
-
else: # pragma: no cover
|
314
|
-
collection.save()
|
315
|
-
logger.warning(f"collection already exists in {colors.italic(slug)}!")
|
316
|
-
if ln_setup.settings.instance.is_remote: # pragma: no cover
|
317
|
-
logger.print(f"go to https://lamin.ai/{slug}/collection/{collection.uid}")
|
318
|
-
self._collection = collection
|
319
|
-
return collection
|
320
|
-
|
321
294
|
def clean_up_failed_runs(self):
|
322
295
|
"""Clean up previous failed runs that don't save any outputs."""
|
323
296
|
from lamindb.core._run_context import run_context
|
@@ -338,6 +311,7 @@ class AnnDataCurator(DataFrameCurator):
|
|
338
311
|
using: A reference LaminDB instance.
|
339
312
|
verbosity: The verbosity level.
|
340
313
|
organism: The organism name.
|
314
|
+
sources: A dictionary mapping ``.obs.columns`` to Source records.
|
341
315
|
|
342
316
|
Examples:
|
343
317
|
>>> import bionty as bt
|
@@ -357,11 +331,14 @@ class AnnDataCurator(DataFrameCurator):
|
|
357
331
|
using: str = "default",
|
358
332
|
verbosity: str = "hint",
|
359
333
|
organism: str | None = None,
|
334
|
+
sources: dict[str, Record] | None = None,
|
360
335
|
) -> None:
|
361
336
|
from lamindb_setup.core import upath
|
362
337
|
|
363
338
|
from ._artifact import data_is_anndata
|
364
339
|
|
340
|
+
if sources is None:
|
341
|
+
sources = {}
|
365
342
|
if not data_is_anndata(data):
|
366
343
|
raise ValueError(
|
367
344
|
"data has to be an AnnData object or a path to AnnData-like"
|
@@ -381,6 +358,7 @@ class AnnDataCurator(DataFrameCurator):
|
|
381
358
|
using=using,
|
382
359
|
verbosity=verbosity,
|
383
360
|
organism=organism,
|
361
|
+
sources=sources,
|
384
362
|
)
|
385
363
|
self._obs_fields = categoricals
|
386
364
|
self._save_from_var_index(validated_only=True, **self._kwargs)
|
@@ -421,6 +399,7 @@ class AnnDataCurator(DataFrameCurator):
|
|
421
399
|
using=self._using,
|
422
400
|
validated_only=validated_only,
|
423
401
|
organism=organism,
|
402
|
+
source=self._sources.get("var_index"),
|
424
403
|
)
|
425
404
|
|
426
405
|
def add_new_from_var_index(self, organism: str | None = None, **kwargs):
|
@@ -455,7 +434,11 @@ class AnnDataCurator(DataFrameCurator):
|
|
455
434
|
**self._kwargs,
|
456
435
|
)
|
457
436
|
validated_obs = validate_categories_in_df(
|
458
|
-
self._adata.obs,
|
437
|
+
self._adata.obs,
|
438
|
+
fields=self.categoricals,
|
439
|
+
using=self._using,
|
440
|
+
sources=self._sources,
|
441
|
+
**self._kwargs,
|
459
442
|
)
|
460
443
|
self._validated = validated_var and validated_obs
|
461
444
|
return self._validated
|
@@ -519,7 +502,11 @@ class MuDataCurator:
|
|
519
502
|
using: str = "default",
|
520
503
|
verbosity: str = "hint",
|
521
504
|
organism: str | None = None,
|
505
|
+
sources: dict[str, Record] | None = None,
|
522
506
|
) -> None:
|
507
|
+
if sources is None:
|
508
|
+
sources = {}
|
509
|
+
self._sources = sources
|
523
510
|
self._mdata = mdata
|
524
511
|
self._kwargs = {"organism": organism} if organism else {}
|
525
512
|
self._var_fields = var_index
|
@@ -534,6 +521,7 @@ class MuDataCurator:
|
|
534
521
|
categoricals=self._obs_fields.get(modality, {}),
|
535
522
|
using=using,
|
536
523
|
verbosity=verbosity,
|
524
|
+
sources=self._sources.get(modality),
|
537
525
|
**self._kwargs,
|
538
526
|
)
|
539
527
|
for modality in self._modalities
|
@@ -713,7 +701,11 @@ class MuDataCurator:
|
|
713
701
|
else:
|
714
702
|
obs = self._mdata[modality].obs
|
715
703
|
validated_obs &= validate_categories_in_df(
|
716
|
-
obs,
|
704
|
+
obs,
|
705
|
+
fields=fields,
|
706
|
+
using=self._using,
|
707
|
+
sources=self._sources.get(modality),
|
708
|
+
**self._kwargs,
|
717
709
|
)
|
718
710
|
self._validated = validated_var and validated_obs
|
719
711
|
return self._validated
|
@@ -776,6 +768,7 @@ class Curate:
|
|
776
768
|
using: str = "default",
|
777
769
|
verbosity: str = "hint",
|
778
770
|
organism: str | None = None,
|
771
|
+
sources: dict[str, Record] | None = None,
|
779
772
|
) -> AnnDataCurator:
|
780
773
|
"""{}""" # noqa: D415
|
781
774
|
return AnnDataCurator(
|
@@ -785,6 +778,7 @@ class Curate:
|
|
785
778
|
using=using,
|
786
779
|
verbosity=verbosity,
|
787
780
|
organism=organism,
|
781
|
+
sources=sources,
|
788
782
|
)
|
789
783
|
|
790
784
|
@classmethod
|
@@ -848,6 +842,7 @@ def validate_categories(
|
|
848
842
|
key: str,
|
849
843
|
using: str | None = None,
|
850
844
|
organism: str | None = None,
|
845
|
+
source: Record | None = None,
|
851
846
|
) -> bool:
|
852
847
|
"""Validate ontology terms in a pandas series using LaminDB registries."""
|
853
848
|
from lamindb._from_values import _print_values
|
@@ -862,6 +857,7 @@ def validate_categories(
|
|
862
857
|
|
863
858
|
registry = field.field.model
|
864
859
|
filter_kwargs = check_registry_organism(registry, organism)
|
860
|
+
filter_kwargs.update({"source": source} if source else {})
|
865
861
|
|
866
862
|
# Inspect the default instance
|
867
863
|
inspect_result = standardize_and_inspect(
|
@@ -927,9 +923,12 @@ def validate_categories_in_df(
|
|
927
923
|
df: pd.DataFrame,
|
928
924
|
fields: dict[str, FieldAttr],
|
929
925
|
using: str | None = None,
|
926
|
+
sources: dict[str, Record] = None,
|
930
927
|
**kwargs,
|
931
928
|
) -> bool:
|
932
929
|
"""Validate categories in DataFrame columns using LaminDB registries."""
|
930
|
+
if sources is None:
|
931
|
+
sources = {}
|
933
932
|
validated = True
|
934
933
|
for key, field in fields.items():
|
935
934
|
validated &= validate_categories(
|
@@ -937,6 +936,7 @@ def validate_categories_in_df(
|
|
937
936
|
field=field,
|
938
937
|
key=key,
|
939
938
|
using=using,
|
939
|
+
source=sources.get(key),
|
940
940
|
**kwargs,
|
941
941
|
)
|
942
942
|
return validated
|
@@ -998,13 +998,13 @@ def save_artifact(
|
|
998
998
|
organism,
|
999
999
|
)
|
1000
1000
|
|
1001
|
-
if artifact.
|
1001
|
+
if artifact._accessor == "DataFrame":
|
1002
1002
|
artifact.features._add_set_from_df(field=columns_field, **feature_kwargs)
|
1003
|
-
elif artifact.
|
1003
|
+
elif artifact._accessor == "AnnData":
|
1004
1004
|
artifact.features._add_set_from_anndata(
|
1005
1005
|
var_field=columns_field, **feature_kwargs
|
1006
1006
|
)
|
1007
|
-
elif artifact.
|
1007
|
+
elif artifact._accessor == "MuData":
|
1008
1008
|
artifact.features._add_set_from_mudata(
|
1009
1009
|
var_fields=columns_field, **feature_kwargs
|
1010
1010
|
)
|
@@ -1021,7 +1021,7 @@ def save_artifact(
|
|
1021
1021
|
labels = registry.from_values(df[key], field=field, **filter_kwargs)
|
1022
1022
|
artifact.labels.add(labels, feature)
|
1023
1023
|
|
1024
|
-
if artifact.
|
1024
|
+
if artifact._accessor == "MuData":
|
1025
1025
|
for modality, modality_fields in fields.items():
|
1026
1026
|
if modality == "obs":
|
1027
1027
|
_add_labels(data, artifact, modality_fields)
|
@@ -1046,6 +1046,7 @@ def update_registry(
|
|
1046
1046
|
df: pd.DataFrame | None = None,
|
1047
1047
|
organism: str | None = None,
|
1048
1048
|
dtype: str | None = None,
|
1049
|
+
source: Record | None = None,
|
1049
1050
|
**kwargs,
|
1050
1051
|
) -> list[Record]:
|
1051
1052
|
"""Save features or labels records in the default instance from the using instance.
|
@@ -1060,6 +1061,7 @@ def update_registry(
|
|
1060
1061
|
df: A DataFrame to save labels from.
|
1061
1062
|
organism: The organism name.
|
1062
1063
|
dtype: The type of the feature.
|
1064
|
+
source: The source record.
|
1063
1065
|
kwargs: Additional keyword arguments to pass to the registry model to create new records.
|
1064
1066
|
"""
|
1065
1067
|
from lamindb._save import save as ln_save
|
@@ -1067,6 +1069,7 @@ def update_registry(
|
|
1067
1069
|
|
1068
1070
|
registry = field.field.model
|
1069
1071
|
filter_kwargs = check_registry_organism(registry, organism)
|
1072
|
+
filter_kwargs.update({"source": source} if source else {})
|
1070
1073
|
|
1071
1074
|
verbosity = settings.verbosity
|
1072
1075
|
try:
|
@@ -1098,6 +1101,10 @@ def update_registry(
|
|
1098
1101
|
if non_validated_labels
|
1099
1102
|
else []
|
1100
1103
|
)
|
1104
|
+
# here we check to only save the public records if they are from the specified source
|
1105
|
+
# TODO: this if shouldn't be needed
|
1106
|
+
if source:
|
1107
|
+
public_records = [r for r in public_records if r.source == source]
|
1101
1108
|
ln_save(public_records)
|
1102
1109
|
labels_saved["from public"] = [
|
1103
1110
|
getattr(r, field.field.name) for r in public_records
|
@@ -1119,7 +1126,11 @@ def update_registry(
|
|
1119
1126
|
if registry == Feature:
|
1120
1127
|
init_kwargs["dtype"] = "cat" if dtype is None else dtype
|
1121
1128
|
non_validated_records.append(
|
1122
|
-
registry(
|
1129
|
+
registry(
|
1130
|
+
**init_kwargs,
|
1131
|
+
**{k: v for k, v in filter_kwargs.items() if k != "source"},
|
1132
|
+
**{k: v for k, v in kwargs.items() if k != "sources"},
|
1133
|
+
)
|
1123
1134
|
)
|
1124
1135
|
ln_save(non_validated_records)
|
1125
1136
|
|
@@ -1242,7 +1253,7 @@ def _save_organism(name: str): # pragma: no cover
|
|
1242
1253
|
|
1243
1254
|
organism = bt.Organism.filter(name=name).one_or_none()
|
1244
1255
|
if organism is None:
|
1245
|
-
organism = bt.Organism.
|
1256
|
+
organism = bt.Organism.from_source(name=name)
|
1246
1257
|
if organism is None:
|
1247
1258
|
raise ValueError(
|
1248
1259
|
f"Organism '{name}' not found\n"
|
lamindb/_feature_set.py
CHANGED
@@ -118,7 +118,7 @@ def from_values(
|
|
118
118
|
name: str | None = None,
|
119
119
|
mute: bool = False,
|
120
120
|
organism: Record | str | None = None,
|
121
|
-
|
121
|
+
source: Record | None = None,
|
122
122
|
raise_validation_error: bool = True,
|
123
123
|
) -> FeatureSet:
|
124
124
|
"""{}""" # noqa: D415
|
@@ -139,7 +139,7 @@ def from_values(
|
|
139
139
|
not_validated_values = values_array[~validated]
|
140
140
|
msg = (
|
141
141
|
f"These values could not be validated: {not_validated_values.tolist()}\n"
|
142
|
-
f"If there are no typos, add them to their registry: {registry}"
|
142
|
+
f"If there are no typos, add them to their registry: {registry.__name__}"
|
143
143
|
)
|
144
144
|
if raise_validation_error:
|
145
145
|
raise ValidationError(msg)
|
@@ -149,7 +149,7 @@ def from_values(
|
|
149
149
|
validated_values,
|
150
150
|
field=field,
|
151
151
|
organism=organism,
|
152
|
-
|
152
|
+
source=source,
|
153
153
|
)
|
154
154
|
feature_set = FeatureSet(
|
155
155
|
features=validated_features,
|
@@ -168,7 +168,7 @@ def from_df(
|
|
168
168
|
name: str | None = None,
|
169
169
|
mute: bool = False,
|
170
170
|
organism: Record | str | None = None,
|
171
|
-
|
171
|
+
source: Record | None = None,
|
172
172
|
) -> FeatureSet | None:
|
173
173
|
"""{}""" # noqa: D415
|
174
174
|
registry = field.field.model
|
@@ -189,7 +189,7 @@ def from_df(
|
|
189
189
|
df.columns[validated],
|
190
190
|
field=field,
|
191
191
|
organism=organism,
|
192
|
-
|
192
|
+
source=source,
|
193
193
|
)
|
194
194
|
feature_set = FeatureSet(
|
195
195
|
features=validated_features,
|
lamindb/_filter.py
CHANGED
@@ -21,9 +21,9 @@ def filter(Record: type[Record], **expressions) -> QuerySet:
|
|
21
21
|
):
|
22
22
|
visibility = "visibility"
|
23
23
|
if not any(e.startswith(visibility) for e in expressions):
|
24
|
-
expressions[
|
25
|
-
|
26
|
-
|
24
|
+
expressions[visibility] = (
|
25
|
+
VisibilityChoice.default.value
|
26
|
+
) # default visibility
|
27
27
|
# if visibility is None, do not apply a filter
|
28
28
|
# otherwise, it would mean filtering for NULL values, which doesn't make
|
29
29
|
# sense for a non-NULLABLE column
|
lamindb/_finish.py
CHANGED
@@ -80,8 +80,8 @@ def save_run_context_core(
|
|
80
80
|
|
81
81
|
# for scripts, things are easy
|
82
82
|
is_consecutive = True
|
83
|
-
is_notebook = transform.type ==
|
84
|
-
|
83
|
+
is_notebook = transform.type == "notebook"
|
84
|
+
_source_code_artifact_path = filepath
|
85
85
|
# for notebooks, we need more work
|
86
86
|
if is_notebook:
|
87
87
|
try:
|
@@ -134,12 +134,12 @@ def save_run_context_core(
|
|
134
134
|
)
|
135
135
|
# strip the output from the notebook to create the source code file
|
136
136
|
# first, copy the notebook file to a temporary file in the cache
|
137
|
-
|
138
|
-
shutil.copy2(filepath,
|
137
|
+
_source_code_artifact_path = ln_setup.settings.storage.cache_dir / filepath.name
|
138
|
+
shutil.copy2(filepath, _source_code_artifact_path) # copy
|
139
139
|
subprocess.run(
|
140
140
|
[
|
141
141
|
"nbstripout",
|
142
|
-
|
142
|
+
_source_code_artifact_path,
|
143
143
|
"--extra-keys",
|
144
144
|
"metadata.version metadata.kernelspec metadata.language_info metadata.pygments_lexer metadata.name metadata.file_extension",
|
145
145
|
],
|
@@ -152,31 +152,34 @@ def save_run_context_core(
|
|
152
152
|
transform_family = transform.versions
|
153
153
|
if len(transform_family) > 0:
|
154
154
|
for prev_transform in transform_family.order_by("-created_at"):
|
155
|
-
if
|
156
|
-
|
157
|
-
|
158
|
-
|
155
|
+
if (
|
156
|
+
prev_transform.latest_run is not None
|
157
|
+
and prev_transform.latest_run.report_id is not None
|
158
|
+
):
|
159
|
+
prev_report = prev_transform.latest_run.report
|
160
|
+
if prev_transform._source_code_artifact_id is not None:
|
161
|
+
prev_source = prev_transform._source_code_artifact
|
159
162
|
ln.settings.creation.artifact_silence_missing_run_warning = True
|
160
163
|
|
161
164
|
# track source code
|
162
|
-
if transform.
|
165
|
+
if transform._source_code_artifact_id is not None:
|
163
166
|
# check if the hash of the transform source code matches
|
164
167
|
# (for scripts, we already run the same logic in track() - we can deduplicate the call at some point)
|
165
|
-
hash, _ = hash_file(
|
166
|
-
if hash != transform.
|
168
|
+
hash, _ = hash_file(_source_code_artifact_path) # ignore hash_type for now
|
169
|
+
if hash != transform._source_code_artifact.hash:
|
167
170
|
if os.getenv("LAMIN_TESTING") is None:
|
168
171
|
# in test, auto-confirm overwrite
|
169
172
|
response = input(
|
170
|
-
f"You are about to replace (overwrite) existing source code (hash '{transform.
|
173
|
+
f"You are about to replace (overwrite) existing source code (hash '{transform._source_code_artifact.hash}') for transform version"
|
171
174
|
f" '{transform.version}'. Proceed? (y/n)"
|
172
175
|
)
|
173
176
|
else:
|
174
177
|
response = "y"
|
175
178
|
if response == "y":
|
176
|
-
transform.
|
177
|
-
transform.
|
179
|
+
transform._source_code_artifact.replace(_source_code_artifact_path)
|
180
|
+
transform._source_code_artifact.save(upload=True)
|
178
181
|
logger.success(
|
179
|
-
f"replaced transform.
|
182
|
+
f"replaced transform._source_code_artifact: {transform._source_code_artifact}"
|
180
183
|
)
|
181
184
|
else:
|
182
185
|
logger.warning("Please re-run `ln.track()` to make a new version")
|
@@ -184,17 +187,19 @@ def save_run_context_core(
|
|
184
187
|
else:
|
185
188
|
logger.important("source code is already saved")
|
186
189
|
else:
|
187
|
-
|
188
|
-
|
190
|
+
_source_code_artifact = ln.Artifact(
|
191
|
+
_source_code_artifact_path,
|
189
192
|
description=f"Source of transform {transform.uid}",
|
190
193
|
version=transform.version,
|
191
194
|
is_new_version_of=prev_source,
|
192
195
|
visibility=0, # hidden file
|
193
196
|
run=False,
|
194
197
|
)
|
195
|
-
|
196
|
-
transform.
|
197
|
-
logger.debug(
|
198
|
+
_source_code_artifact.save(upload=True, print_progress=False)
|
199
|
+
transform._source_code_artifact = _source_code_artifact
|
200
|
+
logger.debug(
|
201
|
+
f"saved transform._source_code_artifact: {transform._source_code_artifact}"
|
202
|
+
)
|
198
203
|
|
199
204
|
# track environment
|
200
205
|
env_path = ln_setup.settings.storage.cache_dir / f"run_env_pip_{run.uid}.txt"
|
@@ -257,8 +262,9 @@ def save_run_context_core(
|
|
257
262
|
run.report = report_file
|
258
263
|
run.is_consecutive = is_consecutive
|
259
264
|
run.save()
|
260
|
-
|
261
|
-
|
265
|
+
logger.debug(
|
266
|
+
f"saved transform.latest_run.report: {transform.latest_run.report}"
|
267
|
+
)
|
262
268
|
transform.save()
|
263
269
|
|
264
270
|
# finalize
|
lamindb/_from_values.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
from typing import TYPE_CHECKING,
|
3
|
+
from typing import TYPE_CHECKING, Iterable
|
4
4
|
|
5
5
|
import pandas as pd
|
6
6
|
from django.core.exceptions import FieldDoesNotExist
|
@@ -19,9 +19,9 @@ def get_or_create_records(
|
|
19
19
|
field: StrField,
|
20
20
|
*,
|
21
21
|
create: bool = False,
|
22
|
-
|
22
|
+
from_source: bool = False,
|
23
23
|
organism: Record | str | None = None,
|
24
|
-
|
24
|
+
source: Record | None = None,
|
25
25
|
mute: bool = False,
|
26
26
|
) -> list[Record]:
|
27
27
|
"""Get or create records from iterables."""
|
@@ -34,8 +34,8 @@ def get_or_create_records(
|
|
34
34
|
kwargs: dict = {}
|
35
35
|
if organism is not None:
|
36
36
|
kwargs["organism"] = organism
|
37
|
-
if
|
38
|
-
kwargs["
|
37
|
+
if source is not None:
|
38
|
+
kwargs["source"] = source
|
39
39
|
settings.creation.search_names = False
|
40
40
|
try:
|
41
41
|
iterable_idx = index_iterable(iterable)
|
@@ -47,8 +47,17 @@ def get_or_create_records(
|
|
47
47
|
|
48
48
|
# new records to be created based on new values
|
49
49
|
if len(nonexist_values) > 0:
|
50
|
-
if
|
51
|
-
|
50
|
+
if source:
|
51
|
+
from_source = not source.in_db
|
52
|
+
elif (
|
53
|
+
records
|
54
|
+
and hasattr(records[0], "source_id")
|
55
|
+
and records[0].source_id
|
56
|
+
and records[0].source.in_db
|
57
|
+
):
|
58
|
+
from_source = False
|
59
|
+
if from_source:
|
60
|
+
records_bionty, unmapped_values = create_records_from_source(
|
52
61
|
iterable_idx=nonexist_values,
|
53
62
|
field=field,
|
54
63
|
msg=msg,
|
@@ -58,7 +67,7 @@ def get_or_create_records(
|
|
58
67
|
if len(records_bionty) > 0:
|
59
68
|
msg = ""
|
60
69
|
for record in records_bionty:
|
61
|
-
record.
|
70
|
+
record._from_source = True
|
62
71
|
records += records_bionty
|
63
72
|
else:
|
64
73
|
unmapped_values = nonexist_values
|
@@ -75,7 +84,7 @@ def get_or_create_records(
|
|
75
84
|
f"{colors.red('did not create')} {name} record{s} for "
|
76
85
|
f"{n_nonval} {colors.italic(f'{field.field.name}{s}')}: {print_values}"
|
77
86
|
)
|
78
|
-
if Record.__module__.startswith("
|
87
|
+
if Record.__module__.startswith("bionty.") or Record == ULabel:
|
79
88
|
if isinstance(iterable, pd.Series):
|
80
89
|
feature = iterable.name
|
81
90
|
feature_name = None
|
@@ -100,8 +109,8 @@ def get_existing_records(
|
|
100
109
|
model = field.field.model
|
101
110
|
condition: dict = {} if len(kwargs) == 0 else kwargs.copy()
|
102
111
|
# existing records matching is agnostic to the bionty source
|
103
|
-
if "
|
104
|
-
condition.pop("
|
112
|
+
if "source" in condition:
|
113
|
+
condition.pop("source")
|
105
114
|
|
106
115
|
# standardize based on the DB reference
|
107
116
|
# log synonyms mapped terms
|
@@ -109,7 +118,7 @@ def get_existing_records(
|
|
109
118
|
iterable_idx,
|
110
119
|
field=field,
|
111
120
|
organism=kwargs.get("organism"),
|
112
|
-
|
121
|
+
source=kwargs.get("source"),
|
113
122
|
mute=True,
|
114
123
|
)
|
115
124
|
syn_mapper = result.synonyms_mapper
|
@@ -174,7 +183,7 @@ def get_existing_records(
|
|
174
183
|
return records, nonexist_values, msg
|
175
184
|
|
176
185
|
|
177
|
-
def
|
186
|
+
def create_records_from_source(
|
178
187
|
iterable_idx: pd.Index,
|
179
188
|
field: StrField,
|
180
189
|
msg: str = "",
|
@@ -184,7 +193,8 @@ def create_records_from_public(
|
|
184
193
|
model = field.field.model
|
185
194
|
records: list = []
|
186
195
|
# populate additional fields from bionty
|
187
|
-
from
|
196
|
+
from bionty._bionty import get_source_record
|
197
|
+
from bionty.core._bionty import filter_bionty_df_columns
|
188
198
|
|
189
199
|
# create the corresponding bionty object from model
|
190
200
|
try:
|
@@ -195,17 +205,20 @@ def create_records_from_public(
|
|
195
205
|
organism = "human"
|
196
206
|
elif iterable_idx[0].startswith("ENSMUSG"):
|
197
207
|
organism = "mouse"
|
198
|
-
public_ontology = model.public(
|
199
|
-
organism=organism, public_source=kwargs.get("public_source")
|
200
|
-
)
|
208
|
+
public_ontology = model.public(organism=organism, source=kwargs.get("source"))
|
201
209
|
except Exception:
|
202
210
|
# for custom records that are not created from public sources
|
203
211
|
return records, iterable_idx
|
204
|
-
# add
|
205
|
-
|
212
|
+
# add source record to the kwargs
|
213
|
+
source_record = get_source_record(public_ontology)
|
214
|
+
if source_record is not None and source_record.in_db:
|
215
|
+
# skips the creation of records from public if the source is already in the db
|
216
|
+
return records, iterable_idx
|
217
|
+
|
218
|
+
kwargs.update({"source": source_record})
|
206
219
|
|
207
220
|
# filter the columns in bionty df based on fields
|
208
|
-
bionty_df =
|
221
|
+
bionty_df = filter_bionty_df_columns(model=model, public_ontology=public_ontology)
|
209
222
|
|
210
223
|
# standardize in the bionty reference
|
211
224
|
result = public_ontology.inspect(iterable_idx, field=field.field.name, mute=True)
|
@@ -301,43 +314,6 @@ def _print_values(names: Iterable, n: int = 20, quotes: bool = True) -> str:
|
|
301
314
|
return print_values
|
302
315
|
|
303
316
|
|
304
|
-
def _filter_bionty_df_columns(model: Record, public_ontology: Any) -> pd.DataFrame:
|
305
|
-
bionty_df = pd.DataFrame()
|
306
|
-
if public_ontology is not None:
|
307
|
-
model_field_names = {i.name for i in model._meta.fields}
|
308
|
-
# parents needs to be added here as relationships aren't in fields
|
309
|
-
model_field_names.add("parents")
|
310
|
-
bionty_df = public_ontology.df().reset_index()
|
311
|
-
if model.__name__ == "Gene":
|
312
|
-
# groupby ensembl_gene_id and concat ncbi_gene_ids
|
313
|
-
groupby_id_col = (
|
314
|
-
"ensembl_gene_id" if "ensembl_gene_id" in bionty_df else "stable_id"
|
315
|
-
)
|
316
|
-
bionty_df.drop(
|
317
|
-
columns=["hgnc_id", "mgi_id", "index"], errors="ignore", inplace=True
|
318
|
-
)
|
319
|
-
bionty_df.drop_duplicates([groupby_id_col, "ncbi_gene_id"], inplace=True)
|
320
|
-
bionty_df["ncbi_gene_id"] = bionty_df["ncbi_gene_id"].fillna("")
|
321
|
-
bionty_df = (
|
322
|
-
bionty_df.groupby(groupby_id_col)
|
323
|
-
.agg(
|
324
|
-
{
|
325
|
-
"symbol": "first",
|
326
|
-
"ncbi_gene_id": "|".join,
|
327
|
-
"biotype": "first",
|
328
|
-
"description": "first",
|
329
|
-
"synonyms": "first",
|
330
|
-
}
|
331
|
-
)
|
332
|
-
.reset_index()
|
333
|
-
)
|
334
|
-
bionty_df.rename(columns={"ncbi_gene_id": "ncbi_gene_ids"}, inplace=True)
|
335
|
-
# rename definition to description for the lnschema_bionty
|
336
|
-
bionty_df.rename(columns={"definition": "description"}, inplace=True)
|
337
|
-
bionty_df = bionty_df.loc[:, bionty_df.columns.isin(model_field_names)]
|
338
|
-
return bionty_df
|
339
|
-
|
340
|
-
|
341
317
|
def _bulk_create_dicts_from_df(
|
342
318
|
keys: set | list, column_name: str, df: pd.DataFrame
|
343
319
|
) -> tuple[dict, str]:
|
@@ -359,7 +335,7 @@ def _bulk_create_dicts_from_df(
|
|
359
335
|
return df.reset_index().to_dict(orient="records"), multi_msg
|
360
336
|
|
361
337
|
|
362
|
-
def _has_organism_field(orm: Record) -> bool:
|
338
|
+
def _has_organism_field(orm: type[Record]) -> bool:
|
363
339
|
try:
|
364
340
|
orm._meta.get_field("organism")
|
365
341
|
return True
|
@@ -371,10 +347,15 @@ def _get_organism_record(
|
|
371
347
|
field: StrField, organism: str | Record, force: bool = False
|
372
348
|
) -> Record:
|
373
349
|
model = field.field.model
|
374
|
-
check = True
|
350
|
+
check = True
|
351
|
+
if not force and hasattr(model, "_ontology_id_field"):
|
352
|
+
check = field.field.name != model._ontology_id_field
|
353
|
+
# e.g. bionty.CellMarker has "name" as _ontology_id_field
|
354
|
+
if not model._ontology_id_field.endswith("id"):
|
355
|
+
check = True
|
375
356
|
|
376
357
|
if _has_organism_field(model) and check:
|
377
|
-
from
|
358
|
+
from bionty._bionty import create_or_get_organism_record
|
378
359
|
|
379
360
|
organism_record = create_or_get_organism_record(organism=organism, orm=model)
|
380
361
|
if organism_record is not None:
|