lamindb 0.48a3__py3-none-any.whl → 0.48.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only.
- lamindb/__init__.py +11 -16
- lamindb/_context.py +4 -1
- lamindb/_dataset.py +6 -3
- lamindb/_feature.py +9 -3
- lamindb/_feature_manager.py +176 -0
- lamindb/_feature_set.py +22 -18
- lamindb/_file.py +90 -44
- lamindb/_from_values.py +61 -18
- lamindb/_label.py +36 -0
- lamindb/_manager.py +2 -2
- lamindb/_orm.py +144 -27
- lamindb/_queryset.py +4 -2
- lamindb/_save.py +17 -7
- lamindb/dev/__init__.py +4 -0
- lamindb/dev/_view_parents.py +34 -63
- lamindb/dev/datasets/__init__.py +8 -0
- lamindb/dev/datasets/_core.py +80 -15
- {lamindb-0.48a3.dist-info → lamindb-0.48.1.dist-info}/METADATA +6 -6
- {lamindb-0.48a3.dist-info → lamindb-0.48.1.dist-info}/RECORD +22 -21
- {lamindb-0.48a3.dist-info → lamindb-0.48.1.dist-info}/LICENSE +0 -0
- {lamindb-0.48a3.dist-info → lamindb-0.48.1.dist-info}/WHEEL +0 -0
- {lamindb-0.48a3.dist-info → lamindb-0.48.1.dist-info}/entry_points.txt +0 -0
lamindb/__init__.py
CHANGED
@@ -4,11 +4,12 @@ Import the package::
 
    import lamindb as ln
 
-
-in-memory data objects (`DataFrame`, `AnnData`, etc.) and allows to link them
-against entities of core schema & custom schemas.
+.. note::
 
-
+   `File` abstracts over objects in storage from blob-like files (pdf, txt, etc.)
+   to streamable storage backends (HDF5, DuckDB, zarr, TileDB, etc.).
+
+   `Dataset` abstracts over `File` and tables in classical warehouses (BigQuery, Snowflake).
 
 .. autosummary::
    :toctree: .
@@ -16,18 +17,13 @@ The core schema entities are central to lamindb's API:
    File
    Dataset
    Transform
-   Run
-   Feature
    Label
+   Feature
+   FeatureSet
+   Modality
    User
    Storage
-
-More control over feature management:
-
-.. autosummary::
-   :toctree: .
-
-   FeatureSet
+   Run
 
 Functional tools:
 
@@ -37,7 +33,6 @@ Functional tools:
    track
    view
    save
-   delete
 
 Static classes & modules:
 
@@ -45,7 +40,6 @@ Static classes & modules:
    :toctree: .
 
    settings
-   context
    types
    setup
    schema
@@ -53,7 +47,7 @@ Static classes & modules:
 
 """
 
-__version__ = "0.48a3"  # denote a release candidate for 0.1.0 with 0.1rc1
+__version__ = "0.48.1"  # denote a release candidate for 0.1.0 with 0.1rc1
 
 import os as _os
 
@@ -93,6 +87,7 @@ if _INSTANCE_SETUP:
         FeatureSet,
         File,
         Label,
+        Modality,
         Run,
         Storage,
         Transform,
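Beyond the docstring reshuffle, the hunk above also tracks an API change: `Modality` becomes a top-level export, and the version is bumped. A minimal sketch of what this looks like from user code in 0.48.1 (assuming a set-up instance, since the registries are only imported `if _INSTANCE_SETUP`):

    import lamindb as ln

    print(ln.__version__)  # "0.48.1"
    ln.Modality            # newly exported registry, next to Feature & FeatureSet
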
lamindb/_context.py
CHANGED
@@ -116,7 +116,7 @@ def get_notebook_name_colab() -> str:
     return name.rstrip(".ipynb")
 
 
-class context:
+class run_context:
     """Global run context."""
 
     instance: Optional[InstanceSettings] = None
@@ -434,3 +434,6 @@ class context:
         logger.success(f"Updated: {transform}")
 
         cls.transform = transform
+
+
+context = run_context
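The rename is backward-compatible: the class becomes `run_context` and the trailing alias `context = run_context` keeps existing references to `context` working. The pattern in isolation, as a minimal sketch (not lamindb code):

    class run_context:
        """Global run context."""

        transform = None

    # the old name keeps resolving for existing imports and attribute access
    context = run_context

    assert context is run_context
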
lamindb/_dataset.py
CHANGED
@@ -67,7 +67,7 @@ def from_files(dataset: Dataset, *, name: str, files: Iterable[File]) -> Dataset
     feature_set_file_links = File.feature_sets.through.objects.filter(
         file_id__in=file_ids
     )
-    feature_set_ids = [link.
+    feature_set_ids = [link.feature_set_id for link in feature_set_file_links]
     feature_sets = FeatureSet.select(id__in=feature_set_ids)
     # validate consistency of feature_sets
     # we only allow one feature set per type
@@ -128,13 +128,16 @@ def delete(dataset: Dataset, storage: bool = False):
 def save(dataset: Dataset):
     if dataset.file is not None:
         dataset.file.save()
-    for feature_set in dataset._feature_sets:
+    feature_sets = dataset._feature_sets
+    if isinstance(feature_sets, dict):
+        feature_sets = feature_sets.values()
+    for feature_set in feature_sets:
         feature_set.save()
     super(Dataset, dataset).save()
     if len(dataset._files) > 0:
         dataset.files.set(dataset._files)
     if len(dataset._feature_sets) > 0:
-        dataset.feature_sets.set(dataset._feature_sets)
+        dataset.feature_sets.set(feature_sets)
 
 
 Dataset.__init__ = __init__
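`save()` now tolerates `dataset._feature_sets` being either a slot-keyed dict (what `File.from_df()`/`from_anndata()` produce below) or a plain iterable. The normalization step in isolation, with a hypothetical helper name:

    def normalize_feature_sets(feature_sets):
        # mirrors the logic in Dataset.save(): dicts contribute their values
        if isinstance(feature_sets, dict):
            return list(feature_sets.values())
        return list(feature_sets)

    assert normalize_feature_sets({"columns": "fs"}) == ["fs"]
    assert normalize_feature_sets(["fs"]) == ["fs"]
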
lamindb/_feature.py
CHANGED
@@ -1,5 +1,5 @@
 from itertools import islice
-from typing import List
+from typing import List, Optional, Union
 
 import pandas as pd
 from lamin_utils import logger
@@ -32,6 +32,12 @@ def __init__(self, *args, **kwargs):
     # now we proceed with the user-facing constructor
     if len(args) != 0:
         raise ValueError("Only non-keyword args allowed")
+    type: Optional[Union[type, str]] = kwargs.pop("type") if "type" in kwargs else None
+    if type is not None:
+        type_str = type.__name__ if not isinstance(type, str) else type
+    else:
+        type_str = None
+    kwargs["type"] = type_str
     super(Feature, self).__init__(*args, **kwargs)
 
 
@@ -50,7 +56,7 @@ def from_df(cls, df: "pd.DataFrame") -> List["Feature"]:
     categoricals_with_unmapped_categories = {}
     for name, col in df.items():
         if name in categoricals:
-            types[name] = "
+            types[name] = "category"
             categorical = categoricals[name]
             if hasattr(
                 categorical, "cat"
@@ -66,7 +72,7 @@ def from_df(cls, df: "pd.DataFrame") -> List["Feature"]:
     features = Feature.from_values(df.columns, field=Feature.name, types=types)
     assert len(features) == len(df.columns)
 
-    if len(
+    if len(categoricals_with_unmapped_categories) > 0:
         n_max = 20
         categoricals_with_unmapped_categories_formatted = "\n ".join(
             [
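With this change, `Feature(...)` accepts `type` either as a Python type or as a string and persists the string form. The normalization expression in isolation, as a standalone sketch:

    from typing import Optional, Union

    def normalize_type(type: Optional[Union[type, str]]) -> Optional[str]:
        # same expression as in Feature.__init__ above
        if type is not None:
            return type.__name__ if not isinstance(type, str) else type
        return None

    assert normalize_type(int) == "int"
    assert normalize_type("category") == "category"
    assert normalize_type(None) is None
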
lamindb/_feature_manager.py
ADDED
@@ -0,0 +1,176 @@
+from collections import defaultdict
+from typing import Iterable, List, Optional, Union
+
+import pandas as pd
+from lamin_utils import logger
+from lnschema_core.models import ORM, Dataset, Feature, FeatureSet, File
+
+from ._queryset import QuerySet
+from ._save import save
+
+
+def validate_and_cast_feature(feature) -> Feature:
+    if isinstance(feature, str):
+        feature_name = feature
+        feature = Feature.select(name=feature_name).one_or_none()
+        if feature is None:
+            raise ValueError(
+                f"Please create feature: ln.Feature(name='{feature_name}',"
+                " type='category').save()"
+            )
+    return feature
+
+
+def create_features_df(
+    file: File, feature_sets: List[FeatureSet], exclude: bool = True
+):
+    features = []
+    for feature_set in feature_sets:
+        if exclude:
+            features_df = feature_set.features.exclude(labels_orm__isnull=True).df()
+        else:
+            features_df = feature_set.features.df()
+        slots = file.feature_sets.through.objects.filter(
+            file=file, feature_set=feature_set
+        ).list("slot")
+        for slot in slots:
+            features_df["slot"] = slot
+            features.append(features_df)
+    features_df = pd.concat(features)
+    return features_df.sort_values(["labels_schema", "labels_orm"])
+
+
+class FeatureManager:
+    """Feature manager."""
+
+    def __init__(self, host: Union[File, Dataset]):
+        self._host = host
+        slot_feature_sets = (
+            self._feature_set_df_with_slots().reset_index().set_index("slot")["id"]
+        )
+        self._slots = {
+            slot: self._host.feature_sets.get(id=i)
+            for slot, i in slot_feature_sets.items()
+        }
+
+    def __repr__(self) -> str:
+        if len(self._slots) > 0:
+            msg = "slots:\n"
+            for slot, feature_set in self._slots.items():
+                msg += f" {slot}: {feature_set}\n"
+            return msg
+        else:
+            return "No linked features."
+
+    def __getitem__(self, slot) -> QuerySet:
+        id = (
+            self._host.feature_sets.through.objects.filter(
+                file_id=self._host.id, slot=slot
+            )
+            .one()
+            .feature_set_id
+        )
+        accessor_by_orm = {
+            field.related_model.__name__: field.name
+            for field in self._host._meta.related_objects
+        }
+        accessor_by_orm["Feature"] = "features"
+        feature_set = self._host.feature_sets.filter(id=id).one()
+        return getattr(feature_set, accessor_by_orm[feature_set.ref_orm]).all()
+
+    def _feature_set_df_with_slots(self) -> pd.DataFrame:
+        """Return DataFrame."""
+        df = self._host.feature_sets.df()
+        df.insert(
+            0,
+            "slot",
+            self._host.feature_sets.through.objects.filter(file_id=self._host.id)
+            .df()
+            .set_index("feature_set_id")
+            .slot,
+        )
+        return df
+
+    def add_labels(
+        self, records: Union[ORM, List[ORM]], feature: Optional[Union[str, ORM]] = None
+    ):
+        """Add one or several labels and associate them with a feature."""
+        if isinstance(records, str) or not isinstance(records, Iterable):
+            records = [records]
+        if isinstance(records[0], str):  # type: ignore
+            raise ValueError(
+                "Please pass a record (an ORM object), not a string, e.g., via: label"
+                f" = ln.Label(name='{records[0]}')"  # type: ignore
+            )
+        if self._host._state.adding:
+            raise ValueError("Please save the file or dataset before adding a label!")
+        feature = validate_and_cast_feature(feature)
+        records_by_orm = defaultdict(list)
+        records_by_feature_orm = defaultdict(list)
+        for record in records:
+            records_by_orm[record.__class__.__name__].append(record)
+            if feature is None:
+                try:
+                    record_feature = (
+                        record._feature
+                        if hasattr(record, "_feature")
+                        else record.feature
+                    )
+                except ValueError:
+                    raise ValueError("Pass feature argument")
+            else:
+                record_feature = feature
+            records_by_feature_orm[(record_feature, record.__class__.__name__)].append(
+                record
+            )
+        schema_and_accessor_by_orm = {
+            field.related_model.__name__: (
+                field.related_model.__get_schema_name__(),
+                field.name,
+            )
+            for field in self._host._meta.related_objects
+        }
+        schema_and_accessor_by_orm["Label"] = ("core", "labels")
+        for orm_name, records in records_by_orm.items():
+            save(records)
+            getattr(self._host, schema_and_accessor_by_orm[orm_name][1]).set(records)
+        accessor_by_orm = {
+            field.related_model.__name__: field.name
+            for field in self._host._meta.related_objects
+        }
+        accessor_by_orm["Feature"] = "features"
+        feature_sets = self._host.feature_sets.all()
+        feature_sets_by_orm = {
+            feature_set.ref_orm: feature_set for feature_set in feature_sets
+        }
+        for (feature, orm_name), records in records_by_feature_orm.items():
+            feature = validate_and_cast_feature(feature)
+            logger.info(f"Linking feature {feature.name} to {orm_name}")
+            feature.labels_orm = orm_name
+            feature.labels_schema = schema_and_accessor_by_orm[orm_name][0]
+            feature.save()
+            # check whether we have to update the feature set that manages labels
+            # (Feature) to account for a new feature
+            feature_set = feature_sets_by_orm["Feature"]
+            accessor = "features"
+            linked_features = getattr(feature_set, accessor)
+            if feature not in linked_features.all():
+                logger.info(
+                    f"Linking feature {feature.name} to feature set {feature_set}"
+                )
+                linked_features.add(feature)
+                feature_set.n += 1
+                feature_set.save()
+
+    def add_feature_set(self, feature_set: FeatureSet, slot: str):
+        if self._host._state.adding:
+            raise ValueError(
+                "Please save the file or dataset before adding a feature set!"
+            )
+        feature_set.save()
+        self._host.feature_sets.add(feature_set)
+        link_record = self._host.feature_sets.through.objects.filter(
+            file=self._host, feature_set=feature_set
+        ).one()
+        link_record.slot = slot
+        link_record.save()
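Together with the `File.features` property added in `_file.py` below, the manager is meant to be used roughly like this (a sketch; the file lookup and the `"project"` feature are hypothetical, and `add_labels` requires the host to be saved and the feature to exist with type `'category'`):

    import lamindb as ln

    file = ln.File.select(description="my table").one()  # hypothetical record
    print(file.features)          # repr lists slots and their feature sets
    file.features["columns"]      # QuerySet of features linked under one slot
    file.features.add_labels(ln.Label(name="test"), feature="project")
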
lamindb/_feature_set.py
CHANGED
@@ -4,7 +4,7 @@ import pandas as pd
 from django.db.models.query_utils import DeferredAttribute as Field
 from lamin_utils import logger
 from lamindb_setup.dev._docs import doc_args
-from lnschema_core import ORM, Feature, FeatureSet
+from lnschema_core import ORM, Feature, FeatureSet, ids
 from lnschema_core.types import ListLike
 
 from lamindb.dev.hashing import hash_set
@@ -60,9 +60,14 @@ def __init__(self, *args, **kwargs):
         kwargs.pop("ref_field") if "ref_field" in kwargs else "id"
     )
     type: Optional[Union[type, str]] = kwargs.pop("type") if "type" in kwargs else None
-
+    modality: Optional[str] = kwargs.pop("modality") if "modality" in kwargs else None
     name: Optional[str] = kwargs.pop("name") if "name" in kwargs else None
-
+    # hash is only internally used
+    hash: Optional[str] = kwargs.pop("hash") if "hash" in kwargs else None
+    if len(kwargs) > 0:
+        raise ValueError(
+            "Only features, ref_field, type, modality, name are valid keyword arguments"
+        )
 
     # now code
     features_orm = validate_features(features)
@@ -71,29 +76,30 @@ def __init__(self, *args, **kwargs):
     else:
         type = float
     n_features = len(features)
-
-
-    feature_set = FeatureSet.select(
+    if hash is None:
+        features_hash = hash_set({feature.id for feature in features})
+        feature_set = FeatureSet.select(hash=features_hash).one_or_none()
         if feature_set is not None:
-            logger.info("Loaded
+            logger.info(f"Loaded {feature_set}")
             init_self_from_db(self, feature_set)
            return None
         else:
-
+            hash = features_hash
     self._features = (get_related_name(features_orm), features)
     if type is not None:
         type_str = type.__name__ if not isinstance(type, str) else type
     else:
         type_str = None
     super(FeatureSet, self).__init__(
-        id=
+        id=ids.base62_20(),
         name=name,
         type=type_str,
         n=n_features,
-
+        modality=modality,
         ref_orm=features_orm.__name__,
         ref_schema=features_orm.__get_schema_name__(),
         ref_field=ref_field,
+        hash=hash,
     )
 
 
@@ -121,7 +127,7 @@ def from_values(
     field: Field = Feature.name,
     type: Optional[Union[Type, str]] = None,
     name: Optional[str] = None,
-
+    modality: Optional[str] = None,
     **kwargs,
 ) -> "FeatureSet":
     """{}"""
@@ -135,11 +141,10 @@ def from_values(
     iterable_idx = index_iterable(values)
     if not isinstance(iterable_idx[0], (str, int)):
         raise TypeError("values should be list-like of str or int")
-    n_features = len(iterable_idx)
     features_hash = hash_set(set(iterable_idx))
-    feature_set = FeatureSet.select(
+    feature_set = FeatureSet.select(hash=features_hash).one_or_none()
     if feature_set is not None:
-        logger.info("
+        logger.info(f"Loaded {feature_set}")
     else:
         from_bionty = ORM.__module__.startswith("lnschema_bionty")
         records = get_or_create_records(
@@ -150,13 +155,12 @@ def from_values(
         )
         # type_str = type.__name__ if not isinstance(type, str) else type
         feature_set = FeatureSet(
-
+            features=records,
+            hash=features_hash,
             name=name,
-
-            readout=readout,
+            modality=modality,
             type=type,
             ref_field=field.field.name,
-            features=records,
         )
     return feature_set
 
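Both constructors above now deduplicate by a content hash of the feature identifiers, so building the same feature set twice returns the existing record instead of a duplicate. The lookup pattern in isolation (a sketch using `hash_set` and `select`, both shown in the diff; requires a set-up instance):

    from lamindb.dev.hashing import hash_set
    from lnschema_core import FeatureSet

    values = {"CD8A", "CD4", "CD14"}
    features_hash = hash_set(values)  # order-independent digest of the set
    feature_set = FeatureSet.select(hash=features_hash).one_or_none()
    if feature_set is None:
        pass  # construct a new FeatureSet; otherwise reuse the loaded one
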
lamindb/_file.py
CHANGED
@@ -17,6 +17,7 @@ from lnschema_core import Feature, FeatureSet, File, Run, ids
 from lnschema_core.types import AnnDataLike, DataLike, PathLike
 
 from lamindb._context import context
+from lamindb.dev import FeatureManager
 from lamindb.dev._settings import settings
 from lamindb.dev.hashing import b16_to_b64, hash_file
 from lamindb.dev.storage import (
@@ -353,6 +354,19 @@ def data_is_anndata(data: DataLike):
     return False
 
 
+def data_is_mudata(data: DataLike):
+    try:
+        from mudata import MuData
+    except ModuleNotFoundError:
+        return False
+
+    if isinstance(data, MuData):
+        return True
+    if isinstance(data, (str, Path, UPath)):
+        return Path(data).suffix in {".h5mu"}
+    return False
+
+
 def __init__(file: File, *args, **kwargs):
     # Below checks for the Django-internal call in from_db()
     # it'd be better if we could avoid this, but not being able to create a File
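`data_is_mudata` guards its import so `mudata` stays an optional dependency, then falls back to a suffix check for path-like input. The same guard pattern, sketched for a hypothetical optional format package `foo`:

    from pathlib import Path

    def data_is_foo(data) -> bool:
        # hypothetical analog of data_is_mudata
        try:
            from foo import Foo  # optional dependency; absence is not an error
        except ModuleNotFoundError:
            return False
        if isinstance(data, Foo):
            return True
        if isinstance(data, (str, Path)):
            return Path(data).suffix == ".foo"
        return False
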
@@ -382,9 +396,7 @@ def __init__(file: File, *args, **kwargs):
     )
 
     if not len(kwargs) == 0:
-        raise ValueError(
-            "Only data, key, run, description & feature_sets can be passed."
-        )
+        raise ValueError("Only data, key, run, description can be passed.")
 
     if name is not None and description is not None:
         raise ValueError("Only pass description, do not pass a name")
@@ -392,19 +404,8 @@ def __init__(file: File, *args, **kwargs):
         logger.warning("Argument `name` is deprecated, please use `description`")
         description = name
 
-    if isinstance(data, pd.DataFrame) and log_hint:
-        logger.hint(
-            "This is a dataframe, consider using File.from_df() to link column"
-            " names as features!"
-        )
-    elif data_is_anndata(data) and log_hint:
-        logger.hint(
-            "This is AnnDataLike, consider using File.from_anndata() to link var_names"
-            " and obs.columns as features!"
-        )
-
     provisional_id = ids.base62_20()
-
+    kwargs_or_file, privates = get_file_kwargs_from_data(
         data=data,
         key=key,
         run=run,
@@ -412,17 +413,38 @@ def __init__(file: File, *args, **kwargs):
         provisional_id=provisional_id,
         skip_check_exists=skip_check_exists,
     )
+
     # an object with the same hash already exists
-    if isinstance(
+    if isinstance(kwargs_or_file, File):
         # this is the way Django instantiates from the DB internally
         # https://github.com/django/django/blob/549d6ffeb6d626b023acc40c3bb2093b4b25b3d6/django/db/models/base.py#LL488C1-L491C51
         new_args = [
-            getattr(
+            getattr(kwargs_or_file, field.attname)
+            for field in file._meta.concrete_fields
         ]
         super(File, file).__init__(*new_args)
         file._state.adding = False
         file._state.db = "default"
         return None
+    else:
+        kwargs = kwargs_or_file
+
+    if isinstance(data, pd.DataFrame):
+        if log_hint:
+            logger.hint(
+                "This is a dataframe, consider using File.from_df() to link column"
+                " names as features!"
+            )
+        kwargs["accessor"] = "DataFrame"
+    elif data_is_anndata(data):
+        if log_hint:
+            logger.hint(
+                "This is AnnDataLike, consider using File.from_anndata() to link"
+                " var_names and obs.columns as features!"
+            )
+        kwargs["accessor"] = "AnnData"
+    elif data_is_mudata(data):
+        kwargs["accessor"] = "MuData"
 
     kwargs["id"] = provisional_id
     kwargs["description"] = description
@@ -468,7 +490,7 @@ def from_df(
     """{}"""
     file = File(data=df, key=key, run=run, description=description, log_hint=False)
     feature_set = FeatureSet.from_df(df)
-    file._feature_sets =
+    file._feature_sets = {"columns": feature_set}
    return file
 
 
@@ -497,19 +519,22 @@ def from_anndata(
         type = "float"
     else:
         type = convert_numpy_dtype_to_lamin_feature_type(adata.X.dtype)
-    feature_sets =
-    logger.info("Parsing
+    feature_sets = {}
+    logger.info("Parsing feature names of X, stored in slot .var")
     logger.indent = " "
     feature_set_x = FeatureSet.from_values(
-        data_parse.var.index,
+        data_parse.var.index,
+        var_ref,
+        type=type,
     )
-    feature_sets
-    logger.indent = ""
-    logger.info("Parsing features of obs (numerical & categorical)")
-    logger.indent = " "
-    feature_set_obs = FeatureSet.from_df(data_parse.obs, name="obs")
-    feature_sets.append(feature_set_obs)
+    feature_sets["var"] = feature_set_x
     logger.indent = ""
+    if len(data_parse.obs.columns) > 0:
+        logger.info("Parsing feature names of slot .obs")
+        logger.indent = " "
+        feature_set_obs = FeatureSet.from_df(data_parse.obs)
+        feature_sets["obs"] = feature_set_obs
+        logger.indent = ""
     file._feature_sets = feature_sets
     return file
 
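After this change, `from_df()` registers its feature set under the `"columns"` slot and `from_anndata()` under `"var"` (plus `"obs"` when the AnnData has obs columns), which is exactly what `FeatureManager.__getitem__` resolves. A usage sketch, assuming an AnnData object `adata` and a Bionty gene field for `var_ref`:

    import lamindb as ln
    import lnschema_bionty as lb

    file = ln.File.from_anndata(adata, var_ref=lb.Gene.ensembl_gene_id)
    file.save()
    file.features["var"]   # features parsed from adata.var.index
    file.features["obs"]   # only present if adata.obs had columns
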
@@ -521,19 +546,13 @@ def from_dir(
     path: PathLike,
     *,
     run: Optional[Run] = None,
+    storage_root: Optional[PathLike] = None,
 ) -> List["File"]:
     """{}"""
     folderpath = UPath(path)
-
-
-
-        folder_key = get_relative_path_to_root(path=folderpath).as_posix()
-    else:
-        raise RuntimeError(
-            "Currently, only directories in default storage can be registered!\n"
-            "You can either move your folder into the current default storage"
-            "or add a new default storage through `ln.settings.storage`"
-        )
+    folder_key = get_relative_path_to_root(
+        path=folderpath, root=storage_root
+    ).as_posix()
     # always sanitize by stripping a trailing slash
     folder_key = folder_key.rstrip("/")
     logger.hint(f"using storage prefix = {folder_key}/")
@@ -634,9 +653,14 @@ def _track_run_input(file: File, is_run_input: Optional[bool] = None):
         # avoid cycles (a file is both input and output)
         if file.run != context.run:
             if settings.track_run_inputs:
+                transform_note = ""
+                if file.transform is not None:
+                    transform_note = (
+                        f", adding parent transform {file.transform.id}"
+                    )
                 logger.info(
-                    f"Adding file {file.id} as input for run
-                    f"
+                    f"Adding file {file.id} as input for run"
+                    f" {context.run.id}{transform_note}"
                 )
                 track_run_input = True
             else:
@@ -716,14 +740,21 @@ def _save_skip_storage(file, *args, **kwargs) -> None:
     if file.run is not None:
         file.run.save()
     if hasattr(file, "_feature_sets"):
-        for feature_set in file._feature_sets:
+        for feature_set in file._feature_sets.values():
             feature_set.save()
-    if hasattr(file, "_feature_values"):
-        for feature_value in file._feature_values:
-            feature_value.save()
     super(File, file).save(*args, **kwargs)
     if hasattr(file, "_feature_sets"):
-
+        links = []
+        for slot, feature_set in file._feature_sets.items():
+            links.append(
+                File.feature_sets.through(
+                    file_id=file.id, feature_set_id=feature_set.id, slot=slot
+                )
+            )
+
+        from lamindb._save import bulk_create
+
+        bulk_create(links)
 
 
 def path(self) -> Union[Path, UPath]:
@@ -820,6 +851,9 @@ def inherit_relations(self, file: File, fields: Optional[List[str]] = None):
     else:
         raise KeyError(f"No many-to-many relationship is found with '{field}'")
 
+    if None in related_names:
+        related_names.remove(None)
+
     inherit_names = [
         related_name
         for related_name in related_names
@@ -834,6 +868,15 @@ def inherit_relations(self, file: File, fields: Optional[List[str]] = None):
     )
 
 
+@property  # type: ignore
+@doc_args(File.features.__doc__)
+def features(self) -> "FeatureManager":
+    """{}"""
+    from lamindb._feature_manager import FeatureManager
+
+    return FeatureManager(self)
+
+
 METHOD_NAMES = [
     "__init__",
     "from_anndata",
@@ -864,5 +907,8 @@ for name in METHOD_NAMES:
 # privates currently dealt with separately
 File._delete_skip_storage = _delete_skip_storage
 File._save_skip_storage = _save_skip_storage
+# TODO: move these to METHOD_NAMES
 setattr(File, "view_lineage", view_lineage)
 setattr(File, "inherit_relations", inherit_relations)
+# property signature is not tested:
+setattr(File, "features", features)