lamindb 0.69.2__py3-none-any.whl → 0.69.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,265 +0,0 @@
1
- from typing import Dict, List, Optional, Tuple, Union
2
-
3
- import anndata as ad
4
- import pandas as pd
5
- from lamin_utils import colors, logger
6
- from lnschema_core.types import FieldAttr
7
-
8
- import lamindb as ln
9
-
10
- from ._validate import (
11
- check_registry_organism,
12
- get_registry_instance,
13
- standardize_and_inspect,
14
- )
15
-
16
-
17
def register_artifact(
    data: Union[pd.DataFrame, ad.AnnData],
    description: str,
    fields: Dict[str, FieldAttr],
    feature_field: FieldAttr,
    **kwargs,
) -> ln.Artifact:
    """Register a data object as an Artifact and link its features and labels.

    Args:
        data: The DataFrame or AnnData object to register.
        description: A description of the artifact.
        fields: A dictionary mapping a column name (an ``obs`` column for
            AnnData) to the registry field its values are looked up in.
        feature_field: The registry field to validate variables index against.
        kwargs: Additional keyword arguments forwarded to ``from_values`` of
            every label registry. An ``organism`` entry is extracted and only
            handed to registries that require one.

    Returns:
        The registered Artifact.

    Raises:
        ValueError: If ``data`` is neither a DataFrame nor an AnnData object,
            or (via ``check_registry_organism``) if a registry needs an
            organism and none is available.
    """
    is_anndata = isinstance(data, ad.AnnData)
    if is_anndata:
        artifact = ln.Artifact.from_anndata(data, description=description)
        artifact.n_observations = data.n_obs
    elif isinstance(data, pd.DataFrame):
        artifact = ln.Artifact.from_df(data, description=description)
    else:
        raise ValueError("data must be a DataFrame or AnnData object")
    artifact.save()

    # Keep the caller-supplied organism in its own variable. Resolving it per
    # registry must not overwrite it: a registry without an organism resolves
    # to None, which previously discarded the organism for all later registries.
    requested_organism = kwargs.pop("organism", None)

    feature_kwargs: Dict = {}
    feature_organism = check_registry_organism(
        feature_field.field.model, requested_organism
    )
    if feature_organism is not None:
        feature_kwargs["organism"] = feature_organism

    if is_anndata:
        artifact.features.add_from_anndata(var_field=feature_field, **feature_kwargs)
    else:
        artifact.features.add_from_df(field=feature_field, **feature_kwargs)

    # Label columns live in ``obs`` for AnnData and in the frame itself otherwise.
    df = data.obs if is_anndata else data
    features = ln.Feature.lookup().dict()
    for feature_name, field in fields.items():
        registry = field.field.model
        filter_kwargs = kwargs.copy()
        registry_organism = check_registry_organism(registry, requested_organism)
        if registry_organism is not None:
            filter_kwargs["organism"] = registry_organism
        labels = registry.from_values(df[feature_name], field=field, **filter_kwargs)
        artifact.labels.add(labels, features.get(feature_name))

    slug = ln.setup.settings.instance.slug
    logger.success(f"registered artifact in {colors.italic(slug)}")
    if ln.setup.settings.instance.is_remote:
        logger.info(f"🔗 https://lamin.ai/{slug}/artifact/{artifact.uid}")

    return artifact
75
-
76
-
77
def register_labels(
    values: List[str],
    field: FieldAttr,
    feature_name: str,
    using: Optional[str] = None,
    validated_only: bool = True,
    kwargs: Optional[Dict] = None,
    df: Optional[pd.DataFrame] = None,
) -> None:
    """Register feature or label records in the default instance.

    Values that are not yet validated in the default instance are looked up
    in the ``using`` instance first and then created via ``from_values``.
    Whatever is still unmatched is created as a plain record only when
    ``validated_only`` is False.

    Args:
        values: A list of values to be registered as labels.
        field: The FieldAttr object representing the field for which labels are being registered.
        feature_name: The name of the feature to register.
        using: The name of the instance from which to transfer labels (if applicable).
        validated_only: If True, only register validated labels. Forced to
            False when the registry is ``ln.ULabel``.
        kwargs: Additional keyword arguments to pass to the registry model.
            The mapping is copied, the caller's dict is not modified.
        df: A DataFrame to register labels from. Only used when the registry
            is ``ln.Feature`` and ``validated_only`` is False.
    """
    filter_kwargs = dict(kwargs) if kwargs is not None else {}
    registry = field.field.model
    if registry == ln.ULabel:
        validated_only = False

    organism = check_registry_organism(registry, filter_kwargs.pop("organism", None))
    if organism is not None:
        filter_kwargs["organism"] = organism

    field_name = field.field.name
    previous_verbosity = ln.settings.verbosity
    try:
        # Silence the registry calls below; the summary is logged at the end.
        ln.settings.verbosity = "error"
        inspection = standardize_and_inspect(
            values=values, field=field, registry=registry, **filter_kwargs
        )
        if not inspection.non_validated:
            # Everything is already validated: nothing to register or log.
            ln.settings.verbosity = previous_verbosity
            return

        labels_registered: Dict = {"from public": [], "without reference": []}

        transferred, remaining = register_labels_from_using_instance(
            inspection.non_validated,
            field=field,
            using=using,
            kwargs=filter_kwargs,
        )
        labels_registered[f"from {using}"] = transferred

        if remaining:
            public_records = registry.from_values(
                remaining, field=field, **filter_kwargs
            )
        else:
            public_records = []
        ln.save(public_records)

        from_public = [getattr(record, field_name) for record in public_records]
        labels_registered["from public"] = from_public
        labels_registered["without reference"] = [
            label for label in remaining if label not in from_public
        ]

        if not validated_only:
            if df is not None and registry == ln.Feature:
                new_records = ln.Feature.from_df(df)
            else:
                new_records = []
                if "organism" in filter_kwargs:
                    # Swap the organism name for a saved organism record.
                    filter_kwargs["organism"] = _register_organism(name=organism)
                for value in labels_registered["without reference"]:
                    filter_kwargs[field_name] = value
                    if registry == ln.Feature:
                        filter_kwargs["type"] = "category"
                    new_records.append(registry(**filter_kwargs))
            ln.save(new_records)

        if registry == ln.ULabel and field_name == "name":
            register_ulabels_with_parent(values, field=field, feature_name=feature_name)
    finally:
        ln.settings.verbosity = previous_verbosity

    log_registered_labels(
        labels_registered,
        feature_name=feature_name,
        model_field=f"{registry.__name__}.{field_name}",
        validated_only=validated_only,
    )
166
-
167
-
168
def log_registered_labels(
    labels_registered: Dict,
    feature_name: str,
    model_field: str,
    validated_only: bool = True,
) -> None:
    """Log a summary of the registered (and skipped) labels.

    Args:
        labels_registered: Mapping of an origin key (e.g. ``"from public"``,
            ``"without reference"``) to the list of label values of that origin.
            Origins with an empty list are skipped.
        feature_name: The feature the labels belong to; ``"feature"`` switches
            the wording from "labels" to "features".
        model_field: ``"<Registry>.<field>"`` text shown in the messages.
        validated_only: If True, the ``"without reference"`` labels were not
            registered and are reported as a warning with a hint.
    """
    labels_type = "features" if feature_name == "feature" else "labels"
    model_field = colors.italic(model_field)
    for key, labels in labels_registered.items():
        if not labels:
            continue

        if key == "without reference" and validated_only:
            msg = colors.yellow(
                f"{len(labels)} non-validated {labels_type} are not registered with {model_field}: {labels}!"
            )
            # The hint must be valid Python; the former ".lookup().['x']" had a
            # stray dot before the subscript.
            # NOTE(review): assumes the Lookup object supports item access - confirm.
            lookup_print = f".lookup()['{feature_name}']"
            msg += f"\n → to lookup categories, use {lookup_print}"
            if labels_type == "features":
                msg += f"\n → to register, run {colors.yellow('register_features(validated_only=False)')}"
            else:
                msg += f"\n → to register, set {colors.yellow('validated_only=False')}"
            logger.warning(msg)
        else:
            origin = "" if key == "without reference" else f"{colors.green(key)} "
            logger.success(
                f"registered {len(labels)} {labels_type} {origin}with {model_field}: {labels}"
            )
198
-
199
-
200
def register_ulabels_with_parent(
    values: List[str], field: FieldAttr, feature_name: str
) -> None:
    """Attach the given ULabel values as children of an ``is_<feature_name>`` label.

    The parent label is created and saved if it does not exist yet.

    Args:
        values: The label values to group under the parent.
        field: The ULabel field the values belong to.
        feature_name: The feature name used to build the parent label name.
    """
    registry = field.field.model
    assert registry == ln.ULabel
    parent_name = f"is_{feature_name}"
    child_records = registry.from_values(values, field=field)
    parent = registry.filter(name=parent_name).one_or_none()
    if parent is None:
        parent = registry(name=parent_name)
        parent.save()
    parent.children.add(*child_records)
212
-
213
-
214
def register_labels_from_using_instance(
    values: List[str],
    field: FieldAttr,
    using: Optional[str] = None,
    kwargs: Optional[Dict] = None,
) -> Tuple[List[str], List[str]]:
    """Register feature or label records found in the using instance.

    Args:
        values: A list of values to be registered as labels.
        field: The FieldAttr object representing the field for which labels are being registered.
        using: The name of the instance from which to transfer labels (if applicable).
            ``None`` and ``"default"`` mean there is nothing to transfer from.
        kwargs: Additional keyword arguments to pass to the registry model.

    Returns:
        A tuple containing the list of registered labels and the list of non-registered labels.
        Without a using instance this is ``([], values)``.
    """
    kwargs = kwargs or {}
    if using is None or using == "default":
        return [], values

    field_name = field.field.name
    registry_using = get_registry_instance(field.field.model, using)
    inspection = standardize_and_inspect(
        values=values, field=field, registry=registry_using, **kwargs
    )
    matches = registry_using.filter(**{f"{field_name}__in": inspection.validated}).all()

    transferred = []
    for record in matches:
        # Saving a record queried from the using instance registers it here.
        record.save()
        transferred.append(getattr(record, field_name))

    return transferred, inspection.non_validated
250
-
251
-
252
def _register_organism(name: str):
    """Return the organism record with the given name, creating it if needed.

    An existing record is returned as is. Otherwise the record is built via
    ``bt.Organism.from_public`` and saved.

    Raises:
        ValueError: If no record with that name exists and ``from_public``
            returns nothing for it.
    """
    import bionty as bt

    existing = bt.Organism.filter(name=name).one_or_none()
    if existing is not None:
        return existing

    created = bt.Organism.from_public(name=name)
    if created is None:
        raise ValueError(
            f"Organism '{name}' not found\n"
            f" → please register it: bt.Organism(name='{name}').save()"
        )
    created.save()
    return created
@@ -1,139 +0,0 @@
1
- from typing import Dict, Iterable, Optional
2
-
3
- import pandas as pd
4
- from anndata import AnnData
5
- from lamin_utils import colors, logger
6
- from lnschema_core import Registry
7
- from lnschema_core.types import FieldAttr
8
-
9
- from lamindb._from_values import _print_values
10
-
11
-
12
def get_registry_instance(registry: Registry, using: Optional[str] = None) -> Registry:
    """Return the registry bound to the ``using`` instance.

    ``None`` and ``"default"`` leave the registry untouched.
    """
    if using is None or using == "default":
        return registry
    return registry.using(using)
17
-
18
-
19
def standardize_and_inspect(
    values: Iterable[str], field: FieldAttr, registry: Registry, **kwargs
):
    """Inspect values against a registry, standardizing them first if possible.

    Registries without a ``standardize`` attribute are inspected with the
    values as given. Both calls are muted and receive ``kwargs`` unchanged.

    Returns:
        Whatever ``registry.inspect`` returns.
    """
    candidates = values
    if hasattr(registry, "standardize"):
        candidates = registry.standardize(values, field=field, mute=True, **kwargs)
    return registry.inspect(candidates, field=field, mute=True, **kwargs)
26
-
27
-
28
def check_registry_organism(
    registry: Registry, organism: Optional[str] = None
) -> Optional[str]:
    """Resolve the organism name to use with a registry.

    Returns:
        ``None`` if the registry has no ``organism_id`` attribute; otherwise
        the given organism, falling back to the name of ``bt.settings.organism``.

    Raises:
        ValueError: If the registry needs an organism and neither the argument
            nor ``bt.settings.organism`` provides one.
    """
    if not hasattr(registry, "organism_id"):
        return None

    # Imported lazily so bionty is only needed for organism-aware registries.
    import bionty as bt

    if organism is None and bt.settings.organism is None:
        raise ValueError(
            f"{registry.__name__} registry requires an organism!\n"
            " → please pass an organism name via organism="
        )
    return organism or bt.settings.organism.name
42
-
43
-
44
def validate_categories(
    values: Iterable[str],
    field: FieldAttr,
    feature_name: str,
    using: Optional[str] = None,
    **kwargs,
) -> bool:
    """Validate ontology terms in a pandas series using LaminDB registries.

    The values are inspected in the default instance first; whatever is not
    validated there is inspected again in the ``using`` instance, if given.

    Args:
        values: The values to validate.
        field: The registry field to validate the values against.
        feature_name: Name of the feature, used in the log messages.
        using: The reference instance to fall back to (``None`` or
            ``"default"`` disables the fallback).
        kwargs: Only ``organism`` is read; it is passed on to registries that
            require an organism.

    Returns:
        True if every value is validated, False otherwise.

    Raises:
        ValueError: Via ``check_registry_organism`` if the registry needs an
            organism and none is available.
    """
    registry = field.field.model
    model_field = f"{registry.__name__}.{field.field.name}"
    logger.indent = ""
    logger.info(
        f"inspecting '{colors.bold(feature_name)}' by {colors.italic(model_field)}"
    )
    logger.indent = " "
    try:
        filter_kwargs = {}
        organism = check_registry_organism(registry, kwargs.get("organism"))
        if organism is not None:
            filter_kwargs["organism"] = organism

        # Inspect the default instance
        inspect_result = standardize_and_inspect(
            values=values, field=field, registry=registry, **filter_kwargs
        )
        non_validated = inspect_result.non_validated

        if using is not None and using != "default" and non_validated:
            registry = get_registry_instance(registry, using)
            # Inspect the using instance
            inspect_result = standardize_and_inspect(
                values=non_validated, field=field, registry=registry, **filter_kwargs
            )
            non_validated = inspect_result.non_validated

        n_non_validated = len(non_validated)
        if n_non_validated == 0:
            logger.success(f"all {feature_name}s are validated")
            return True

        are = "are" if n_non_validated > 1 else "is"
        print_values = _print_values(non_validated)
        feature_name_print = f".register_labels('{feature_name}')"
        warning_message = (
            f"{colors.yellow(f'{n_non_validated} terms')} {are} not validated: "
            f"{colors.yellow(print_values)}\n → register terms via "
            f"{colors.yellow(feature_name_print)}"
        )
        logger.warning(warning_message)
        return False
    finally:
        # Restore the shared logger's indent on every exit path. It used to be
        # reset only after a failed validation, so a success or an exception
        # left all later log output indented.
        logger.indent = ""
95
-
96
-
97
def validate_categories_in_df(
    df: pd.DataFrame,
    fields: Dict[str, FieldAttr],
    using: Optional[str] = None,
    **kwargs,
) -> bool:
    """Validate the mapped DataFrame columns using LaminDB registries.

    Every column in ``fields`` is validated, even after an earlier one failed,
    so that all problems get reported.

    Returns:
        True if all columns are validated (also when ``fields`` is empty).
    """
    # Build the full list first: no short-circuit on the first failure.
    outcomes = [
        validate_categories(
            df[feature_name],
            field=field,
            feature_name=feature_name,
            using=using,
            **kwargs,
        )
        for feature_name, field in fields.items()
    ]
    return all(outcomes)
114
-
115
-
116
def validate_anndata(
    adata: AnnData,
    var_field: FieldAttr,
    obs_fields: Dict[str, FieldAttr],
    using: Optional[str] = None,
    **kwargs,
) -> bool:
    """Inspect metadata in an AnnData object using LaminDB registries.

    Validates the ``var`` index against ``var_field`` and the ``obs`` columns
    listed in ``obs_fields``. Both checks always run.

    Returns:
        True only if the variables and all mapped ``obs`` columns are validated.
    """
    if not (using is None or using == "default"):
        logger.important(
            f"validating metadata using registries of instance {colors.italic(using)}"
        )

    var_ok = validate_categories(
        adata.var.index,
        field=var_field,
        feature_name="variables",
        using=using,
        **kwargs,
    )
    obs_ok = validate_categories_in_df(
        adata.obs, fields=obs_fields, using=using, **kwargs
    )
    return var_ok and obs_ok
@@ -1,221 +0,0 @@
1
- from typing import Dict, Iterable, Optional
2
-
3
- import pandas as pd
4
- from lamin_utils import colors, logger
5
- from lnschema_core.types import FieldAttr
6
-
7
- import lamindb as ln
8
-
9
- from ._lookup import Lookup
10
- from ._register import register_artifact, register_labels
11
- from ._validate import validate_categories_in_df
12
-
13
-
14
class ValidationError(ValueError):
    """Error signalling that a data object has not been validated."""
18
-
19
-
20
class Validator:
    """Lamin validator.

    Args:
        df: The DataFrame object to validate.
        fields: A dictionary mapping column to registry_field.
            For example:
            {"cell_type_ontology_id": bt.CellType.ontology_id, "donor_id": ln.ULabel.name}
        feature_field: The field attribute for the feature column.
        using: The reference instance containing registries to validate against.
        verbosity: The verbosity level.
        **kwargs: Extra keyword arguments (e.g. ``organism``) that are stored
            and passed on to the validation and registration helpers.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        fields: Optional[Dict[str, FieldAttr]] = None,
        feature_field: FieldAttr = ln.Feature.name,
        using: Optional[str] = None,
        verbosity: str = "hint",
        **kwargs,
    ) -> None:
        """Initialize the Validator and register the features of ``df``.

        Raises:
            ValueError: If a key of ``fields`` is not a column of ``df``.
        """
        self._df = df
        self._fields = fields or {}
        self._feature_field = feature_field
        self._using = using
        # NOTE: this sets the global lamindb verbosity, not a per-validator one.
        ln.settings.verbosity = verbosity
        self._artifact = None
        self._collection = None
        self._validated = False
        self._kwargs: Dict = kwargs
        self.register_features()

    @property
    def fields(self) -> Dict:
        """Return the columns fields to validate against."""
        return self._fields

    def lookup(self, using: Optional[str] = None) -> Lookup:
        """Lookup features and labels.

        Args:
            using: The instance where the lookup is performed.
                if None (default), the lookup is performed on the instance specified in "using" parameter of the Validator.
                if "public", the lookup is performed on the public reference.
        """
        # The feature field is exposed under the reserved key "feature".
        fields = {"feature": self._feature_field, **self.fields}
        return Lookup(fields=fields, using=using or self._using)

    def register_features(self, validated_only: bool = True) -> None:
        """Register features records.

        Columns named in ``fields`` are always registered; the remaining
        columns of the DataFrame follow ``validated_only``.

        Raises:
            ValueError: If a key of ``fields`` is not a column of the DataFrame.
        """
        missing_columns = set(self.fields.keys()) - set(self._df.columns)
        if missing_columns:
            raise ValueError(
                f"Columns {missing_columns} are not found in the data object!"
            )

        # Always register features specified as the fields keys
        register_labels(
            values=list(self.fields.keys()),
            field=self._feature_field,
            feature_name="feature",
            using=self._using,
            validated_only=False,
            kwargs=self._kwargs,
        )

        # Register the rest of the columns based on validated_only
        additional_columns = set(self._df.columns) - set(self.fields.keys())
        if additional_columns:
            register_labels(
                values=list(additional_columns),
                field=self._feature_field,
                feature_name="feature",
                using=self._using,
                validated_only=validated_only,
                df=self._df,  # Get the Feature type from df
                kwargs=self._kwargs,
            )

    def register_labels(self, feature: str, validated_only: bool = True, **kwargs):
        """Register labels for a feature.

        Args:
            feature: The name of the feature to register. ``"all"`` registers
                the labels of every mapped column, ``"feature"`` registers the
                features themselves.
            validated_only: Whether to register only validated labels.
            **kwargs: Additional keyword arguments. They take precedence over
                the keyword arguments stored on the validator.

        Raises:
            ValueError: If ``feature`` is not one of the mapped columns.
        """
        if feature == "all":
            self._register_labels_all(validated_only=validated_only, **kwargs)
        elif feature == "feature":
            self.register_features(validated_only=validated_only)
        else:
            if feature not in self.fields:
                raise ValueError(f"Feature {feature} is not part of the fields!")
            register_labels(
                values=self._df[feature].unique().tolist(),
                field=self.fields[feature],
                feature_name=feature,
                using=self._using,
                validated_only=validated_only,
                # Include the stored kwargs (e.g. organism) so registration
                # uses the same settings as validate() and register_features().
                kwargs={**self._kwargs, **kwargs},
            )

    def _register_labels_all(self, validated_only: bool = True, **kwargs):
        """Register labels for all features."""
        for name in self.fields:
            logger.info(f"registering labels for '{name}'")
            self.register_labels(feature=name, validated_only=validated_only, **kwargs)

    def validate(self, **kwargs) -> bool:
        """Validate variables and categorical observations.

        Args:
            **kwargs: Merged into the keyword arguments stored on the validator
                before validating.

        Returns:
            Whether the DataFrame is validated.
        """
        self._kwargs.update(kwargs)
        self._validated = validate_categories_in_df(
            self._df,
            fields=self.fields,
            using=self._using,
            **self._kwargs,
        )
        return self._validated

    def register_artifact(self, description: str, **kwargs) -> ln.Artifact:
        """Register the validated DataFrame and metadata.

        Args:
            description: Description of the DataFrame object.
            **kwargs: Object level metadata.

        Returns:
            A registered artifact record.

        Raises:
            ValidationError: If ``validate()`` has not succeeded yet.
        """
        self._kwargs.update(kwargs)
        if not self._validated:
            raise ValidationError(
                f"Data object is not validated, please run {colors.yellow('validate()')}!"
            )

        # Make sure all labels are registered in the current instance
        verbosity = ln.settings.verbosity
        try:
            ln.settings.verbosity = "warning"
            self.register_labels("all")

            self._artifact = register_artifact(
                self._df,
                description=description,
                fields=self.fields,
                feature_field=self._feature_field,
                **self._kwargs,
            )
        finally:
            ln.settings.verbosity = verbosity

        return self._artifact

    def register_collection(
        self,
        # Quoted so the ``|`` union is not evaluated when the class is defined
        # (that fails on Python < 3.10).
        artifact: "ln.Artifact | Iterable[ln.Artifact]",
        name: str,
        description: Optional[str] = None,
        reference: Optional[str] = None,
        reference_type: Optional[str] = None,
    ) -> ln.Collection:
        """Register a collection from artifact/artifacts.

        Args:
            artifact: One or several registered Artifacts.
            name: Title of the publication.
            description: Description of the publication.
            reference: Accession number (e.g. GSE#, E-MTAB#, etc.).
            reference_type: Source type (e.g. GEO, ArrayExpress, SRA, etc.).

        Returns:
            The saved collection record.
        """
        collection = ln.Collection(
            artifact,
            name=name,
            description=description,
            reference=reference,
            reference_type=reference_type,
        )
        slug = ln.setup.settings.instance.slug
        # Read the state before saving: it tells whether the record is new.
        is_new = collection._state.adding
        collection.save()
        if is_new:
            logger.success(f"registered collection in {colors.italic(slug)}")
        else:
            logger.warning(f"collection already exists in {colors.italic(slug)}!")
        if ln.setup.settings.instance.is_remote:
            logger.print(f"🔗 https://lamin.ai/{slug}/collection/{collection.uid}")
        self._collection = collection
        return collection

    def clean_up_failed_runs(self):
        """Clean up previous failed runs that don't register any outputs."""
        # NOTE(review): only ``transform`` is checked; this assumes
        # ``ln.run_context.run`` is set whenever a transform is - confirm.
        if ln.run_context.transform is not None:
            ln.Run.filter(
                transform=ln.run_context.transform, output_artifacts=None
            ).exclude(uid=ln.run_context.run.uid).delete()
File without changes