lamindb 0.45a1__py3-none-any.whl → 0.46a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +30 -9
- lamindb/_context.py +11 -12
- lamindb/_dataset.py +142 -0
- lamindb/_delete.py +6 -6
- lamindb/_feature_set.py +138 -0
- lamindb/_file.py +322 -81
- lamindb/_from_values.py +57 -160
- lamindb/_orm.py +398 -0
- lamindb/_save.py +26 -10
- lamindb/_select.py +3 -3
- lamindb/_view.py +2 -2
- lamindb/dev/__init__.py +2 -2
- lamindb/dev/_settings.py +2 -1
- lamindb/dev/datasets/__init__.py +6 -0
- lamindb/dev/datasets/_core.py +30 -0
- lamindb/dev/hashing.py +4 -0
- lamindb/dev/storage/__init__.py +4 -3
- lamindb/dev/storage/_backed_access.py +3 -3
- lamindb/dev/storage/{_file.py → file.py} +48 -3
- lamindb/dev/storage/{_object.py → object.py} +1 -0
- lamindb/dev/utils.py +9 -0
- lamindb/types.py +9 -1
- {lamindb-0.45a1.dist-info → lamindb-0.46a1.dist-info}/METADATA +20 -17
- lamindb-0.46a1.dist-info/RECORD +36 -0
- lamindb/_baseorm_methods.py +0 -535
- lamindb/_featureset_methods.py +0 -73
- lamindb/_file_access.py +0 -48
- lamindb/_file_methods.py +0 -319
- lamindb-0.45a1.dist-info/RECORD +0 -36
- /lamindb/{_transform_methods.py → _transform.py} +0 -0
- {lamindb-0.45a1.dist-info → lamindb-0.46a1.dist-info}/LICENSE +0 -0
- {lamindb-0.45a1.dist-info → lamindb-0.46a1.dist-info}/WHEEL +0 -0
- {lamindb-0.45a1.dist-info → lamindb-0.46a1.dist-info}/entry_points.txt +0 -0
lamindb/_from_values.py
CHANGED
@@ -1,20 +1,17 @@
|
|
1
|
-
from typing import Any, Dict, Iterable, List,
|
1
|
+
from typing import Any, Dict, Iterable, List, Tuple, Union
|
2
2
|
|
3
|
-
import numpy as np
|
4
3
|
import pandas as pd
|
5
|
-
from django.
|
4
|
+
from django.core.exceptions import FieldDoesNotExist
|
6
5
|
from django.db.models.query_utils import DeferredAttribute as Field
|
7
6
|
from lamin_logger import colors, logger
|
8
|
-
from
|
9
|
-
from lnschema_core.
|
7
|
+
from lnschema_core.models import ORM
|
8
|
+
from lnschema_core.types import ListLike
|
10
9
|
|
11
10
|
from ._select import select
|
12
11
|
from .dev._settings import settings
|
13
12
|
|
14
|
-
ListLike = TypeVar("ListLike", pd.Series, list, np.array)
|
15
13
|
|
16
|
-
|
17
|
-
# The base function for `from_iter` and `from_bionty`
|
14
|
+
# The base function for `from_values`
|
18
15
|
def get_or_create_records(
|
19
16
|
iterable: ListLike,
|
20
17
|
field: Field,
|
@@ -47,7 +44,7 @@ def get_or_create_records(
|
|
47
44
|
if len(unmapped_values) > 0:
|
48
45
|
for i in unmapped_values:
|
49
46
|
records.append(model(**{field_name: i}, **kwargs))
|
50
|
-
logger.
|
47
|
+
logger.info(
|
51
48
|
"Created"
|
52
49
|
f" {colors.red(f'{len(unmapped_values)} {model.__name__} records')}"
|
53
50
|
f" with a single field {colors.red(f'{field_name}')}"
|
@@ -57,103 +54,20 @@ def get_or_create_records(
|
|
57
54
|
settings.upon_create_search_names = upon_create_search_names
|
58
55
|
|
59
56
|
|
60
|
-
@deprecated("ORM.from_iter()")
|
61
|
-
def parse(
|
62
|
-
iterable: Union[ListLike, pd.DataFrame],
|
63
|
-
field: Union[Field, Dict[str, Field]],
|
64
|
-
*,
|
65
|
-
species: Optional[str] = None,
|
66
|
-
) -> List[BaseORM]:
|
67
|
-
"""Parse identifiers and create records through lookups for a given field.
|
68
|
-
|
69
|
-
Guide: :doc:`/biology/registries`.
|
70
|
-
|
71
|
-
Args:
|
72
|
-
iterable: `Union[ListLike, pd.DataFrame]` A `ListLike` of identifiers or
|
73
|
-
a `DataFrame`.
|
74
|
-
field: `Union[Field, Dict[str, Field]]` If `iterable` is `ListLike`, a
|
75
|
-
`BaseORM` field to look up.
|
76
|
-
If `iterable` is `DataFrame`, a dict of `{column_name1: field1,
|
77
|
-
column_name2: field2}`.
|
78
|
-
species: `Optional[str]` Either `"human"`, `"mouse"`, or any other
|
79
|
-
`name` of `Bionty.Species`. If `None`, will use default species in
|
80
|
-
bionty for each entity.
|
81
|
-
|
82
|
-
Returns:
|
83
|
-
A list of records.
|
84
|
-
|
85
|
-
For every `value` in an iterable of identifiers and a given `ORM.field`,
|
86
|
-
this function performs:
|
87
|
-
|
88
|
-
1. It checks whether the value already exists in the database
|
89
|
-
(`ORM.select(field=value)`). If so, it adds the queried record to
|
90
|
-
the returned list and skips step 2. Otherwise, proceed with 2.
|
91
|
-
2. If the `ORM` is from `lnschema_bionty`, it checks whether there is an
|
92
|
-
exact match in the underlying ontology (`Bionty.inspect(value, field)`).
|
93
|
-
If so, it creates a record from Bionty and adds it to the returned list.
|
94
|
-
Otherwise, it create a record that populates a single field using `value`
|
95
|
-
and adds the record to the returned list.
|
96
|
-
|
97
|
-
"""
|
98
|
-
upon_create_search_names = settings.upon_create_search_names
|
99
|
-
settings.upon_create_search_names = False
|
100
|
-
try:
|
101
|
-
if isinstance(iterable, pd.DataFrame):
|
102
|
-
# check the field must be a dictionary
|
103
|
-
if not isinstance(field, dict):
|
104
|
-
raise TypeError("field must be a dictionary of {column_name: Field}!")
|
105
|
-
|
106
|
-
# check only one single model class is passed
|
107
|
-
class_mapper = {f.field.name: f.field.model for f in field.values()}
|
108
|
-
if len(set(class_mapper.values())) > 1:
|
109
|
-
raise NotImplementedError("fields must from the same entity!")
|
110
|
-
model = list(class_mapper.values())[0]
|
111
|
-
|
112
|
-
df = _map_columns_to_fields(df=iterable, field=field)
|
113
|
-
df_records = df.to_dict(orient="records")
|
114
|
-
|
115
|
-
# make sure to only return 1 existing entry for each row
|
116
|
-
queryset = get_existing_records_multifields(
|
117
|
-
df_records=df_records, model=model
|
118
|
-
)
|
119
|
-
records = queryset.list()
|
120
|
-
df_records_new = [
|
121
|
-
i for i in df_records if not queryset.filter(**i).exists()
|
122
|
-
]
|
123
|
-
|
124
|
-
if len(records) > 0:
|
125
|
-
logger.hint(
|
126
|
-
"Returned"
|
127
|
-
f" {colors.green(f'{len(records)} existing {model.__name__} DB records')}" # noqa
|
128
|
-
)
|
129
|
-
if len(df_records_new) > 0:
|
130
|
-
logger.hint(
|
131
|
-
"Created"
|
132
|
-
f" {colors.purple(f'{len(df_records_new)} {model.__name__} records')} with" # noqa
|
133
|
-
f" {df.shape[1]} fields"
|
134
|
-
)
|
135
|
-
records += [model(**i) for i in df_records_new]
|
136
|
-
return records
|
137
|
-
else:
|
138
|
-
if not isinstance(field, Field):
|
139
|
-
raise TypeError("field must be an ORM field, e.g., `CellType.name`!")
|
140
|
-
return get_or_create_records(
|
141
|
-
iterable=iterable, field=field, species=species
|
142
|
-
)
|
143
|
-
finally:
|
144
|
-
settings.upon_create_search_names = upon_create_search_names
|
145
|
-
|
146
|
-
|
147
|
-
def index_iterable(iterable: Iterable) -> pd.Index:
|
148
|
-
idx = pd.Index(iterable).unique()
|
149
|
-
# No entries are made for NAs, '', None
|
150
|
-
# returns an ordered unique not null list
|
151
|
-
return idx[(idx != "") & (~idx.isnull())]
|
152
|
-
|
153
|
-
|
154
57
|
def get_existing_records(iterable_idx: pd.Index, field: Field, kwargs: Dict = {}):
|
155
58
|
field_name = field.field.name
|
156
59
|
model = field.field.model
|
60
|
+
condition: Dict = {}
|
61
|
+
|
62
|
+
if _has_species_field(model):
|
63
|
+
from lnschema_bionty._bionty import create_or_get_species_record
|
64
|
+
|
65
|
+
species_record = create_or_get_species_record(
|
66
|
+
species=kwargs.get("species"), orm=model
|
67
|
+
)
|
68
|
+
if species_record is not None:
|
69
|
+
kwargs.update({"species": species_record})
|
70
|
+
condition.update({"species__name": species_record.name})
|
157
71
|
|
158
72
|
# map synonyms based on the DB reference
|
159
73
|
syn_mapper = model.map_synonyms(
|
@@ -173,22 +87,21 @@ def get_existing_records(iterable_idx: pd.Index, field: Field, kwargs: Dict = {}
|
|
173
87
|
# if necessary, create records for the values in kwargs
|
174
88
|
# k:v -> k:v_record
|
175
89
|
# kwargs is used to deal with species
|
176
|
-
condition
|
177
|
-
kwargs, condition = _species_kwargs(orm=model, kwargs=kwargs, condition=condition)
|
90
|
+
condition.update({f"{field_name}__in": iterable_idx.values})
|
178
91
|
|
179
92
|
stmt = select(model, **condition)
|
180
93
|
|
181
94
|
records = stmt.list() # existing records
|
182
95
|
n_name = len(records) - len(syn_mapper)
|
183
96
|
if n_name > 0:
|
184
|
-
logger.
|
97
|
+
logger.info(
|
185
98
|
"Returned"
|
186
99
|
f" {colors.green(f'{n_name} existing {model.__name__} DB records')} that"
|
187
100
|
f" matched {colors.green(f'{field_name}')} field"
|
188
101
|
)
|
189
102
|
# make sure that synonyms logging appears after the field logging
|
190
103
|
if len(syn_msg) > 0:
|
191
|
-
logger.
|
104
|
+
logger.info(syn_msg)
|
192
105
|
|
193
106
|
existing_values = iterable_idx.intersection(stmt.values_list(field_name, flat=True))
|
194
107
|
nonexist_values = iterable_idx.difference(existing_values)
|
@@ -196,33 +109,6 @@ def get_existing_records(iterable_idx: pd.Index, field: Field, kwargs: Dict = {}
|
|
196
109
|
return records, nonexist_values
|
197
110
|
|
198
111
|
|
199
|
-
def get_existing_records_multifields(
|
200
|
-
df_records: List, model: BaseORM, kwargs: Dict = {}
|
201
|
-
):
|
202
|
-
q = Q(**df_records[0])
|
203
|
-
for df_record in df_records[1:]:
|
204
|
-
q = q.__getattribute__("__or__")(Q(**df_record))
|
205
|
-
|
206
|
-
kwargs, condition = _species_kwargs(orm=model, kwargs=kwargs)
|
207
|
-
stmt = model.select(**condition).filter(q)
|
208
|
-
return stmt
|
209
|
-
|
210
|
-
|
211
|
-
def _species_kwargs(orm: BaseORM, kwargs: Dict = {}, condition: Dict = {}):
|
212
|
-
"""Create records based on the kwargs."""
|
213
|
-
if kwargs.get("species") is not None:
|
214
|
-
from lnschema_bionty._bionty import create_or_get_species_record
|
215
|
-
|
216
|
-
species_record = create_or_get_species_record(
|
217
|
-
species=kwargs.get("species"), orm=orm
|
218
|
-
)
|
219
|
-
if species_record is not None:
|
220
|
-
kwargs.update({"species": species_record})
|
221
|
-
condition.update({"species__name": species_record.name})
|
222
|
-
|
223
|
-
return kwargs, condition
|
224
|
-
|
225
|
-
|
226
112
|
def create_records_from_bionty(
|
227
113
|
iterable_idx: pd.Index,
|
228
114
|
field: Field,
|
@@ -232,10 +118,10 @@ def create_records_from_bionty(
|
|
232
118
|
field_name = field.field.name
|
233
119
|
records: List = []
|
234
120
|
# populate additional fields from bionty
|
235
|
-
from lnschema_bionty._bionty import
|
121
|
+
from lnschema_bionty._bionty import get_bionty_source_record
|
236
122
|
|
237
123
|
# create the corresponding bionty object from model
|
238
|
-
bionty_object =
|
124
|
+
bionty_object = model.bionty(species=kwargs.get("species"))
|
239
125
|
# add bionty_source record to the kwargs
|
240
126
|
kwargs.update({"bionty_source": get_bionty_source_record(bionty_object)})
|
241
127
|
|
@@ -261,7 +147,7 @@ def create_records_from_bionty(
|
|
261
147
|
mapped_values = iterable_idx.intersection(bionty_df[field_name])
|
262
148
|
|
263
149
|
if len(mapped_values) > 0:
|
264
|
-
bionty_kwargs = _bulk_create_dicts_from_df(
|
150
|
+
bionty_kwargs, multi_msg = _bulk_create_dicts_from_df(
|
265
151
|
keys=mapped_values, column_name=field_name, df=bionty_df
|
266
152
|
)
|
267
153
|
for bk in bionty_kwargs:
|
@@ -282,46 +168,57 @@ def create_records_from_bionty(
|
|
282
168
|
f" {colors.purple(f'{n_name} {model.__name__} records from Bionty')} that" # noqa
|
283
169
|
f" matched {colors.purple(f'{field_name}')} field"
|
284
170
|
)
|
285
|
-
logger.
|
171
|
+
logger.info(msg + source_msg)
|
286
172
|
# make sure that synonyms logging appears after the field logging
|
287
173
|
if len(msg_syn) > 0:
|
288
|
-
logger.
|
174
|
+
logger.info(msg_syn + source_msg)
|
175
|
+
# warning about multi matches
|
176
|
+
if len(multi_msg) > 0:
|
177
|
+
logger.warning(multi_msg)
|
289
178
|
|
290
179
|
# return the values that are not found in the bionty reference
|
291
180
|
unmapped_values = iterable_idx.difference(mapped_values)
|
292
181
|
return records, unmapped_values
|
293
182
|
|
294
183
|
|
295
|
-
def
|
184
|
+
def index_iterable(iterable: Iterable) -> pd.Index:
|
185
|
+
idx = pd.Index(iterable).unique()
|
186
|
+
# No entries are made for NAs, '', None
|
187
|
+
# returns an ordered unique not null list
|
188
|
+
return idx[(idx != "") & (~idx.isnull())]
|
189
|
+
|
190
|
+
|
191
|
+
def _filter_bionty_df_columns(model: ORM, bionty_object: Any) -> pd.DataFrame:
|
296
192
|
bionty_df = pd.DataFrame()
|
297
193
|
if bionty_object is not None:
|
298
194
|
model_field_names = {i.name for i in model._meta.fields}
|
195
|
+
# parents needs to be added here as relationships aren't in fields
|
196
|
+
model_field_names.add("parents")
|
299
197
|
bionty_df = bionty_object.df().reset_index()
|
198
|
+
# rename definition to description for the lnschema_bionty
|
199
|
+
bionty_df.rename(columns={"definition": "description"}, inplace=True)
|
300
200
|
bionty_df = bionty_df.loc[:, bionty_df.columns.isin(model_field_names)]
|
301
201
|
return bionty_df
|
302
202
|
|
303
203
|
|
304
204
|
def _bulk_create_dicts_from_df(
|
305
205
|
keys: Union[set, List], column_name: str, df: pd.DataFrame
|
306
|
-
) ->
|
206
|
+
) -> Tuple[Dict, str]:
|
307
207
|
"""Get fields from a DataFrame for many rows."""
|
208
|
+
multi_msg = ""
|
308
209
|
if df.index.name != column_name:
|
309
|
-
df = df.set_index(column_name)
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
df = df.dropna().drop_duplicates()
|
325
|
-
# TODO: remove after having the auto conversion for django ORMs
|
326
|
-
df = df.mask(df == "", None)
|
327
|
-
return df
|
210
|
+
df = df.set_index(column_name).loc[list(keys)]
|
211
|
+
if not df.index.is_unique:
|
212
|
+
# return all records for multi-matches with a warning
|
213
|
+
dup = df.index[df.index.duplicated()].unique().tolist()
|
214
|
+
multi_msg = f"Multiple matches found in Bionty for: {dup}"
|
215
|
+
|
216
|
+
return df.reset_index().to_dict(orient="records"), multi_msg
|
217
|
+
|
218
|
+
|
219
|
+
def _has_species_field(orm: ORM) -> bool:
|
220
|
+
try:
|
221
|
+
orm._meta.get_field("species")
|
222
|
+
return True
|
223
|
+
except FieldDoesNotExist:
|
224
|
+
return False
|