lamindb 0.45.0__py3-none-any.whl → 0.46a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lamindb/_from_values.py CHANGED
@@ -1,20 +1,17 @@
1
- from typing import Any, Dict, Iterable, List, Optional, TypeVar, Union
1
+ from typing import Any, Dict, Iterable, List, Tuple, Union
2
2
 
3
- import numpy as np
4
3
  import pandas as pd
5
- from django.db.models import Q
4
+ from django.core.exceptions import FieldDoesNotExist
6
5
  from django.db.models.query_utils import DeferredAttribute as Field
7
6
  from lamin_logger import colors, logger
8
- from lamindb_setup.dev import deprecated
9
- from lnschema_core.models import BaseORM
7
+ from lnschema_core.models import ORM
8
+ from lnschema_core.types import ListLike
10
9
 
11
10
  from ._select import select
12
11
  from .dev._settings import settings
13
12
 
14
- ListLike = TypeVar("ListLike", pd.Series, list, np.array)
15
13
 
16
-
17
- # The base function for `from_iter` and `from_bionty`
14
+ # The base function for `from_values`
18
15
  def get_or_create_records(
19
16
  iterable: ListLike,
20
17
  field: Field,
@@ -47,7 +44,7 @@ def get_or_create_records(
47
44
  if len(unmapped_values) > 0:
48
45
  for i in unmapped_values:
49
46
  records.append(model(**{field_name: i}, **kwargs))
50
- logger.hint(
47
+ logger.info(
51
48
  "Created"
52
49
  f" {colors.red(f'{len(unmapped_values)} {model.__name__} records')}"
53
50
  f" with a single field {colors.red(f'{field_name}')}"
@@ -57,103 +54,20 @@ def get_or_create_records(
57
54
  settings.upon_create_search_names = upon_create_search_names
58
55
 
59
56
 
60
- @deprecated("ORM.from_iter()")
61
- def parse(
62
- iterable: Union[ListLike, pd.DataFrame],
63
- field: Union[Field, Dict[str, Field]],
64
- *,
65
- species: Optional[str] = None,
66
- ) -> List[BaseORM]:
67
- """Parse identifiers and create records through lookups for a given field.
68
-
69
- Guide: :doc:`/biology/registries`.
70
-
71
- Args:
72
- iterable: `Union[ListLike, pd.DataFrame]` A `ListLike` of identifiers or
73
- a `DataFrame`.
74
- field: `Union[Field, Dict[str, Field]]` If `iterable` is `ListLike`, a
75
- `BaseORM` field to look up.
76
- If `iterable` is `DataFrame`, a dict of `{column_name1: field1,
77
- column_name2: field2}`.
78
- species: `Optional[str]` Either `"human"`, `"mouse"`, or any other
79
- `name` of `Bionty.Species`. If `None`, will use default species in
80
- bionty for each entity.
81
-
82
- Returns:
83
- A list of records.
84
-
85
- For every `value` in an iterable of identifiers and a given `ORM.field`,
86
- this function performs:
87
-
88
- 1. It checks whether the value already exists in the database
89
- (`ORM.select(field=value)`). If so, it adds the queried record to
90
- the returned list and skips step 2. Otherwise, proceed with 2.
91
- 2. If the `ORM` is from `lnschema_bionty`, it checks whether there is an
92
- exact match in the underlying ontology (`Bionty.inspect(value, field)`).
93
- If so, it creates a record from Bionty and adds it to the returned list.
94
- Otherwise, it create a record that populates a single field using `value`
95
- and adds the record to the returned list.
96
-
97
- """
98
- upon_create_search_names = settings.upon_create_search_names
99
- settings.upon_create_search_names = False
100
- try:
101
- if isinstance(iterable, pd.DataFrame):
102
- # check the field must be a dictionary
103
- if not isinstance(field, dict):
104
- raise TypeError("field must be a dictionary of {column_name: Field}!")
105
-
106
- # check only one single model class is passed
107
- class_mapper = {f.field.name: f.field.model for f in field.values()}
108
- if len(set(class_mapper.values())) > 1:
109
- raise NotImplementedError("fields must from the same entity!")
110
- model = list(class_mapper.values())[0]
111
-
112
- df = _map_columns_to_fields(df=iterable, field=field)
113
- df_records = df.to_dict(orient="records")
114
-
115
- # make sure to only return 1 existing entry for each row
116
- queryset = get_existing_records_multifields(
117
- df_records=df_records, model=model
118
- )
119
- records = queryset.list()
120
- df_records_new = [
121
- i for i in df_records if not queryset.filter(**i).exists()
122
- ]
123
-
124
- if len(records) > 0:
125
- logger.hint(
126
- "Returned"
127
- f" {colors.green(f'{len(records)} existing {model.__name__} DB records')}" # noqa
128
- )
129
- if len(df_records_new) > 0:
130
- logger.hint(
131
- "Created"
132
- f" {colors.purple(f'{len(df_records_new)} {model.__name__} records')} with" # noqa
133
- f" {df.shape[1]} fields"
134
- )
135
- records += [model(**i) for i in df_records_new]
136
- return records
137
- else:
138
- if not isinstance(field, Field):
139
- raise TypeError("field must be an ORM field, e.g., `CellType.name`!")
140
- return get_or_create_records(
141
- iterable=iterable, field=field, species=species
142
- )
143
- finally:
144
- settings.upon_create_search_names = upon_create_search_names
145
-
146
-
147
- def index_iterable(iterable: Iterable) -> pd.Index:
148
- idx = pd.Index(iterable).unique()
149
- # No entries are made for NAs, '', None
150
- # returns an ordered unique not null list
151
- return idx[(idx != "") & (~idx.isnull())]
152
-
153
-
154
57
  def get_existing_records(iterable_idx: pd.Index, field: Field, kwargs: Dict = {}):
155
58
  field_name = field.field.name
156
59
  model = field.field.model
60
+ condition: Dict = {}
61
+
62
+ if _has_species_field(model):
63
+ from lnschema_bionty._bionty import create_or_get_species_record
64
+
65
+ species_record = create_or_get_species_record(
66
+ species=kwargs.get("species"), orm=model
67
+ )
68
+ if species_record is not None:
69
+ kwargs.update({"species": species_record})
70
+ condition.update({"species__name": species_record.name})
157
71
 
158
72
  # map synonyms based on the DB reference
159
73
  syn_mapper = model.map_synonyms(
@@ -173,22 +87,21 @@ def get_existing_records(iterable_idx: pd.Index, field: Field, kwargs: Dict = {}
173
87
  # if necessary, create records for the values in kwargs
174
88
  # k:v -> k:v_record
175
89
  # kwargs is used to deal with species
176
- condition = {f"{field_name}__in": iterable_idx.values}
177
- kwargs, condition = _species_kwargs(orm=model, kwargs=kwargs, condition=condition)
90
+ condition.update({f"{field_name}__in": iterable_idx.values})
178
91
 
179
92
  stmt = select(model, **condition)
180
93
 
181
94
  records = stmt.list() # existing records
182
95
  n_name = len(records) - len(syn_mapper)
183
96
  if n_name > 0:
184
- logger.hint(
97
+ logger.info(
185
98
  "Returned"
186
99
  f" {colors.green(f'{n_name} existing {model.__name__} DB records')} that"
187
100
  f" matched {colors.green(f'{field_name}')} field"
188
101
  )
189
102
  # make sure that synonyms logging appears after the field logging
190
103
  if len(syn_msg) > 0:
191
- logger.hint(syn_msg)
104
+ logger.info(syn_msg)
192
105
 
193
106
  existing_values = iterable_idx.intersection(stmt.values_list(field_name, flat=True))
194
107
  nonexist_values = iterable_idx.difference(existing_values)
@@ -196,33 +109,6 @@ def get_existing_records(iterable_idx: pd.Index, field: Field, kwargs: Dict = {}
196
109
  return records, nonexist_values
197
110
 
198
111
 
199
- def get_existing_records_multifields(
200
- df_records: List, model: BaseORM, kwargs: Dict = {}
201
- ):
202
- q = Q(**df_records[0])
203
- for df_record in df_records[1:]:
204
- q = q.__getattribute__("__or__")(Q(**df_record))
205
-
206
- kwargs, condition = _species_kwargs(orm=model, kwargs=kwargs)
207
- stmt = model.select(**condition).filter(q)
208
- return stmt
209
-
210
-
211
- def _species_kwargs(orm: BaseORM, kwargs: Dict = {}, condition: Dict = {}):
212
- """Create records based on the kwargs."""
213
- if kwargs.get("species") is not None:
214
- from lnschema_bionty._bionty import create_or_get_species_record
215
-
216
- species_record = create_or_get_species_record(
217
- species=kwargs.get("species"), orm=orm
218
- )
219
- if species_record is not None:
220
- kwargs.update({"species": species_record})
221
- condition.update({"species__name": species_record.name})
222
-
223
- return kwargs, condition
224
-
225
-
226
112
  def create_records_from_bionty(
227
113
  iterable_idx: pd.Index,
228
114
  field: Field,
@@ -232,10 +118,10 @@ def create_records_from_bionty(
232
118
  field_name = field.field.name
233
119
  records: List = []
234
120
  # populate additional fields from bionty
235
- from lnschema_bionty._bionty import get_bionty_object, get_bionty_source_record
121
+ from lnschema_bionty._bionty import get_bionty_source_record
236
122
 
237
123
  # create the corresponding bionty object from model
238
- bionty_object = get_bionty_object(orm=model, species=kwargs.get("species"))
124
+ bionty_object = model.bionty(species=kwargs.get("species"))
239
125
  # add bionty_source record to the kwargs
240
126
  kwargs.update({"bionty_source": get_bionty_source_record(bionty_object)})
241
127
 
@@ -261,7 +147,7 @@ def create_records_from_bionty(
261
147
  mapped_values = iterable_idx.intersection(bionty_df[field_name])
262
148
 
263
149
  if len(mapped_values) > 0:
264
- bionty_kwargs = _bulk_create_dicts_from_df(
150
+ bionty_kwargs, multi_msg = _bulk_create_dicts_from_df(
265
151
  keys=mapped_values, column_name=field_name, df=bionty_df
266
152
  )
267
153
  for bk in bionty_kwargs:
@@ -282,46 +168,57 @@ def create_records_from_bionty(
282
168
  f" {colors.purple(f'{n_name} {model.__name__} records from Bionty')} that" # noqa
283
169
  f" matched {colors.purple(f'{field_name}')} field"
284
170
  )
285
- logger.hint(msg + source_msg)
171
+ logger.info(msg + source_msg)
286
172
  # make sure that synonyms logging appears after the field logging
287
173
  if len(msg_syn) > 0:
288
- logger.hint(msg_syn + source_msg)
174
+ logger.info(msg_syn + source_msg)
175
+ # warning about multi matches
176
+ if len(multi_msg) > 0:
177
+ logger.warning(multi_msg)
289
178
 
290
179
  # return the values that are not found in the bionty reference
291
180
  unmapped_values = iterable_idx.difference(mapped_values)
292
181
  return records, unmapped_values
293
182
 
294
183
 
295
- def _filter_bionty_df_columns(model: BaseORM, bionty_object: Any) -> pd.DataFrame:
184
+ def index_iterable(iterable: Iterable) -> pd.Index:
185
+ idx = pd.Index(iterable).unique()
186
+ # No entries are made for NAs, '', None
187
+ # returns an ordered unique not null list
188
+ return idx[(idx != "") & (~idx.isnull())]
189
+
190
+
191
+ def _filter_bionty_df_columns(model: ORM, bionty_object: Any) -> pd.DataFrame:
296
192
  bionty_df = pd.DataFrame()
297
193
  if bionty_object is not None:
298
194
  model_field_names = {i.name for i in model._meta.fields}
195
+ # parents needs to be added here as relationships aren't in fields
196
+ model_field_names.add("parents")
299
197
  bionty_df = bionty_object.df().reset_index()
198
+ # rename definition to description for the lnschema_bionty
199
+ bionty_df.rename(columns={"definition": "description"}, inplace=True)
300
200
  bionty_df = bionty_df.loc[:, bionty_df.columns.isin(model_field_names)]
301
201
  return bionty_df
302
202
 
303
203
 
304
204
  def _bulk_create_dicts_from_df(
305
205
  keys: Union[set, List], column_name: str, df: pd.DataFrame
306
- ) -> dict:
206
+ ) -> Tuple[Dict, str]:
307
207
  """Get fields from a DataFrame for many rows."""
208
+ multi_msg = ""
308
209
  if df.index.name != column_name:
309
- df = df.set_index(column_name)
310
- # keep the last record (assuming most recent) if duplicated
311
- df = df[~df.index.duplicated(keep="last")]
312
- return df.loc[list(keys)].reset_index().to_dict(orient="records")
313
-
314
-
315
- def _map_columns_to_fields(df: pd.DataFrame, field: dict) -> pd.DataFrame:
316
- """Subset dataframe to mappable fields columns and clean up."""
317
- column_mapper = {colname: f.field.name for colname, f in field.items()}
318
- # subset to columns containing fields
319
- df = df.copy()
320
- if df.index.name is not None:
321
- df = df.reset_index()
322
- df = df.loc[:, df.columns.isin(field.keys())]
323
- df = df.rename(columns=column_mapper)
324
- df = df.dropna().drop_duplicates()
325
- # TODO: remove after having the auto conversion for django ORMs
326
- df = df.mask(df == "", None)
327
- return df
210
+ df = df.set_index(column_name).loc[list(keys)]
211
+ if not df.index.is_unique:
212
+ # return all records for multi-matches with a warning
213
+ dup = df.index[df.index.duplicated()].unique().tolist()
214
+ multi_msg = f"Multiple matches found in Bionty for: {dup}"
215
+
216
+ return df.reset_index().to_dict(orient="records"), multi_msg
217
+
218
+
219
+ def _has_species_field(orm: ORM) -> bool:
220
+ try:
221
+ orm._meta.get_field("species")
222
+ return True
223
+ except FieldDoesNotExist:
224
+ return False