lamindb 1.2a2__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,26 +3,21 @@ from __future__ import annotations
3
3
  from typing import TYPE_CHECKING
4
4
 
5
5
  import pandas as pd
6
- from django.core.exceptions import FieldDoesNotExist
7
6
  from lamin_utils import colors, logger
8
7
 
9
- from .record import Record
10
-
11
8
  if TYPE_CHECKING:
12
- from collections.abc import Iterable
13
-
14
- from lamindb.base.types import ListLike, StrField
9
+ from lamindb.base.types import FieldAttr, ListLike
15
10
 
16
11
  from .query_set import RecordList
12
+ from .record import Record
17
13
 
18
14
 
19
15
  # The base function for `from_values`
20
- def get_or_create_records(
16
+ def _from_values(
21
17
  iterable: ListLike,
22
- field: StrField,
18
+ field: FieldAttr,
23
19
  *,
24
20
  create: bool = False,
25
- from_source: bool = False,
26
21
  organism: Record | str | None = None,
27
22
  source: Record | None = None,
28
23
  mute: bool = False,
@@ -31,68 +26,67 @@ def get_or_create_records(
31
26
  from .query_set import RecordList
32
27
 
33
28
  registry = field.field.model # type: ignore
29
+ organism_record = get_organism_record_from_field(field, organism, values=iterable)
30
+ # TODO: the create is problematic if field is not a name field
34
31
  if create:
35
- return RecordList([registry(**{field.field.name: value}) for value in iterable]) # type: ignore
36
- organism = _get_organism_record(field, organism)
32
+ create_kwargs = {}
33
+ if organism_record:
34
+ create_kwargs["organism"] = organism_record
35
+ return RecordList(
36
+ [
37
+ registry(**{field.field.name: value}, **create_kwargs)
38
+ for value in iterable
39
+ ]
40
+ ) # type: ignore
41
+
37
42
  iterable_idx = index_iterable(iterable)
38
43
 
39
44
  # returns existing records & non-existing values
40
45
  records, nonexist_values, msg = get_existing_records(
41
46
  iterable_idx=iterable_idx,
42
47
  field=field,
43
- organism=organism,
48
+ organism=organism_record,
44
49
  mute=mute,
45
50
  )
46
51
 
47
52
  # new records to be created based on new values
48
53
  if len(nonexist_values) > 0:
49
- source_record = None
50
- if from_source:
51
- if isinstance(source, Record):
52
- source_record = source
53
- if not source_record and hasattr(registry, "public"):
54
- if organism is None:
55
- organism = _ensembl_prefix(nonexist_values[0], field, organism)
56
- organism = _get_organism_record(field, organism, force=True)
57
-
58
- if source_record:
59
- from bionty.core._add_ontology import check_source_in_db
60
-
61
- check_source_in_db(registry=registry, source=source_record)
62
-
63
- from_source = not source_record.in_db
64
- elif hasattr(registry, "source_id"):
65
- from_source = True
66
- else:
67
- from_source = False
68
-
69
- if from_source:
70
- records_bionty, unmapped_values = create_records_from_source(
54
+ if registry.__base__.__name__ == "BioRecord":
55
+ from bionty._organism import is_organism_required
56
+
57
+ # if possible and needed, get the organism record from the existing records
58
+ if (
59
+ organism_record is None
60
+ and len(records) > 0
61
+ and is_organism_required(registry)
62
+ ):
63
+ organism_record = records[0].organism
64
+ records_public, unmapped_values = create_records_from_source(
71
65
  iterable_idx=nonexist_values,
72
66
  field=field,
73
- organism=organism,
74
- source=source_record,
67
+ organism=organism_record,
68
+ source=source,
75
69
  msg=msg,
76
70
  mute=mute,
77
71
  )
78
- if len(records_bionty) > 0:
72
+ if len(records_public) > 0:
79
73
  msg = ""
80
- for record in records_bionty:
74
+ for record in records_public:
81
75
  record._from_source = True
82
- records += records_bionty
76
+ records += records_public
83
77
  else:
84
78
  unmapped_values = nonexist_values
85
79
  # unmapped new_ids will NOT create records
86
80
  if len(unmapped_values) > 0:
81
+ # first log the success message
87
82
  if len(msg) > 0 and not mute:
88
83
  logger.success(msg)
89
84
  s = "" if len(unmapped_values) == 1 else "s"
90
85
  print_values = colors.yellow(_format_values(unmapped_values))
91
- name = registry.__name__
92
86
  n_nonval = colors.yellow(f"{len(unmapped_values)} non-validated")
93
87
  if not mute:
94
88
  logger.warning(
95
- f"{colors.red('did not create')} {name} record{s} for "
89
+ f"{colors.red('did not create')} {registry.__name__} record{s} for "
96
90
  f"{n_nonval} {colors.italic(f'{field.field.name}{s}')}: {print_values}" # type: ignore
97
91
  )
98
92
  return RecordList(records)
@@ -100,25 +94,21 @@ def get_or_create_records(
100
94
 
101
95
  def get_existing_records(
102
96
  iterable_idx: pd.Index,
103
- field: StrField,
97
+ field: FieldAttr,
104
98
  organism: Record | None = None,
105
99
  mute: bool = False,
106
- ):
100
+ ) -> tuple[list, pd.Index, str]:
101
+ """Get existing records from the database."""
107
102
  # NOTE: existing records matching is agnostic to the source
108
103
  model = field.field.model # type: ignore
109
- if organism is None and field.field.name == "ensembl_gene_id": # type: ignore
110
- if len(iterable_idx) > 0:
111
- organism = _ensembl_prefix(iterable_idx[0], field, organism) # type: ignore
112
- organism = _get_organism_record(field, organism, force=True)
113
104
 
114
- # standardize based on the DB reference
115
105
  # log synonym-mapped terms
116
106
  syn_mapper = model.standardize(
117
107
  iterable_idx,
118
108
  field=field,
119
109
  organism=organism,
120
110
  mute=True,
121
- public_aware=False,
111
+ source_aware=False, # standardize only based on the DB reference
122
112
  return_mapper=True,
123
113
  )
124
114
  iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index
@@ -137,7 +127,6 @@ def get_existing_records(
137
127
  is_validated = model.validate(
138
128
  iterable_idx, field=field, organism=organism, mute=True
139
129
  )
140
-
141
130
  if len(is_validated) > 0:
142
131
  validated = iterable_idx[is_validated]
143
132
  else:
@@ -151,7 +140,7 @@ def get_existing_records(
151
140
  msg = (
152
141
  "loaded"
153
142
  f" {colors.green(f'{len(validated)} {model.__name__} record{s}')}"
154
- f" matching {colors.italic(f'{field.field.name}')}: {print_values}" # type: ignore
143
+ f" matching {colors.italic(f'{field.field.name}')}: {print_values}"
155
144
  )
156
145
  if len(syn_mapper) > 0:
157
146
  s = "" if len(syn_mapper) == 1 else "s"
@@ -173,15 +162,13 @@ def get_existing_records(
173
162
  msg = ""
174
163
 
175
164
  # get all existing records in the db
176
- # if necessary, create records for the values in kwargs
177
- # k:v -> k:v_record
178
165
  query = {f"{field.field.name}__in": iterable_idx.values} # type: ignore
179
166
  if organism is not None:
180
167
  query["organism"] = organism
181
168
  records = model.filter(**query).list()
182
169
 
183
170
  if len(validated) == len(iterable_idx):
184
- return records, [], msg
171
+ return records, pd.Index([]), msg
185
172
  else:
186
173
  nonval_values = iterable_idx.difference(validated)
187
174
  return records, nonval_values, msg
@@ -189,33 +176,35 @@ def get_existing_records(
189
176
 
190
177
  def create_records_from_source(
191
178
  iterable_idx: pd.Index,
192
- field: StrField,
179
+ field: FieldAttr,
193
180
  organism: Record | None = None,
194
181
  source: Record | None = None,
195
182
  msg: str = "",
196
183
  mute: bool = False,
197
- ):
184
+ ) -> tuple[list, pd.Index]:
185
+ """Create records from source."""
198
186
  model = field.field.model # type: ignore
199
187
  records: list = []
200
- # populate additional fields from bionty
201
- from bionty._bionty import get_source_record
202
- from bionty.core._bionty import filter_bionty_df_columns
188
+ # populate additional fields from public_df
189
+ from bionty._source import filter_public_df_columns, get_source_record
190
+
191
+ # get the default source
192
+ source_record = get_source_record(model, organism, source)
203
193
 
204
- # create the corresponding bionty object from model
194
+ # create the corresponding PublicOntology object from model
205
195
  try:
206
- # TODO: more generic
207
- public_ontology = model.public(organism=organism, source=source)
196
+ public_ontology = model.public(source=source_record)
208
197
  except Exception:
209
- # for custom records that are not created from public sources
198
+ # no public source
210
199
  return records, iterable_idx
211
- # get the default source
212
- if source is None:
213
- source = get_source_record(public_ontology, model)
214
200
 
215
- # filter the columns in bionty df based on fields
216
- bionty_df = filter_bionty_df_columns(model=model, public_ontology=public_ontology)
201
+ # filter the columns in public df based on fields
202
+ public_df = filter_public_df_columns(model=model, public_ontology=public_ontology)
203
+
204
+ if public_df.empty:
205
+ return records, iterable_idx
217
206
 
218
- # standardize in the bionty reference
207
+ # standardize in the public reference
219
208
  # do not inspect synonyms if the field is not name field
220
209
  inspect_synonyms = True
221
210
  if hasattr(model, "_name_field") and field.field.name != model._name_field: # type: ignore
@@ -241,27 +230,30 @@ def create_records_from_source(
241
230
 
242
231
  iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index
243
232
 
244
- # create records for values that are found in the bionty reference
233
+ # create records for values that are found in the public reference
245
234
  # matching either field or synonyms
246
- mapped_values = iterable_idx.intersection(bionty_df[field.field.name]) # type: ignore
235
+ mapped_values = iterable_idx.intersection(public_df[field.field.name]) # type: ignore
247
236
 
248
237
  multi_msg = ""
249
238
  if len(mapped_values) > 0:
250
- bionty_kwargs, multi_msg = _bulk_create_dicts_from_df(
239
+ public_kwargs, multi_msg = _bulk_create_dicts_from_df(
251
240
  keys=mapped_values,
252
241
  column_name=field.field.name, # type: ignore
253
- df=bionty_df,
242
+ df=public_df,
254
243
  )
255
244
 
256
- if hasattr(model, "organism_id") and organism is None:
257
- organism = _get_organism_record(field, source.organism, force=True)
245
+ # this is needed when the organism is required to create new records
246
+ if organism is None:
247
+ organism = get_organism_record_from_field(
248
+ field, source_record.organism, values=mapped_values
249
+ )
258
250
 
259
251
  create_kwargs = (
260
- {"organism": organism, "source": source}
252
+ {"organism": organism, "source": source_record}
261
253
  if organism is not None
262
- else {"source": source}
254
+ else {"source": source_record}
263
255
  )
264
- for bk in bionty_kwargs:
256
+ for bk in public_kwargs:
265
257
  records.append(model(**bk, **create_kwargs, _skip_validation=True))
266
258
 
267
259
  # number of records that matches field (not synonyms)
@@ -286,12 +278,13 @@ def create_records_from_source(
286
278
  if len(multi_msg) > 0 and not mute:
287
279
  logger.warning(multi_msg)
288
280
 
289
- # return the values that are not found in the bionty reference
281
+ # return the values that are not found in the public reference
290
282
  unmapped_values = iterable_idx.difference(mapped_values)
291
283
  return records, unmapped_values
292
284
 
293
285
 
294
- def index_iterable(iterable: Iterable) -> pd.Index:
286
+ def index_iterable(iterable: ListLike) -> pd.Index:
287
+ """Get unique values from an iterable."""
295
288
  idx = pd.Index(iterable).unique()
296
289
  # No entries are made for NAs, '', None
297
290
  # returns an ordered unique not null list
@@ -299,8 +292,9 @@ def index_iterable(iterable: Iterable) -> pd.Index:
299
292
 
300
293
 
301
294
  def _format_values(
302
- names: Iterable, n: int = 20, quotes: bool = True, sep: str = "'"
295
+ names: ListLike, n: int = 20, quotes: bool = True, sep: str = "'"
303
296
  ) -> str:
297
+ """Format values for printing."""
304
298
  if isinstance(names, dict):
305
299
  items = {
306
300
  f"{key}: {value}": None
@@ -345,50 +339,47 @@ def _bulk_create_dicts_from_df(
345
339
  return df.reset_index().to_dict(orient="records"), multi_msg
346
340
 
347
341
 
348
- def _has_organism_field(registry: type[Record]) -> bool:
349
- try:
350
- registry._meta.get_field("organism")
351
- return True
352
- except FieldDoesNotExist:
353
- return False
354
-
355
-
356
- def _get_organism_record( # type: ignore
357
- field: StrField, organism: str | Record, force: bool = False
358
- ) -> Record:
342
+ def get_organism_record_from_field( # type: ignore
343
+ field: FieldAttr,
344
+ organism: str | Record | None = None,
345
+ values: ListLike = None,
346
+ using_key: str | None = None,
347
+ ) -> Record | None:
359
348
  """Get organism record.
360
349
 
361
350
  Args:
362
351
  field: the field to get the organism record for
363
352
  organism: the organism to get the record for
364
- force: whether to force fetching the organism record
365
- """
366
- registry = field.field.model # type: ignore
367
- check = True
368
- if not force and hasattr(registry, "_ontology_id_field"):
369
- check = field.field.name != registry._ontology_id_field # type: ignore
370
- # e.g. bionty.CellMarker has "name" as _ontology_id_field
371
- if not registry._ontology_id_field.endswith("id"):
372
- check = True
353
+ values: the values to get the organism record for
354
+ using_key: the db to get the organism record for
373
355
 
374
- if _has_organism_field(registry) and check:
375
- from bionty._bionty import create_or_get_organism_record
376
-
377
- if field and not isinstance(field, str):
378
- field = field.field.name
356
+ Returns:
357
+ The organism record if:
358
+ The organism FK is required for the registry
359
+ The field is not unique or the organism is not None
360
+ """
361
+ if values is None:
362
+ values = []
363
+ registry = field.field.model
364
+ field_str = field.field.name
365
+ # id field is a unique field that's not a relation
366
+ is_simple_field_unique = field.field.unique and not field.field.is_relation
367
+ check = not is_simple_field_unique or organism is not None
368
+
369
+ if (
370
+ registry.__get_name_with_module__() == "bionty.Gene"
371
+ and field.field.name == "ensembl_gene_id"
372
+ and len(values) > 0
373
+ and organism is None
374
+ ): # type: ignore
375
+ from bionty._organism import organism_from_ensembl_id
376
+
377
+ return organism_from_ensembl_id(values[0], using_key) # type: ignore
378
+
379
+ if registry.__base__.__name__ == "BioRecord" and check:
380
+ from bionty._organism import create_or_get_organism_record
379
381
 
380
382
  organism_record = create_or_get_organism_record(
381
- organism=organism, registry=registry, field=field
383
+ organism=organism, registry=registry, field=field_str
382
384
  )
383
- if organism_record is not None:
384
- return organism_record
385
-
386
-
387
- def _ensembl_prefix(id: str, field: StrField, organism: Record | None) -> str | None:
388
- if field.field.name == "ensembl_gene_id" and organism is None: # type: ignore
389
- if id.startswith("ENSG"):
390
- organism = "human" # type: ignore
391
- elif id.startswith("ENSMUSG"):
392
- organism = "mouse" # type: ignore
393
-
394
- return organism
385
+ return organism_record
@@ -142,7 +142,7 @@ def _save_validated_records(
142
142
  # save labels from ontology_ids
143
143
  if hasattr(registry, "_ontology_id_field") and label_uids:
144
144
  try:
145
- records = registry.from_values(label_uids, field=field)
145
+ records = registry.from_values(label_uids, field=field, mute=True)
146
146
  save([r for r in records if r._state.adding])
147
147
  except Exception: # noqa: S110
148
148
  pass
@@ -240,7 +240,7 @@ class LabelManager:
240
240
  continue
241
241
  # look for features
242
242
  data_name_lower = data.__class__.__name__.lower()
243
- labels_by_features = defaultdict(list)
243
+ labels_by_features: dict = defaultdict(list)
244
244
  features = set()
245
245
  new_labels = save_validated_records(labels)
246
246
  if len(new_labels) > 0:
@@ -248,18 +248,24 @@ class LabelManager:
248
248
  new_labels, using_key, transfer_logs=transfer_logs
249
249
  )
250
250
  for label in labels:
251
+ keys: list = []
251
252
  # if the link table doesn't follow this convention, we'll ignore it
252
253
  if not hasattr(label, f"links_{data_name_lower}"):
253
254
  key = None
255
+ keys.append(key)
254
256
  else:
255
- link = getattr(label, f"links_{data_name_lower}").get(
256
- **{f"{data_name_lower}_id": data.id}
257
+ links = (
258
+ getattr(label, f"links_{data_name_lower}")
259
+ .filter(**{f"{data_name_lower}_id": data.id})
260
+ .all()
257
261
  )
258
- if link.feature is not None:
259
- features.add(link.feature)
260
- key = link.feature.name
261
- else:
262
- key = None
262
+ for link in links:
263
+ if link.feature is not None:
264
+ features.add(link.feature)
265
+ key = link.feature.name
266
+ else:
267
+ key = None
268
+ keys.append(key)
263
269
  label_returned = transfer_to_default_db(
264
270
  label,
265
271
  using_key,
@@ -270,7 +276,8 @@ class LabelManager:
270
276
  # TODO: refactor return value of transfer to default db
271
277
  if label_returned is not None:
272
278
  label = label_returned
273
- labels_by_features[key].append(label)
279
+ for key in keys:
280
+ labels_by_features[key].append(label)
274
281
  # treat features
275
282
  new_features = save_validated_records(list(features))
276
283
  if len(new_features) > 0: