lamindb 1.2a2__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,28 +1,27 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import re
3
4
  from typing import TYPE_CHECKING
4
5
 
5
6
  import pandas as pd
6
7
  from django.core.exceptions import FieldDoesNotExist
7
8
  from lamin_utils import colors, logger
8
9
 
9
- from .record import Record
10
-
11
10
  if TYPE_CHECKING:
12
11
  from collections.abc import Iterable
13
12
 
14
- from lamindb.base.types import ListLike, StrField
13
+ from lamindb.base.types import FieldAttr, ListLike
15
14
 
16
15
  from .query_set import RecordList
16
+ from .record import Record
17
17
 
18
18
 
19
19
  # The base function for `from_values`
20
- def get_or_create_records(
20
+ def _from_values(
21
21
  iterable: ListLike,
22
- field: StrField,
22
+ field: FieldAttr,
23
23
  *,
24
24
  create: bool = False,
25
- from_source: bool = False,
26
25
  organism: Record | str | None = None,
27
26
  source: Record | None = None,
28
27
  mute: bool = False,
@@ -31,47 +30,44 @@ def get_or_create_records(
31
30
  from .query_set import RecordList
32
31
 
33
32
  registry = field.field.model # type: ignore
33
+ organism_record = _get_organism_record(field, organism, values=iterable)
34
+ # TODO: the create is problematic if field is not a name field
34
35
  if create:
35
- return RecordList([registry(**{field.field.name: value}) for value in iterable]) # type: ignore
36
- organism = _get_organism_record(field, organism)
36
+ create_kwargs = {}
37
+ if organism_record:
38
+ create_kwargs["organism"] = organism_record
39
+ return RecordList(
40
+ [
41
+ registry(**{field.field.name: value}, **create_kwargs)
42
+ for value in iterable
43
+ ]
44
+ ) # type: ignore
45
+
37
46
  iterable_idx = index_iterable(iterable)
38
47
 
39
48
  # returns existing records & non-existing values
40
49
  records, nonexist_values, msg = get_existing_records(
41
50
  iterable_idx=iterable_idx,
42
51
  field=field,
43
- organism=organism,
52
+ organism=organism_record,
44
53
  mute=mute,
45
54
  )
46
55
 
47
56
  # new records to be created based on new values
48
57
  if len(nonexist_values) > 0:
49
- source_record = None
50
- if from_source:
51
- if isinstance(source, Record):
52
- source_record = source
53
- if not source_record and hasattr(registry, "public"):
54
- if organism is None:
55
- organism = _ensembl_prefix(nonexist_values[0], field, organism)
56
- organism = _get_organism_record(field, organism, force=True)
57
-
58
- if source_record:
59
- from bionty.core._add_ontology import check_source_in_db
60
-
61
- check_source_in_db(registry=registry, source=source_record)
62
-
63
- from_source = not source_record.in_db
64
- elif hasattr(registry, "source_id"):
65
- from_source = True
66
- else:
67
- from_source = False
68
-
69
- if from_source:
58
+ if hasattr(registry, "source_id"):
59
+ # if can and needed, get organism record from the existing records
60
+ if (
61
+ organism_record is None
62
+ and len(records) > 0
63
+ and _is_organism_required(registry)
64
+ ):
65
+ organism_record = records[0].organism
70
66
  records_bionty, unmapped_values = create_records_from_source(
71
67
  iterable_idx=nonexist_values,
72
68
  field=field,
73
- organism=organism,
74
- source=source_record,
69
+ organism=organism_record,
70
+ source=source,
75
71
  msg=msg,
76
72
  mute=mute,
77
73
  )
@@ -84,15 +80,15 @@ def get_or_create_records(
84
80
  unmapped_values = nonexist_values
85
81
  # unmapped new_ids will NOT create records
86
82
  if len(unmapped_values) > 0:
83
+ # first log the success message
87
84
  if len(msg) > 0 and not mute:
88
85
  logger.success(msg)
89
86
  s = "" if len(unmapped_values) == 1 else "s"
90
87
  print_values = colors.yellow(_format_values(unmapped_values))
91
- name = registry.__name__
92
88
  n_nonval = colors.yellow(f"{len(unmapped_values)} non-validated")
93
89
  if not mute:
94
90
  logger.warning(
95
- f"{colors.red('did not create')} {name} record{s} for "
91
+ f"{colors.red('did not create')} {registry.__name__} record{s} for "
96
92
  f"{n_nonval} {colors.italic(f'{field.field.name}{s}')}: {print_values}" # type: ignore
97
93
  )
98
94
  return RecordList(records)
@@ -100,25 +96,21 @@ def get_or_create_records(
100
96
 
101
97
  def get_existing_records(
102
98
  iterable_idx: pd.Index,
103
- field: StrField,
99
+ field: FieldAttr,
104
100
  organism: Record | None = None,
105
101
  mute: bool = False,
106
- ):
102
+ ) -> tuple[list, pd.Index, str]:
103
+ """Get existing records from the database."""
107
104
  # NOTE: existing records matching is agnostic to the source
108
105
  model = field.field.model # type: ignore
109
- if organism is None and field.field.name == "ensembl_gene_id": # type: ignore
110
- if len(iterable_idx) > 0:
111
- organism = _ensembl_prefix(iterable_idx[0], field, organism) # type: ignore
112
- organism = _get_organism_record(field, organism, force=True)
113
106
 
114
- # standardize based on the DB reference
115
107
  # log synonyms mapped terms
116
108
  syn_mapper = model.standardize(
117
109
  iterable_idx,
118
110
  field=field,
119
111
  organism=organism,
120
112
  mute=True,
121
- public_aware=False,
113
+ source_aware=False, # standardize only based on the DB reference
122
114
  return_mapper=True,
123
115
  )
124
116
  iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index
@@ -137,7 +129,6 @@ def get_existing_records(
137
129
  is_validated = model.validate(
138
130
  iterable_idx, field=field, organism=organism, mute=True
139
131
  )
140
-
141
132
  if len(is_validated) > 0:
142
133
  validated = iterable_idx[is_validated]
143
134
  else:
@@ -151,7 +142,7 @@ def get_existing_records(
151
142
  msg = (
152
143
  "loaded"
153
144
  f" {colors.green(f'{len(validated)} {model.__name__} record{s}')}"
154
- f" matching {colors.italic(f'{field.field.name}')}: {print_values}" # type: ignore
145
+ f" matching {colors.italic(f'{field.field.name}')}: {print_values}"
155
146
  )
156
147
  if len(syn_mapper) > 0:
157
148
  s = "" if len(syn_mapper) == 1 else "s"
@@ -173,15 +164,13 @@ def get_existing_records(
173
164
  msg = ""
174
165
 
175
166
  # get all existing records in the db
176
- # if necessary, create records for the values in kwargs
177
- # k:v -> k:v_record
178
167
  query = {f"{field.field.name}__in": iterable_idx.values} # type: ignore
179
168
  if organism is not None:
180
169
  query["organism"] = organism
181
170
  records = model.filter(**query).list()
182
171
 
183
172
  if len(validated) == len(iterable_idx):
184
- return records, [], msg
173
+ return records, pd.Index([]), msg
185
174
  else:
186
175
  nonval_values = iterable_idx.difference(validated)
187
176
  return records, nonval_values, msg
@@ -189,12 +178,13 @@ def get_existing_records(
189
178
 
190
179
  def create_records_from_source(
191
180
  iterable_idx: pd.Index,
192
- field: StrField,
181
+ field: FieldAttr,
193
182
  organism: Record | None = None,
194
183
  source: Record | None = None,
195
184
  msg: str = "",
196
185
  mute: bool = False,
197
- ):
186
+ ) -> tuple[list, pd.Index]:
187
+ """Create records from source."""
198
188
  model = field.field.model # type: ignore
199
189
  records: list = []
200
190
  # populate additional fields from bionty
@@ -253,8 +243,11 @@ def create_records_from_source(
253
243
  df=bionty_df,
254
244
  )
255
245
 
256
- if hasattr(model, "organism_id") and organism is None:
257
- organism = _get_organism_record(field, source.organism, force=True)
246
+ # this here is needed when the organism is required to create new records
247
+ if organism is None:
248
+ organism = _get_organism_record(
249
+ field, source.organism, values=mapped_values
250
+ )
258
251
 
259
252
  create_kwargs = (
260
253
  {"organism": organism, "source": source}
@@ -292,6 +285,7 @@ def create_records_from_source(
292
285
 
293
286
 
294
287
  def index_iterable(iterable: Iterable) -> pd.Index:
288
+ """Get unique values from an iterable."""
295
289
  idx = pd.Index(iterable).unique()
296
290
  # No entries are made for NAs, '', None
297
291
  # returns an ordered unique not null list
@@ -301,6 +295,7 @@ def index_iterable(iterable: Iterable) -> pd.Index:
301
295
  def _format_values(
302
296
  names: Iterable, n: int = 20, quotes: bool = True, sep: str = "'"
303
297
  ) -> str:
298
+ """Format values for printing."""
304
299
  if isinstance(names, dict):
305
300
  items = {
306
301
  f"{key}: {value}": None
@@ -345,50 +340,90 @@ def _bulk_create_dicts_from_df(
345
340
  return df.reset_index().to_dict(orient="records"), multi_msg
346
341
 
347
342
 
348
- def _has_organism_field(registry: type[Record]) -> bool:
343
def _is_organism_required(registry: type[Record]) -> bool:
    """Tell whether `registry` declares a mandatory `organism` relation.

    Args:
        registry: The registry class whose `_meta` is inspected.

    Returns:
        True only when an `organism` field exists, is a relation, and is not
        nullable. False otherwise, including when the registry has no
        `organism` field at all.
    """
    try:
        organism_field = registry._meta.get_field("organism")
    except FieldDoesNotExist:
        # the registry has no organism field, so nothing can be required
        return False
    # a nullable field is optional; a non-relational field is not an organism FK
    return not (organism_field.null or not organism_field.is_relation)
354
358
 
355
359
 
360
def _is_simple_field_unique(field: FieldAttr) -> bool:
    """Report whether `field` is a unique, non-relational field.

    This module treats such a field as an "id field".

    Args:
        field: Field attribute; the underlying field object is read from
            its `.field` attribute.

    Returns:
        True if the underlying field is flagged `unique` and is not a
        relation, False otherwise.
    """
    underlying = field.field
    return bool(underlying.unique and not underlying.is_relation)
367
+
368
+
356
369
def _get_organism_record(  # type: ignore
    field: FieldAttr,
    organism: str | Record | None = None,
    values: Iterable | None = None,
    using_key: str | None = None,
) -> Record | None:
    """Resolve the organism record to use for the registry behind `field`.

    Args:
        field: Field attribute; `field.field.model` is the registry and
            `field.field.name` the field name.
        organism: Organism supplied by the caller, as a name or a record.
        values: Values being looked up. Only the first one is used, and only
            to infer the organism when `field` is `ensembl_gene_id` and no
            `organism` was passed. Must support `len()` and `[0]`.
            Defaults to None, which is treated like an empty collection.
        using_key: Database key forwarded to the Ensembl-based lookup.

    Returns:
        One of the following:

        - For `ensembl_gene_id` with at least one value and no `organism`:
          whatever `_organism_from_ensembl_id` returns for the first value.
        - Otherwise, if the registry requires an organism and either the
          field is not a unique non-relational field or `organism` was given:
          the result of calling `.save()` on the record obtained from
          `create_or_get_organism_record`.
        - None in every other case.
    """
    registry = field.field.model
    field_str = field.field.name
    # a unique non-relational field identifies a record on its own, so the
    # organism is only resolved for it when the caller passed one explicitly
    check = not _is_simple_field_unique(field=field) or organism is not None

    if (
        field_str == "ensembl_gene_id"
        and organism is None
        and values is not None
        and len(values) > 0  # type: ignore
    ):
        return _organism_from_ensembl_id(values[0], using_key)  # type: ignore

    if not (_is_organism_required(registry) and check):
        return None

    # NOTE(review): `using_key` only reaches the Ensembl branch above; this
    # path does not forward it -- confirm that is intended.
    from bionty._bionty import create_or_get_organism_record

    organism_record = create_or_get_organism_record(
        organism=organism, registry=registry, field=field_str
    )
    if organism_record is None:
        return None
    # presumably `save()` returns the saved record itself -- TODO confirm
    return organism_record.save()
385
403
 
386
404
 
387
- def _ensembl_prefix(id: str, field: StrField, organism: Record | None) -> str | None:
388
- if field.field.name == "ensembl_gene_id" and organism is None: # type: ignore
389
- if id.startswith("ENSG"):
390
- organism = "human" # type: ignore
391
- elif id.startswith("ENSMUSG"):
392
- organism = "mouse" # type: ignore
405
def _organism_from_ensembl_id(id: str, using_key: str | None) -> Record | None:  # type: ignore
    """Infer the organism record from the prefix of an Ensembl gene id.

    Args:
        id: An Ensembl gene id. Its prefix is what remains after removing
            every digit.
        using_key: Database key to query and save against; `"default"` is
            normalized to None.

    Returns:
        The matching `bt.Organism` record: an existing one if found by name,
        otherwise one built with `bt.Organism.from_source` and saved.
        None if `id` is not a string, if its prefix is not listed in the
        prefix table, or if `from_source` yields nothing.
    """
    # The caller forwards the first raw value, which may be a missing value
    # (None/NaN). There is no prefix to extract from a non-string, so bail
    # out before any asset is fetched.
    if not isinstance(id, str):
        return None

    import bionty as bt
    from bionty.base.dev._io import s3_bionty_assets

    # presumably fetches the prefix table and returns its local path -- TODO confirm
    localpath = s3_bionty_assets(
        ".lamindb/0QeqXlKq9aqW8aqe0000.parquet", bt.base.settings.versionsdir
    )
    ensembl_prefixes = pd.read_parquet(localpath).set_index("gene_prefix")

    # dropping every digit leaves the prefix, e.g. "ENSG00000139618" -> "ENSG"
    prefix = re.sub(r"\d+", "", id)
    if prefix not in ensembl_prefixes.index:
        # unknown prefix: explicitly report "no organism"
        return None

    organism_name = ensembl_prefixes.loc[prefix, "name"].lower()

    using_key = None if using_key == "default" else using_key

    organism_record = (
        bt.Organism.using(using_key).filter(name=organism_name).one_or_none()
    )
    if organism_record is None:
        # not in the database yet: build it from the source and persist it
        organism_record = bt.Organism.from_source(name=organism_name)
        if organism_record is not None:
            organism_record.save(using=using_key)

    return organism_record