lamindb 0.48a2__py3-none-any.whl → 0.48.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +15 -24
- lamindb/_context.py +5 -2
- lamindb/_dataset.py +6 -3
- lamindb/_delete.py +6 -6
- lamindb/_feature.py +61 -26
- lamindb/_feature_manager.py +176 -0
- lamindb/_feature_set.py +63 -27
- lamindb/_file.py +120 -76
- lamindb/_from_values.py +88 -28
- lamindb/_label.py +85 -0
- lamindb/_logger.py +1 -1
- lamindb/_manager.py +24 -17
- lamindb/_orm.py +157 -33
- lamindb/_queryset.py +37 -35
- lamindb/_save.py +19 -9
- lamindb/_transform.py +12 -3
- lamindb/_view.py +1 -1
- lamindb/dev/__init__.py +4 -0
- lamindb/dev/_settings.py +1 -1
- lamindb/dev/_view_parents.py +70 -34
- lamindb/dev/datasets/__init__.py +12 -0
- lamindb/dev/datasets/_core.py +116 -65
- lamindb/dev/storage/__init__.py +1 -5
- lamindb/dev/storage/_backed_access.py +505 -379
- lamindb/dev/storage/file.py +3 -1
- {lamindb-0.48a2.dist-info → lamindb-0.48.1.dist-info}/METADATA +10 -8
- lamindb-0.48.1.dist-info/RECORD +42 -0
- lamindb/_category.py +0 -42
- lamindb-0.48a2.dist-info/RECORD +0 -41
- {lamindb-0.48a2.dist-info → lamindb-0.48.1.dist-info}/LICENSE +0 -0
- {lamindb-0.48a2.dist-info → lamindb-0.48.1.dist-info}/WHEEL +0 -0
- {lamindb-0.48a2.dist-info → lamindb-0.48.1.dist-info}/entry_points.txt +0 -0
lamindb/_file.py
CHANGED
@@ -8,7 +8,7 @@ import pandas as pd
 from anndata import AnnData
 from appdirs import AppDirs
 from django.db.models.query_utils import DeferredAttribute as Field
-from …
+from lamin_utils import colors, logger
 from lamindb_setup import settings as setup_settings
 from lamindb_setup._init_instance import register_storage
 from lamindb_setup.dev import StorageSettings
@@ -17,6 +17,7 @@ from lnschema_core import Feature, FeatureSet, File, Run, ids
 from lnschema_core.types import AnnDataLike, DataLike, PathLike

 from lamindb._context import context
+from lamindb.dev import FeatureManager
 from lamindb.dev._settings import settings
 from lamindb.dev.hashing import b16_to_b64, hash_file
 from lamindb.dev.storage import (
@@ -27,24 +28,15 @@ from lamindb.dev.storage import (
     size_adata,
     write_to_file,
 )
+from lamindb.dev.storage._backed_access import AnnDataAccessor, BackedAccessor
 from lamindb.dev.storage.file import auto_storage_key_from_file, filepath_from_file
 from lamindb.dev.utils import attach_func_to_class_method

 from . import _TESTING
+from ._feature import convert_numpy_dtype_to_lamin_feature_type
 from .dev._view_parents import view_lineage
 from .dev.storage.file import AUTO_KEY_PREFIX

-try:
-    from lamindb.dev.storage._backed_access import AnnDataAccessor, BackedAccessor
-except ImportError:
-
-    class AnnDataAccessor:  # type: ignore
-        pass
-
-    class BackedAccessor:  # type: ignore
-        pass
-
-
 DIRS = AppDirs("lamindb", "laminlabs")


@@ -362,6 +354,19 @@ def data_is_anndata(data: DataLike):
     return False


+def data_is_mudata(data: DataLike):
+    try:
+        from mudata import MuData
+    except ModuleNotFoundError:
+        return False
+
+    if isinstance(data, MuData):
+        return True
+    if isinstance(data, (str, Path, UPath)):
+        return Path(data).suffix in {".h5mu"}
+    return False
+
+
 def __init__(file: File, *args, **kwargs):
     # Below checks for the Django-internal call in from_db()
     # it'd be better if we could avoid this, but not being able to create a File
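For orientation, a minimal standalone sketch of the suffix-based detection that data_is_mudata() introduces (it mirrors the hunk above; assumes the optional mudata dependency may be absent):

from pathlib import Path

def looks_like_mudata(data) -> bool:
    # a MuData object, or a path ending in ".h5mu", counts as MuData
    try:
        from mudata import MuData
    except ModuleNotFoundError:
        return False  # without mudata installed, nothing is treated as MuData
    if isinstance(data, MuData):
        return True
    if isinstance(data, (str, Path)):
        return Path(data).suffix == ".h5mu"
    return False

looks_like_mudata("pbmc.h5mu")  # True if mudata is importable
looks_like_mudata("pbmc.h5ad")  # False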
@@ -383,9 +388,6 @@ def __init__(file: File, *args, **kwargs):
     description: Optional[str] = (
         kwargs.pop("description") if "description" in kwargs else None
     )
-    feature_sets: Optional[List[FeatureSet]] = (
-        kwargs.pop("feature_sets") if "feature_sets" in kwargs else None
-    )
     name: Optional[str] = kwargs.pop("name") if "name" in kwargs else None
     format = kwargs.pop("format") if "format" in kwargs else None
     log_hint = kwargs.pop("log_hint") if "log_hint" in kwargs else True
@@ -394,9 +396,7 @@ def __init__(file: File, *args, **kwargs):
     )

     if not len(kwargs) == 0:
-        raise ValueError(
-            "Only data, key, run, description & feature_sets can be passed."
-        )
+        raise ValueError("Only data, key, run, description can be passed.")

     if name is not None and description is not None:
         raise ValueError("Only pass description, do not pass a name")
@@ -404,21 +404,8 @@ def __init__(file: File, *args, **kwargs):
         logger.warning("Argument `name` is deprecated, please use `description`")
         description = name

-    if feature_sets is None:
-        feature_sets = []
-    if isinstance(data, pd.DataFrame) and log_hint:
-        logger.hint(
-            "This is a dataframe, consider using File.from_df() to link column"
-            " names as features!"
-        )
-    elif data_is_anndata(data) and log_hint:
-        logger.hint(
-            "This is AnnDataLike, consider using File.from_anndata() to link var"
-            " and obs.columns as features!"
-        )
-
     provisional_id = ids.base62_20()
-…
+    kwargs_or_file, privates = get_file_kwargs_from_data(
         data=data,
         key=key,
         run=run,
@@ -426,17 +413,38 @@ def __init__(file: File, *args, **kwargs):
         provisional_id=provisional_id,
         skip_check_exists=skip_check_exists,
     )
+
     # an object with the same hash already exists
-    if isinstance(…
+    if isinstance(kwargs_or_file, File):
         # this is the way Django instantiates from the DB internally
         # https://github.com/django/django/blob/549d6ffeb6d626b023acc40c3bb2093b4b25b3d6/django/db/models/base.py#LL488C1-L491C51
         new_args = [
-            getattr(…
+            getattr(kwargs_or_file, field.attname)
+            for field in file._meta.concrete_fields
         ]
         super(File, file).__init__(*new_args)
         file._state.adding = False
         file._state.db = "default"
         return None
+    else:
+        kwargs = kwargs_or_file
+
+    if isinstance(data, pd.DataFrame):
+        if log_hint:
+            logger.hint(
+                "This is a dataframe, consider using File.from_df() to link column"
+                " names as features!"
+            )
+        kwargs["accessor"] = "DataFrame"
+    elif data_is_anndata(data):
+        if log_hint:
+            logger.hint(
+                "This is AnnDataLike, consider using File.from_anndata() to link"
+                " var_names and obs.columns as features!"
+            )
+        kwargs["accessor"] = "AnnData"
+    elif data_is_mudata(data):
+        kwargs["accessor"] = "MuData"

     kwargs["id"] = provisional_id
     kwargs["description"] = description
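A minimal usage sketch of the reworked constructor, assuming an initialized lamindb instance (data and description are illustrative):

import lamindb as ln
import pandas as pd

df = pd.DataFrame({"cell_type": ["T cell", "B cell"]})
# logs a hint to prefer File.from_df() and records accessor="DataFrame"
file = ln.File(df, description="toy table")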
@@ -465,9 +473,6 @@ def __init__(file: File, *args, **kwargs):
         file._cloud_filepath = privates["cloud_filepath"]
         file._memory_rep = privates["memory_rep"]
         file._to_store = not privates["check_path_in_storage"]
-        file._feature_sets = (
-            feature_sets if isinstance(feature_sets, list) else [feature_sets]
-        )

     super(File, file).__init__(**kwargs)

@@ -484,9 +489,8 @@ def from_df(
 ) -> "File":
     """{}"""
     file = File(data=df, key=key, run=run, description=description, log_hint=False)
-…
-…
-    file._feature_sets = [feature_set]
+    feature_set = FeatureSet.from_df(df)
+    file._feature_sets = {"columns": feature_set}
     return file


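As the hunk above shows, File.from_df() now builds the feature set itself and registers it under the "columns" slot; a usage sketch (assumes an initialized instance):

import lamindb as ln
import pandas as pd

df = pd.DataFrame({"height": [1.2, 3.4], "weight": [5.6, 7.8]})
file = ln.File.from_df(df, description="measurements")
file.save()  # persists the file together with its "columns" feature set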
@@ -512,9 +516,25 @@ def from_anndata(
         data_parse = backed_access(filepath)
     else:
         data_parse = ad.read(filepath, backed="r")
-…
-…
-…
+        type = "float"
+    else:
+        type = convert_numpy_dtype_to_lamin_feature_type(adata.X.dtype)
+    feature_sets = {}
+    logger.info("Parsing feature names of X, stored in slot .var")
+    logger.indent = " "
+    feature_set_x = FeatureSet.from_values(
+        data_parse.var.index,
+        var_ref,
+        type=type,
+    )
+    feature_sets["var"] = feature_set_x
+    logger.indent = ""
+    if len(data_parse.obs.columns) > 0:
+        logger.info("Parsing feature names of slot .obs")
+        logger.indent = " "
+        feature_set_obs = FeatureSet.from_df(data_parse.obs)
+        feature_sets["obs"] = feature_set_obs
+        logger.indent = ""
     file._feature_sets = feature_sets
     return file

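A hedged sketch of the new two-slot parsing in File.from_anndata(), which now stores one feature set for .var and one for .obs; it assumes an initialized instance with lnschema_bionty installed (the var_ref parameter name is taken from the call above, the example data is illustrative):

import anndata as ad
import lamindb as ln
import lnschema_bionty as lb
import numpy as np
import pandas as pd

adata = ad.AnnData(
    X=np.ones((2, 2), dtype="float32"),
    var=pd.DataFrame(index=["ENSG00000139618", "ENSG00000141510"]),
    obs=pd.DataFrame({"cell_type": ["T cell", "B cell"]}),
)
file = ln.File.from_anndata(adata, var_ref=lb.Gene.ensembl_gene_id)
file.save()  # links feature sets under slots "var" and "obs"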
@@ -526,19 +546,13 @@ def from_dir(
     path: PathLike,
     *,
     run: Optional[Run] = None,
+    storage_root: Optional[PathLike] = None,
 ) -> List["File"]:
     """{}"""
     folderpath = UPath(path)
-…
-…
-…
-        folder_key = get_relative_path_to_root(path=folderpath).as_posix()
-    else:
-        raise RuntimeError(
-            "Currently, only directories in default storage can be registered!\n"
-            "You can either move your folder into the current default storage"
-            "or add a new default storage through `ln.settings.storage`"
-        )
+    folder_key = get_relative_path_to_root(
+        path=folderpath, root=storage_root
+    ).as_posix()
     # always sanitize by stripping a trailing slash
     folder_key = folder_key.rstrip("/")
     logger.hint(f"using storage prefix = {folder_key}/")
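The new storage_root parameter replaces the old hard failure for directories outside the default storage; usage sketch (paths illustrative, assumes an initialized instance):

import lamindb as ln

# keys are derived from each file's path relative to the storage root
# (the default storage root when storage_root is not passed)
files = ln.File.from_dir("./instrument_output/2023-07-20")
ln.save(files)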
@@ -617,17 +631,18 @@ def backed(
         " one of the following suffixes for the object name:"
         f" {', '.join(suffixes)}."
     )
-    _track_run_input(self, is_run_input)
-    # consider the case where an object is already locally cached
-    local_path = setup_settings.instance.storage.cloud_to_local_no_update(
-        filepath_from_file(self)
-    )
-    if local_path.exists() and self.suffix == ".h5ad":
-        return ad.read_h5ad(local_path, backed="r")

     from lamindb.dev.storage._backed_access import backed_access

-…
+    _track_run_input(self, is_run_input)
+
+    filepath = filepath_from_file(self)
+    # consider the case where an object is already locally cached
+    localpath = setup_settings.instance.storage.cloud_to_local_no_update(filepath)
+    if localpath.exists():
+        return backed_access(localpath)
+    else:
+        return backed_access(filepath)


 def _track_run_input(file: File, is_run_input: Optional[bool] = None):
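backed() now routes both the locally cached copy and the remote path through backed_access(), so objects are handled uniformly instead of special-casing cached .h5ad files; sketch (assumes a saved file with a supported suffix):

import lamindb as ln

file = ln.File.select(suffix=".h5ad").first()
access = file.backed()  # AnnDataAccessor (or BackedAccessor for .zarr)
access.obs              # reads metadata without loading X into memory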
@@ -638,9 +653,14 @@ def _track_run_input(file: File, is_run_input: Optional[bool] = None):
     # avoid cycles (a file is both input and output)
     if file.run != context.run:
         if settings.track_run_inputs:
+            transform_note = ""
+            if file.transform is not None:
+                transform_note = (
+                    f", adding parent transform {file.transform.id}"
+                )
             logger.info(
-                f"Adding file {file.id} as input for run…
-                f"…
+                f"Adding file {file.id} as input for run"
+                f" {context.run.id}{transform_note}"
             )
             track_run_input = True
         else:
@@ -659,7 +679,7 @@ def _track_run_input(file: File, is_run_input: Optional[bool] = None):
     if context.run is None:
         raise ValueError(
             "No global run context set. Call ln.context.track() or link input to a"
-            " run object via `run.…
+            " run object via `run.input_files.append(file)`"
         )
     # avoid adding the same run twice
     # avoid cycles (a file is both input and output)
@@ -671,6 +691,8 @@ def _track_run_input(file: File, is_run_input: Optional[bool] = None):

 def load(self, is_run_input: Optional[bool] = None, stream: bool = False) -> DataLike:
     _track_run_input(self, is_run_input)
+    if hasattr(self, "_memory_rep") and self._memory_rep is not None:
+        return self._memory_rep
     return load_to_memory(filepath_from_file(self), stream=stream)


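load() now short-circuits to the in-memory object a File was constructed from instead of re-reading it from storage; sketch (assumes an initialized instance):

import lamindb as ln
import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
file = ln.File(df, description="in-memory example")
file.save()
file.load()  # returns the retained _memory_rep (df itself), skipping storage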
@@ -718,14 +740,21 @@ def _save_skip_storage(file, *args, **kwargs) -> None:
     if file.run is not None:
         file.run.save()
     if hasattr(file, "_feature_sets"):
-        for feature_set in file._feature_sets:
+        for feature_set in file._feature_sets.values():
             feature_set.save()
-    if hasattr(file, "_feature_values"):
-        for feature_value in file._feature_values:
-            feature_value.save()
     super(File, file).save(*args, **kwargs)
     if hasattr(file, "_feature_sets"):
-…
+        links = []
+        for slot, feature_set in file._feature_sets.items():
+            links.append(
+                File.feature_sets.through(
+                    file_id=file.id, feature_set_id=feature_set.id, slot=slot
+                )
+            )
+
+        from lamindb._save import bulk_create
+
+        bulk_create(links)


 def path(self) -> Union[Path, UPath]:
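Because the File–FeatureSet links are created through Django's many-to-many "through" model, each link row now carries the slot it was parsed from; an illustrative query against that table (assumes a saved file with linked feature sets):

import lamindb as ln

file = ln.File.select(suffix=".h5ad").first()
# the through table is a plain Django model, so slots can be queried directly
slots = ln.File.feature_sets.through.objects.filter(file_id=file.id).values_list(
    "slot", flat=True
)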
@@ -801,13 +830,13 @@ def inherit_relations(self, file: File, fields: Optional[List[str]] = None):
     >>> file1.save()
     >>> file2 = ln.File(pd.DataFrame(index=[2,3]))
     >>> file2.save()
-    >>> ln.save(ln.…
-    >>> …
-    >>> file1.…
-    >>> file2.inherit_relations(file1, ["…
-    💬 Inheriting 1 field: ['…
-    >>> file2.…
-    ['…
+    >>> ln.save(ln.Label.from_values(["Label1", "Label2", "Label3"], field="name"))
+    >>> labels = ln.Label.select(name__icontains = "label").all()
+    >>> file1.labels.set(labels)
+    >>> file2.inherit_relations(file1, ["labels"])
+    💬 Inheriting 1 field: ['labels']
+    >>> file2.labels.list("name")
+    ['Label1', 'Label2', 'Label3']
     """
     if fields is None:
         # fields in the model definition
@@ -822,6 +851,9 @@
     else:
         raise KeyError(f"No many-to-many relationship is found with '{field}'")

+    if None in related_names:
+        related_names.remove(None)
+
     inherit_names = [
         related_name
         for related_name in related_names
@@ -836,6 +868,15 @@
     )


+@property  # type: ignore
+@doc_args(File.features.__doc__)
+def features(self) -> "FeatureManager":
+    """{}"""
+    from lamindb._feature_manager import FeatureManager
+
+    return FeatureManager(self)
+
+
 METHOD_NAMES = [
     "__init__",
     "from_anndata",
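The new features property wires up the FeatureManager added in lamindb/_feature_manager.py; usage sketch (assumes a saved file with linked feature sets):

import lamindb as ln

file = ln.File.select(suffix=".h5ad").first()
file.features  # FeatureManager: summarizes linked feature sets by slot ("var", "obs", ...)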
@@ -866,5 +907,8 @@ for name in METHOD_NAMES:
 # privates currently dealt with separately
 File._delete_skip_storage = _delete_skip_storage
 File._save_skip_storage = _save_skip_storage
+# TODO: move these to METHOD_NAMES
 setattr(File, "view_lineage", view_lineage)
 setattr(File, "inherit_relations", inherit_relations)
+# property signature is not tested:
+setattr(File, "features", features)
lamindb/_from_values.py
CHANGED
@@ -1,10 +1,11 @@
-from typing import Any, Dict, Iterable, List, Tuple, Union
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

 import pandas as pd
 from django.core.exceptions import FieldDoesNotExist
+from django.db.models import Case, When
 from django.db.models.query_utils import DeferredAttribute as Field
-from …
-from lnschema_core.models import ORM
+from lamin_utils import colors, logger
+from lnschema_core.models import ORM, Feature
 from lnschema_core.types import ListLike

 from .dev._settings import settings
@@ -17,15 +18,26 @@ def get_or_create_records(
     *,
     from_bionty: bool = False,
     **kwargs,
-) -> List:
+) -> List[ORM]:
     """Get or create records from iterables."""
     upon_create_search_names = settings.upon_create_search_names
     settings.upon_create_search_names = False
+    feature: Feature = None
+    if "feature" in kwargs:
+        feature = kwargs.pop("feature")
+        kwargs["feature_id"] = feature.id
+    types: Optional[Dict] = None
+    if "types" in kwargs:
+        types = kwargs.pop("types")
     try:
         field_name = field.field.name
-…
+        ORM = field.field.model
         iterable_idx = index_iterable(iterable)

+        if isinstance(ORM, Feature):
+            if types is None:
+                raise ValueError("Please pass types as {} or use FeatureSet.from_df()")
+
         # returns existing records & non-existing values
         records, nonexist_values = get_existing_records(
             iterable_idx=iterable_idx, field=field, kwargs=kwargs
@@ -43,15 +55,42 @@
         # unmapped new_ids will only create records with field and kwargs
         if len(unmapped_values) > 0:
             for value in unmapped_values:
-…
+                params = {field_name: value}
+                if types is not None:
+                    params["type"] = str(types[value])
+                records.append(ORM(**params, **kwargs))
             s = "" if len(unmapped_values) == 1 else "s"
-            print_unmapped_values = ", ".join(unmapped_values[:…
-            if len(unmapped_values) >…
+            print_unmapped_values = ", ".join(unmapped_values[:10])
+            if len(unmapped_values) > 10:
                 print_unmapped_values += ", ..."
+            additional_info = " "
+            if feature is not None:
+                additional_info = f" Feature {feature.name} and "
             logger.warning(
-                f"Created {colors.yellow(f'{len(unmapped_values)} {…
-                f"…
+                f"Created {colors.yellow(f'{len(unmapped_values)} {ORM.__name__} record{s}')} for{additional_info}"  # noqa
+                f"{colors.yellow(f'{field_name}{s}')}: {print_unmapped_values}"  # noqa
             )
+        if ORM.__module__.startswith("lnschema_bionty."):
+            if isinstance(iterable, pd.Series):
+                feature = iterable.name
+            else:
+                logger.warning(
+                    "Did not receive values as pd.Series, inferring feature from"
+                    f" reference ORM: {ORM.__name__}"
+                )
+                feature = ORM.__name__.lower()
+            if isinstance(feature, str):
+                feature_name = feature
+                feature = Feature.select(name=feature).one_or_none()
+            elif feature is not None:
+                feature_name = feature.name
+            if feature is not None:
+                for record in records:
+                    record._feature = feature
+            if feature_name is not None:
+                for record in records:
+                    record._feature = feature_name
+                logger.info(f"Mapping records to feature '{feature_name}'")
         return records
     finally:
         settings.upon_create_search_names = upon_create_search_names
@@ -80,10 +119,14 @@ def get_existing_records(iterable_idx: pd.Index, field: Field, kwargs: Dict = {}
     syn_msg = ""
     if len(syn_mapper) > 0:
         s = "" if len(syn_mapper) == 1 else "s"
+        names = list(syn_mapper.keys())
+        print_values = ", ".join(names[:10])
+        if len(names) > 10:
+            print_values += ", ..."
         syn_msg = (
             "Loaded"
             f" {colors.green(f'{len(syn_mapper)} {model.__name__} record{s}')} that"  # noqa
-            f" matched {colors.green('synonyms')}"
+            f" matched {colors.green('synonyms')}: {print_values}"
         )
         iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index

@@ -95,22 +138,37 @@

     from ._select import select

-…
+    query_set = select(model, **condition)
+
+    # new we have to sort the list of queried records
+    preserved = Case(
+        *[
+            When(**{field_name: value}, then=pos)
+            for pos, value in enumerate(iterable_idx)
+        ]
+    )
+    records = query_set.order_by(preserved).list()

-    records = stmt.list()  # existing records
     n_name = len(records) - len(syn_mapper)
+    names = [getattr(record, field_name) for record in records]
+    names = [name for name in names if name not in syn_mapper.values()]
     if n_name > 0:
         s = "" if n_name == 1 else "s"
+        print_values = ", ".join(names[:10])
+        if len(names) > 10:
+            print_values += ", ..."
         logger.info(
             "Loaded"
             f" {colors.green(f'{n_name} {model.__name__} record{s}')} that"
-            f" matched …
+            f" matched {colors.green(f'{field_name}')}: {print_values}"
         )
         # make sure that synonyms logging appears after the field logging
         if len(syn_msg) > 0:
             logger.info(syn_msg)

-    existing_values = iterable_idx.intersection(…
+    existing_values = iterable_idx.intersection(
+        query_set.values_list(field_name, flat=True)
+    )
     nonexist_values = iterable_idx.difference(existing_values)

     return records, nonexist_values
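The Case/When annotation above is the standard Django idiom for returning rows in the caller's requested order rather than table order; schematically (MyORM is a placeholder model):

from django.db.models import Case, When

requested = ["TSPAN6", "TNMD", "DPM1"]
# each matching row is annotated with its position in `requested`,
# then the queryset is sorted by that position
preserved = Case(*[When(name=value, then=pos) for pos, value in enumerate(requested)])
records = MyORM.objects.filter(name__in=requested).order_by(preserved)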
@@ -144,10 +202,14 @@ def create_records_from_bionty(
     msg_syn: str = ""
     if len(syn_mapper) > 0:
         s = "" if len(syn_mapper) == 1 else "s"
+        names = list(syn_mapper.keys())
+        print_values = ", ".join(names[:10])
+        if len(names) > 10:
+            print_values += ", ..."
         msg_syn = (
-            "…
+            "Loaded"
             f" {colors.purple(f'{len(syn_mapper)} {model.__name__} record{s} from Bionty')} that"  # noqa
-            f" matched {colors.purple('synonyms')}"
+            f" matched {colors.purple('synonyms')}: {print_values}"
         )

     iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index
@@ -162,26 +224,24 @@
     for bk in bionty_kwargs:
         records.append(model(**bk, **kwargs))

-    # logging of BiontySource linking
-    source_msg = (
-        ""
-        if kwargs.get("bionty_source") is None
-        else f" (bionty_source_id={kwargs.get('bionty_source').id})"  # type:ignore # noqa
-    )
-
     # number of records that matches field (not synonyms)
     n_name = len(records) - len(syn_mapper)
+    names = [getattr(record, field_name) for record in records]
+    names = [name for name in names if name not in syn_mapper.values()]
     if n_name > 0:
         s = "" if n_name == 1 else "s"
+        print_values = ", ".join(names[:10])
+        if len(names) > 10:
+            print_values += ", ..."
         msg = (
-            "…
+            "Loaded"
             f" {colors.purple(f'{n_name} {model.__name__} record{s} from Bionty')} that"  # noqa
-            f" matched {colors.purple(f'{field_name}')}…
+            f" matched {colors.purple(f'{field_name}')}: {print_values}"
         )
-        logger.info(msg…
+        logger.info(msg)
     # make sure that synonyms logging appears after the field logging
     if len(msg_syn) > 0:
-        logger.info(msg_syn…
+        logger.info(msg_syn)
     # warning about multi matches
     if len(multi_msg) > 0:
         logger.warning(multi_msg)
lamindb/_label.py
ADDED
@@ -0,0 +1,85 @@
+from typing import List, Optional, Union
+
+import pandas as pd
+from lamin_utils import logger
+from lamindb_setup.dev._docs import doc_args
+from lnschema_core import Feature, Label
+from lnschema_core.types import ListLike
+
+from lamindb.dev.utils import attach_func_to_class_method
+
+from . import _TESTING
+from ._from_values import get_or_create_records, index_iterable
+
+
+def __init__(self, *args, **kwargs):
+    if len(args) == len(self._meta.concrete_fields):
+        super(Label, self).__init__(*args, **kwargs)
+        return None
+    # now we proceed with the user-facing constructor
+    if len(args) > 0:
+        raise ValueError("Only one non-keyword arg allowed")
+    name: Optional[str] = kwargs.pop("name") if "name" in kwargs else None
+    description: Optional[str] = (
+        kwargs.pop("description") if "description" in kwargs else None
+    )
+    feature: Optional[str] = kwargs.pop("feature") if "feature" in kwargs else None
+    feature_id: Optional[str] = (
+        kwargs.pop("feature_id") if "feature_id" in kwargs else None
+    )
+    if len(kwargs) > 0:
+        raise ValueError("Only name, description, feature are valid keyword arguments")
+    # continue
+    if feature is None and feature_id is None:
+        logger.warning("Consider passing a corresponding feature for your label!")
+    if isinstance(feature, str):
+        feature = Feature.select(name=feature).one_or_none()
+        if feature is None:
+            raise ValueError(
+                f"Feature with name {feature} does not exist, please create it:"
+                f" ln.Feature(name={feature}, type='float')"
+            )
+        else:
+            feature_id = feature.id
+    super(Label, self).__init__(
+        name=name, description=description, feature_id=feature_id
+    )
+
+
+@classmethod  # type:ignore
+@doc_args(Label.from_values.__doc__)
+def from_values(
+    cls, values: ListLike, feature: Optional[Union[Feature, str]] = None, **kwargs
+) -> List["Label"]:
+    """{}"""
+    iterable_idx = index_iterable(values)
+    if feature is None and isinstance(values, pd.Series):
+        feature = values.name
+    if isinstance(feature, str):
+        feature = Feature.select(name=feature).one()
+    records = get_or_create_records(
+        iterable=iterable_idx,
+        field=Label.name,
+        # here, feature_id is a kwarg, which is an additional condition
+        # in queries for potentially existing records
+        feature=feature,
+    )
+    return records
+
+
+METHOD_NAMES = [
+    "__init__",
+    "from_values",
+]
+
+if _TESTING:
+    from inspect import signature
+
+    SIGS = {
+        name: signature(getattr(Label, name))
+        for name in METHOD_NAMES
+        if name != "__init__"
+    }
+
+for name in METHOD_NAMES:
+    attach_func_to_class_method(name, Label, globals())
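A usage sketch for the new Label registry (assumes an initialized instance and an existing Feature named "cell_type"):

import lamindb as ln

labels = ln.Label.from_values(["T cell", "B cell"], feature="cell_type")
ln.save(labels)  # creates missing labels, re-uses existing ones for the feature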
lamindb/_logger.py
CHANGED
@@ -1 +1 @@
-from …
+from lamin_utils import colors, logger  # noqa
|