lamindb 0.76.8__py3-none-any.whl → 0.76.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +114 -113
- lamindb/_artifact.py +1206 -1205
- lamindb/_can_validate.py +621 -579
- lamindb/_collection.py +390 -387
- lamindb/_curate.py +1603 -1601
- lamindb/_feature.py +155 -155
- lamindb/_feature_set.py +244 -242
- lamindb/_filter.py +23 -23
- lamindb/_finish.py +250 -256
- lamindb/_from_values.py +403 -382
- lamindb/_is_versioned.py +40 -40
- lamindb/_parents.py +476 -476
- lamindb/_query_manager.py +125 -125
- lamindb/_query_set.py +364 -362
- lamindb/_record.py +668 -649
- lamindb/_run.py +60 -57
- lamindb/_save.py +310 -308
- lamindb/_storage.py +14 -14
- lamindb/_transform.py +130 -127
- lamindb/_ulabel.py +56 -56
- lamindb/_utils.py +9 -9
- lamindb/_view.py +72 -72
- lamindb/core/__init__.py +94 -94
- lamindb/core/_context.py +590 -574
- lamindb/core/_data.py +510 -438
- lamindb/core/_django.py +209 -0
- lamindb/core/_feature_manager.py +994 -867
- lamindb/core/_label_manager.py +289 -253
- lamindb/core/_mapped_collection.py +631 -597
- lamindb/core/_settings.py +188 -187
- lamindb/core/_sync_git.py +138 -138
- lamindb/core/_track_environment.py +27 -27
- lamindb/core/datasets/__init__.py +59 -59
- lamindb/core/datasets/_core.py +581 -571
- lamindb/core/datasets/_fake.py +36 -36
- lamindb/core/exceptions.py +90 -90
- lamindb/core/fields.py +12 -12
- lamindb/core/loaders.py +164 -164
- lamindb/core/schema.py +56 -56
- lamindb/core/storage/__init__.py +25 -25
- lamindb/core/storage/_anndata_accessor.py +741 -740
- lamindb/core/storage/_anndata_sizes.py +41 -41
- lamindb/core/storage/_backed_access.py +98 -98
- lamindb/core/storage/_tiledbsoma.py +204 -204
- lamindb/core/storage/_valid_suffixes.py +21 -21
- lamindb/core/storage/_zarr.py +110 -110
- lamindb/core/storage/objects.py +62 -62
- lamindb/core/storage/paths.py +172 -172
- lamindb/core/subsettings/__init__.py +12 -12
- lamindb/core/subsettings/_creation_settings.py +38 -38
- lamindb/core/subsettings/_transform_settings.py +21 -21
- lamindb/core/types.py +19 -19
- lamindb/core/versioning.py +146 -158
- lamindb/integrations/__init__.py +12 -12
- lamindb/integrations/_vitessce.py +107 -107
- lamindb/setup/__init__.py +14 -14
- lamindb/setup/core/__init__.py +4 -4
- {lamindb-0.76.8.dist-info → lamindb-0.76.10.dist-info}/LICENSE +201 -201
- {lamindb-0.76.8.dist-info → lamindb-0.76.10.dist-info}/METADATA +8 -8
- lamindb-0.76.10.dist-info/RECORD +61 -0
- {lamindb-0.76.8.dist-info → lamindb-0.76.10.dist-info}/WHEEL +1 -1
- lamindb-0.76.8.dist-info/RECORD +0 -60
lamindb/core/_feature_manager.py
CHANGED
@@ -1,867 +1,994 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
from collections import defaultdict
|
4
|
-
from collections.abc import Iterable
|
5
|
-
from itertools import compress
|
6
|
-
from typing import TYPE_CHECKING, Any
|
7
|
-
|
8
|
-
import anndata as ad
|
9
|
-
import numpy as np
|
10
|
-
import pandas as pd
|
11
|
-
from anndata import AnnData
|
12
|
-
from django.contrib.postgres.aggregates import ArrayAgg
|
13
|
-
from django.db import connections
|
14
|
-
from django.db.models import Aggregate
|
15
|
-
from lamin_utils import colors, logger
|
16
|
-
from lamindb_setup.core.upath import create_path
|
17
|
-
from lnschema_core.models import (
|
18
|
-
Artifact,
|
19
|
-
Collection,
|
20
|
-
Feature,
|
21
|
-
FeatureManager,
|
22
|
-
FeatureValue,
|
23
|
-
LinkORM,
|
24
|
-
Param,
|
25
|
-
ParamManager,
|
26
|
-
ParamManagerArtifact,
|
27
|
-
ParamManagerRun,
|
28
|
-
ParamValue,
|
29
|
-
Record,
|
30
|
-
Run,
|
31
|
-
ULabel,
|
32
|
-
)
|
33
|
-
|
34
|
-
from lamindb._feature import FEATURE_TYPES, convert_numpy_dtype_to_lamin_feature_type
|
35
|
-
from lamindb._feature_set import DICT_KEYS_TYPE, FeatureSet
|
36
|
-
from lamindb._record import (
|
37
|
-
REGISTRY_UNIQUE_FIELD,
|
38
|
-
get_name_field,
|
39
|
-
transfer_fk_to_default_db_bulk,
|
40
|
-
transfer_to_default_db,
|
41
|
-
)
|
42
|
-
from lamindb._save import save
|
43
|
-
from lamindb.core.exceptions import ValidationError
|
44
|
-
from lamindb.core.storage import LocalPathClasses
|
45
|
-
|
46
|
-
from .
|
47
|
-
from .
|
48
|
-
from .
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
dictionary["
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
if host
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
.
|
90
|
-
.
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
.
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
#
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
.
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
if
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
if
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
)
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
def
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
)
|
480
|
-
|
481
|
-
registry
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
)
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
if
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
|
569
|
-
|
570
|
-
|
571
|
-
|
572
|
-
|
573
|
-
|
574
|
-
|
575
|
-
|
576
|
-
|
577
|
-
|
578
|
-
|
579
|
-
|
580
|
-
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
|
585
|
-
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
|
590
|
-
|
591
|
-
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
|
613
|
-
|
614
|
-
|
615
|
-
|
616
|
-
|
617
|
-
|
618
|
-
|
619
|
-
|
620
|
-
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
)
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
|
677
|
-
|
678
|
-
|
679
|
-
|
680
|
-
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
|
688
|
-
|
689
|
-
):
|
690
|
-
|
691
|
-
|
692
|
-
|
693
|
-
|
694
|
-
|
695
|
-
|
696
|
-
|
697
|
-
|
698
|
-
|
699
|
-
|
700
|
-
|
701
|
-
|
702
|
-
|
703
|
-
|
704
|
-
|
705
|
-
|
706
|
-
|
707
|
-
|
708
|
-
|
709
|
-
|
710
|
-
|
711
|
-
|
712
|
-
|
713
|
-
|
714
|
-
|
715
|
-
|
716
|
-
|
717
|
-
|
718
|
-
|
719
|
-
|
720
|
-
)
|
721
|
-
|
722
|
-
|
723
|
-
|
724
|
-
|
725
|
-
|
726
|
-
|
727
|
-
|
728
|
-
|
729
|
-
|
730
|
-
|
731
|
-
|
732
|
-
|
733
|
-
|
734
|
-
|
735
|
-
|
736
|
-
|
737
|
-
|
738
|
-
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
|
745
|
-
|
746
|
-
|
747
|
-
|
748
|
-
|
749
|
-
|
750
|
-
|
751
|
-
|
752
|
-
|
753
|
-
|
754
|
-
|
755
|
-
|
756
|
-
|
757
|
-
|
758
|
-
|
759
|
-
|
760
|
-
|
761
|
-
|
762
|
-
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
|
767
|
-
|
768
|
-
|
769
|
-
|
770
|
-
|
771
|
-
|
772
|
-
|
773
|
-
|
774
|
-
|
775
|
-
|
776
|
-
|
777
|
-
|
778
|
-
|
779
|
-
|
780
|
-
|
781
|
-
|
782
|
-
|
783
|
-
|
784
|
-
|
785
|
-
|
786
|
-
|
787
|
-
|
788
|
-
|
789
|
-
|
790
|
-
|
791
|
-
|
792
|
-
|
793
|
-
|
794
|
-
|
795
|
-
|
796
|
-
|
797
|
-
|
798
|
-
|
799
|
-
|
800
|
-
|
801
|
-
|
802
|
-
|
803
|
-
|
804
|
-
|
805
|
-
|
806
|
-
|
807
|
-
|
808
|
-
|
809
|
-
|
810
|
-
|
811
|
-
|
812
|
-
|
813
|
-
|
814
|
-
|
815
|
-
|
816
|
-
|
817
|
-
|
818
|
-
|
819
|
-
|
820
|
-
|
821
|
-
|
822
|
-
|
823
|
-
|
824
|
-
|
825
|
-
|
826
|
-
|
827
|
-
|
828
|
-
|
829
|
-
|
830
|
-
|
831
|
-
|
832
|
-
|
833
|
-
|
834
|
-
|
835
|
-
|
836
|
-
|
837
|
-
|
838
|
-
|
839
|
-
|
840
|
-
|
841
|
-
|
842
|
-
|
843
|
-
|
844
|
-
|
845
|
-
|
846
|
-
|
847
|
-
|
848
|
-
|
849
|
-
|
850
|
-
|
851
|
-
|
852
|
-
|
853
|
-
|
854
|
-
|
855
|
-
|
856
|
-
|
857
|
-
|
858
|
-
|
859
|
-
|
860
|
-
|
861
|
-
|
862
|
-
|
863
|
-
|
864
|
-
|
865
|
-
|
866
|
-
|
867
|
-
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from collections import defaultdict
|
4
|
+
from collections.abc import Iterable
|
5
|
+
from itertools import compress
|
6
|
+
from typing import TYPE_CHECKING, Any
|
7
|
+
|
8
|
+
import anndata as ad
|
9
|
+
import numpy as np
|
10
|
+
import pandas as pd
|
11
|
+
from anndata import AnnData
|
12
|
+
from django.contrib.postgres.aggregates import ArrayAgg
|
13
|
+
from django.db import connections
|
14
|
+
from django.db.models import Aggregate
|
15
|
+
from lamin_utils import colors, logger
|
16
|
+
from lamindb_setup.core.upath import create_path
|
17
|
+
from lnschema_core.models import (
|
18
|
+
Artifact,
|
19
|
+
Collection,
|
20
|
+
Feature,
|
21
|
+
FeatureManager,
|
22
|
+
FeatureValue,
|
23
|
+
LinkORM,
|
24
|
+
Param,
|
25
|
+
ParamManager,
|
26
|
+
ParamManagerArtifact,
|
27
|
+
ParamManagerRun,
|
28
|
+
ParamValue,
|
29
|
+
Record,
|
30
|
+
Run,
|
31
|
+
ULabel,
|
32
|
+
)
|
33
|
+
|
34
|
+
from lamindb._feature import FEATURE_TYPES, convert_numpy_dtype_to_lamin_feature_type
|
35
|
+
from lamindb._feature_set import DICT_KEYS_TYPE, FeatureSet
|
36
|
+
from lamindb._record import (
|
37
|
+
REGISTRY_UNIQUE_FIELD,
|
38
|
+
get_name_field,
|
39
|
+
transfer_fk_to_default_db_bulk,
|
40
|
+
transfer_to_default_db,
|
41
|
+
)
|
42
|
+
from lamindb._save import save
|
43
|
+
from lamindb.core.exceptions import ValidationError
|
44
|
+
from lamindb.core.storage import LocalPathClasses
|
45
|
+
|
46
|
+
from ._django import get_artifact_with_related
|
47
|
+
from ._label_manager import get_labels_as_dict
|
48
|
+
from ._settings import settings
|
49
|
+
from .schema import (
|
50
|
+
dict_related_model_to_related_name,
|
51
|
+
)
|
52
|
+
|
53
|
+
if TYPE_CHECKING:
|
54
|
+
from lnschema_core.types import FieldAttr
|
55
|
+
|
56
|
+
from lamindb._query_set import QuerySet
|
57
|
+
|
58
|
+
|
59
|
+
def get_host_id_field(host: Artifact | Collection) -> str:
    """Return the name of the id column that refers to ``host``.

    Gives ``"artifact_id"`` when ``host`` is an ``Artifact`` instance and
    ``"collection_id"`` for every other host.
    """
    return "artifact_id" if isinstance(host, Artifact) else "collection_id"
|
65
|
+
|
66
|
+
|
67
|
+
def get_accessor_by_registry_(host: Artifact | Collection) -> dict:
    """Map registry names to the accessor names found on ``host``.

    One entry is created per object in ``host._meta.related_objects``, keyed
    by the related model's ``__get_name_with_schema__()`` and holding the
    field name. The ``"Feature"`` and ``"ULabel"`` entries are always set to
    ``"features"`` and ``"ulabels"``, overriding anything collected before.
    """
    accessor_by_registry = {}
    for related in host._meta.related_objects:
        registry_name = related.related_model.__get_name_with_schema__()
        accessor_by_registry[registry_name] = related.name
    accessor_by_registry.update({"Feature": "features", "ULabel": "ulabels"})
    return accessor_by_registry
|
75
|
+
|
76
|
+
|
77
|
+
def get_feature_set_by_slot_(host) -> dict:
    """Return the feature sets of ``host`` keyed by slot.

    While ``host`` is still being added (not yet saved), the in-memory
    ``_feature_sets`` attribute is returned if present, otherwise an empty
    dict. For a saved host, the feature-set link table is queried on the
    host's database.
    """
    state = host._state
    if state.adding:
        # unsaved host: nothing to query yet
        return getattr(host, "_feature_sets", {})
    link_filter = {get_host_id_field(host): host.id}
    link_manager = host.feature_sets.through.objects.using(state.db)
    links = link_manager.filter(**link_filter).select_related("featureset")
    return {link.slot: link.featureset for link in links}
|
94
|
+
|
95
|
+
|
96
|
+
def get_label_links(
    host: Artifact | Collection, registry: str, feature: Feature
) -> QuerySet:
    """Query the link records joining ``host`` to labels of ``registry``.

    The link table is reached through the accessor registered for
    ``registry`` on ``host.features`` and is filtered by the host id and by
    ``feature.id``, using the host's database.
    """
    link_filter = {get_host_id_field(host): host.id, "feature_id": feature.id}
    accessor_name = host.features._accessor_by_registry[registry]
    link_manager = getattr(host, accessor_name).through.objects
    return link_manager.using(host._state.db).filter(**link_filter)
|
107
|
+
|
108
|
+
|
109
|
+
def get_feature_set_links(host: Artifact | Collection) -> QuerySet:
    """Query the feature-set link records that belong to ``host``.

    NOTE(review): unlike the sibling helpers, this query does not call
    ``.using(host._state.db)`` - confirm whether that is intended.
    """
    link_filter = {get_host_id_field(host): host.id}
    return host.feature_sets.through.objects.filter(**link_filter)
|
114
|
+
|
115
|
+
|
116
|
+
def get_link_attr(link: LinkORM | type[LinkORM], data: Artifact | Collection) -> str:
    """Derive the attribute name of the linked record on a link model.

    The name of the link model (taken from the instance's class, or from the
    class itself when a class was passed) has the host's class name removed
    and is then lower-cased.
    """
    if link.__class__.__name__ in {"Registry", "ModelBase"}:
        # a link model class was passed rather than an instance
        model_name = link.__name__
    else:
        model_name = link.__class__.__name__
    host_name = data.__class__.__name__
    return model_name.replace(host_name, "").lower()
|
121
|
+
|
122
|
+
|
123
|
+
# Custom aggregation for SQLite
class GroupConcat(Aggregate):
    """Aggregate that renders as ``GROUP_CONCAT(<expressions>, ", ")``."""

    template = '%(function)s(%(expressions)s, ", ")'
    function = "GROUP_CONCAT"
|
127
|
+
|
128
|
+
|
129
|
+
def custom_aggregate(field, using: str):
    """Build the aggregate for ``field`` that suits the database ``using``.

    ``ArrayAgg`` is used when the connection's vendor is ``"postgresql"``;
    every other vendor gets ``GroupConcat``.
    """
    is_postgres = connections[using].vendor == "postgresql"
    aggregate_class = ArrayAgg if is_postgres else GroupConcat
    return aggregate_class(field)
|
134
|
+
|
135
|
+
|
136
|
+
def _print_categoricals_postgres(
    self: Artifact | Collection,
    related_data: dict | None = None,
    print_types: bool = False,
    to_dict: bool = False,
    print_params: bool = False,
):
    """Format the categorical feature values of ``self`` from related data.

    Args:
        self: The host whose label links are printed.
        related_data: Pre-fetched related data holding the ``"m2m"`` and
            ``"link"`` entries; fetched via ``get_artifact_with_related``
            when empty or ``None``.
        print_types: Whether to append ``": <dtype>"`` to each feature name.
        to_dict: Whether to also fill the returned dictionary.
        print_params: When true, nothing is printed for categoricals.

    Returns:
        A tuple ``(msg, dictionary)``: the sorted, newline-terminated lines
        and a mapping from feature name to its label(s). A single label is
        stored as a scalar, several labels as a list.
    """
    from lamindb._from_values import _print_values

    msg = ""
    dictionary: dict = {}
    if print_params:
        # params have no categorical values; skip all queries below
        return msg, dictionary

    if not related_data:
        artifact_meta = get_artifact_with_related(
            self, include_feature_link=True, include_m2m=True
        )
        related_data = artifact_meta.get("related_data", {})

    m2m_data = related_data.get("m2m", {}) if related_data else {}
    host_name = self.__class__.__name__
    m2m_name = {}
    for related_name, values in m2m_data.items():
        link_model = getattr(self.__class__, related_name).through
        related_model_name = link_model.__name__.replace(host_name, "").lower()
        m2m_name[related_model_name] = values
    links_data = related_data.get("link", {}) if related_data else {}
    feature_dict = {
        feature_id: (name, dtype)
        for feature_id, name, dtype in Feature.objects.using(
            self._state.db
        ).values_list("id", "name", "dtype")
    }

    # categorical feature values, grouped as name -> (dtype, labels)
    feature_values: dict = {}
    for link_name, link_values in links_data.items():
        if not link_values:
            continue
        related_name = link_name.removeprefix("links_").replace("_", "")
        for link_value in link_values:
            feature_id = link_value.get("feature")
            if feature_id is None:
                continue
            feature_info = feature_dict.get(feature_id)
            if feature_info is None:
                # link refers to a feature that was not found; skip it
                # rather than failing while building a representation
                continue
            feature_name, dtype = feature_info
            _, labels = feature_values.setdefault(feature_name, (dtype, []))
            label_id = link_value.get(related_name)
            labels.append(m2m_name.get(related_name, {}).get(label_id))

    labels_msgs = []
    for feature_name, (dtype, labels_list) in feature_values.items():
        print_values = _print_values(labels_list, n=10)
        type_str = f": {dtype}" if print_types else ""
        if to_dict:
            dictionary[feature_name] = (
                labels_list if len(labels_list) > 1 else labels_list[0]
            )
        labels_msgs.append(f" '{feature_name}'{type_str} = {print_values}")
    if labels_msgs:
        msg += "\n".join(sorted(labels_msgs)) + "\n"
    return msg, dictionary
|
203
|
+
|
204
|
+
|
205
|
+
def _print_categoricals(
    self: Artifact | Collection,
    print_types: bool = False,
    to_dict: bool = False,
    print_params: bool = False,
):
    """Format the categorical feature values of ``self`` via label links.

    Returns a tuple ``(msg, dictionary)``. Both are empty when
    ``print_params`` is true; ``dictionary`` is only filled when ``to_dict``
    is true.
    """
    from lamindb._from_values import _print_values

    if print_params:
        return "", {}

    # collect label names per feature id, ignoring links without a feature
    names_by_feature_id = defaultdict(list)
    for _, links in get_labels_as_dict(self, links=True).values():
        for link in links:
            if link.feature_id is None:
                continue
            label = getattr(link, get_link_attr(link, self))
            names_by_feature_id[link.feature_id].append(label.name)

    dictionary = {}
    lines = []
    for feature_id, label_names in names_by_feature_id.items():
        feature = Feature.objects.using(self._state.db).get(id=feature_id)
        shown = _print_values(label_names, n=10)
        suffix = f": {feature.dtype}" if print_types else ""
        if to_dict:
            if len(label_names) > 1:
                dictionary[feature.name] = label_names
            else:
                dictionary[feature.name] = label_names[0]
        lines.append(f" '{feature.name}'{suffix} = {shown}")

    msg = "\n".join(sorted(lines)) + "\n" if lines else ""
    return msg, dictionary
|
240
|
+
|
241
|
+
|
242
|
+
def _print_featuresets_postgres(
    self: Artifact | Collection,
    related_data: dict | None = None,
    print_types: bool = False,
):
    """Format the feature sets of ``self`` from related data.

    ``related_data`` is fetched via ``get_artifact_with_related`` when it is
    empty or ``None``. One line is produced per slot and type found under
    its ``"featuresets"`` entry; the type is shown only if ``print_types``.
    """
    from lamindb._from_values import _print_values

    if not related_data:
        meta = get_artifact_with_related(self, include_featureset=True)
        related_data = meta.get("related_data", {})
    if not related_data:
        return ""

    lines = []
    for slot, members_by_type in related_data.get("featuresets", {}).values():
        for type_name, feature_names in members_by_type.items():
            suffix = f": {type_name}" if print_types else ""
            lines.append(f" '{slot}'{suffix} = {_print_values(feature_names)}\n")
    return "".join(lines)
|
263
|
+
|
264
|
+
|
265
|
+
def print_features(
    self: Artifact | Collection,
    related_data: dict | None = None,
    print_types: bool = False,
    to_dict: bool = False,
    print_params: bool = False,
) -> str | dict[str, Any]:
    """Describe the features (or params) and feature sets linked to ``self``.

    Args:
        self: The host record to describe.
        related_data: Optional pre-fetched related data, passed on to the
            PostgreSQL-specific printers.
        print_types: Whether to append the dtype / registry to each name.
        to_dict: Return the collected values as a dictionary instead of the
            formatted message.
        print_params: Describe params instead of features; feature sets are
            skipped in that case.

    Returns:
        The formatted message, or the values dictionary when ``to_dict``.
    """
    from lamindb._from_values import _print_values

    if not self._state.adding and connections[self._state.db].vendor == "postgresql":
        msg, dictionary = _print_categoricals_postgres(
            self,
            related_data=related_data,
            print_types=print_types,
            to_dict=to_dict,
            print_params=print_params,
        )
    else:
        msg, dictionary = _print_categoricals(
            self,
            print_types=print_types,
            to_dict=to_dict,
            print_params=print_params,
        )

    # non-categorical feature values
    # the saved-check must guard both classes: `a and b or c` parses as
    # `(a and b) or c`, which let an unsaved Run reach the query below
    non_labels_msg = ""
    if self.id is not None and self.__class__ in (Artifact, Run):
        attr_name = "param" if print_params else "feature"
        _feature_values = (
            getattr(self, f"_{attr_name}_values")
            .values(f"{attr_name}__name", f"{attr_name}__dtype")
            .annotate(values=custom_aggregate("value", self._state.db))
            .order_by(f"{attr_name}__name")
        )
        for fv in _feature_values:
            feature_name = fv[f"{attr_name}__name"]
            feature_dtype = fv[f"{attr_name}__dtype"]
            values = fv["values"]
            # TODO: understand why the below is necessary
            if not isinstance(values, list):
                values = [values]
            if to_dict:
                dictionary[feature_name] = values if len(values) > 1 else values[0]
            type_str = f": {feature_dtype}" if print_types else ""
            printed_values = (
                _print_values(values, n=10, quotes=False)
                if not feature_dtype.startswith("list")
                else values
            )
            non_labels_msg += f" '{feature_name}'{type_str} = {printed_values}\n"
        msg += non_labels_msg

    if msg != "":
        header = "Features" if not print_params else "Params"
        msg = f" {colors.italic(header)}\n" + msg

    # feature sets
    if not print_params:
        feature_set_msg = ""
        if self.id is not None and connections[self._state.db].vendor == "postgresql":
            # forward print_types so both branches honor it consistently
            feature_set_msg = _print_featuresets_postgres(
                self, related_data=related_data, print_types=print_types
            )
        else:
            for slot, feature_set in get_feature_set_by_slot_(self).items():
                features = feature_set.members
                # features.first() is a lot slower than features[0] here
                name_field = get_name_field(features[0])
                feature_names = list(features.values_list(name_field, flat=True)[:20])
                type_str = f": {feature_set.registry}" if print_types else ""
                feature_set_msg += (
                    f" '{slot}'{type_str} = {_print_values(feature_names)}\n"
                )
        if feature_set_msg:
            msg += f" {colors.italic('Feature sets')}\n"
            msg += feature_set_msg
    if to_dict:
        return dictionary
    else:
        return msg
|
347
|
+
|
348
|
+
|
349
|
+
def parse_feature_sets_from_anndata(
    adata: AnnData,
    var_field: FieldAttr | None = None,
    obs_field: FieldAttr = Feature.name,
    mute: bool = False,
    organism: str | Record | None = None,
) -> dict:
    """Build feature sets for the ``var`` and ``obs`` slots of an AnnData.

    Args:
        adata: An ``AnnData`` object, or a path to one. A path is opened in
            backed mode (locally) or via ``backed_access`` (otherwise).
        var_field: Field used to parse ``var.index``; the ``var`` slot is
            skipped when ``None``.
        obs_field: Field used to parse the ``obs`` columns.
        mute: Passed on to the ``FeatureSet`` constructors.
        organism: Passed on to the ``FeatureSet`` constructors.

    Returns:
        A dict with a ``"var"`` and/or ``"obs"`` entry for every slot that
        produced a feature set.
    """
    data_parse = adata
    if not isinstance(adata, AnnData):  # is a path
        filepath = create_path(adata)  # returns Path for local
        if not isinstance(filepath, LocalPathClasses):
            from lamindb.core.storage._backed_access import backed_access

            using_key = settings._using_key
            data_parse = backed_access(filepath, using_key=using_key)
        else:
            data_parse = ad.read_h5ad(filepath, backed="r")
        feature_type = "float"
    else:
        feature_type = (
            "float"
            if adata.X is None
            else convert_numpy_dtype_to_lamin_feature_type(adata.X.dtype)
        )
    feature_sets = {}
    if var_field is not None:
        logger.info("parsing feature names of X stored in slot 'var'")
        logger.indent = " "
        try:
            feature_set_var = FeatureSet.from_values(
                data_parse.var.index,
                var_field,
                type=feature_type,
                mute=mute,
                organism=organism,
                raise_validation_error=False,
            )
            if feature_set_var is not None:
                feature_sets["var"] = feature_set_var
                logger.save(f"linked: {feature_set_var}")
        finally:
            # always undo the indentation, also when parsing raises
            logger.indent = ""
        if feature_set_var is None:
            logger.warning("skip linking features to artifact in slot 'var'")
    if len(data_parse.obs.columns) > 0:
        logger.info("parsing feature names of slot 'obs'")
        logger.indent = " "
        try:
            feature_set_obs = FeatureSet.from_df(
                df=data_parse.obs,
                field=obs_field,
                mute=mute,
                organism=organism,
            )
            if feature_set_obs is not None:
                feature_sets["obs"] = feature_set_obs
                logger.save(f"linked: {feature_set_obs}")
        finally:
            # always undo the indentation, also when parsing raises
            logger.indent = ""
        if feature_set_obs is None:
            logger.warning("skip linking features to artifact in slot 'obs'")
    return feature_sets
|
407
|
+
|
408
|
+
|
409
|
+
def infer_feature_type_convert_json(
    value: Any, mute: bool = False, str_as_ulabel: bool = True
) -> tuple[str, Any]:
    """Infer the feature type of ``value`` and return it with the value.

    Args:
        value: The value to inspect: a scalar, a record, or an iterable.
        mute: Suppress the warning emitted when no type can be inferred.
        str_as_ulabel: Treat strings as ULabel categories rather than as
            plain strings.

    Returns:
        A tuple ``(type_string, value)``. pandas Series and numpy arrays are
        returned converted to a list; every other value is returned as is.
        ``type_string`` is ``"?"`` when nothing could be inferred, e.g. for
        empty or heterogeneous iterables.
    """
    # bool must be tested before int because bool is a subclass of int
    if isinstance(value, bool):
        return FEATURE_TYPES["bool"], value
    elif isinstance(value, int):
        return FEATURE_TYPES["int"], value
    elif isinstance(value, float):
        return FEATURE_TYPES["float"], value
    elif isinstance(value, str):
        if str_as_ulabel:
            return FEATURE_TYPES["str"] + "[ULabel]", value
        else:
            return "str", value
    elif isinstance(value, Iterable) and not isinstance(value, (str, bytes)):
        if isinstance(value, (pd.Series, np.ndarray)):
            return convert_numpy_dtype_to_lamin_feature_type(
                value.dtype, str_as_cat=str_as_ulabel
            ), list(value)
        if isinstance(value, dict):
            return "dict", value
        if len(value) > 0:  # type: ignore
            first_element_type = type(next(iter(value)))
            if all(isinstance(elem, first_element_type) for elem in value):
                if first_element_type is bool:
                    return f"list[{FEATURE_TYPES['bool']}]", value
                elif first_element_type is int:
                    return f"list[{FEATURE_TYPES['int']}]", value
                elif first_element_type is float:
                    return f"list[{FEATURE_TYPES['float']}]", value
                elif first_element_type is str:
                    if str_as_ulabel:
                        return FEATURE_TYPES["str"] + "[ULabel]", value
                    else:
                        return "list[str]", value
                # the type of a record is a Record subclass, never Record
                # itself, so an equality test would not match real records
                elif issubclass(first_element_type, Record):
                    return (
                        f"cat[{first_element_type.__get_name_with_schema__()}]",
                        value,
                    )
    elif isinstance(value, Record):
        return (f"cat[{value.__class__.__get_name_with_schema__()}]", value)
    if not mute:
        logger.warning(f"cannot infer feature type of: {value}, returning '?'")
    return ("?", value)
|
454
|
+
|
455
|
+
|
456
|
+
def __init__(self, host: Artifact | Collection | Run):
    """Attach the manager to ``host`` and reset its two private slots.

    NOTE(review): the ``None`` slots look like caches that are filled
    elsewhere - confirm against the properties that read them.
    """
    self._host = host
    self._feature_set_by_slot_ = self._accessor_by_registry_ = None
|
460
|
+
|
461
|
+
|
462
|
+
def __repr__(self) -> str:
    """Return the printed features of the host, or its params for a ``ParamManager``."""
    is_param_manager = self.__class__ == ParamManager
    return print_features(self._host, print_params=is_param_manager)  # type: ignore
|
464
|
+
|
465
|
+
|
466
|
+
def get_values(self) -> dict[str, Any]:
    """Get feature values as a dictionary."""
    is_param_manager = self.__class__ == ParamManager
    values = print_features(self._host, to_dict=True, print_params=is_param_manager)
    return values  # type: ignore
|
471
|
+
|
472
|
+
|
473
|
+
def __getitem__(self, slot) -> QuerySet:
    """Return all records of the feature set linked under ``slot``.

    Raises:
        ValueError: If no feature set is linked for ``slot``.
    """
    if slot not in self._feature_set_by_slot:
        message = (
            f"No linked feature set for slot: {slot}\nDid you get validation"
            " warnings? Only features that match registered features get validated"
            " and linked."
        )
        raise ValueError(message)
    feature_set = self._feature_set_by_slot[slot]
    # the registry name of the feature set selects the accessor to read
    accessor_name = self._accessor_by_registry[feature_set.registry]
    members = getattr(feature_set, accessor_name)
    return members.all()
|
483
|
+
|
484
|
+
|
485
|
+
def filter_base(cls, **expression):
    """Translate a feature/param keyword expression into a registry query.

    Each key is a feature (or param) name, optionally followed by a single
    `__<comparator>` suffix that is appended to the `value` or `name` lookup.

    Args:
        cls: The manager class the query is issued from; it selects the
            feature vs. param models and the registry that is queried.
        **expression: Name (plus optional comparator) to value pairs.

    Returns:
        `Artifact.filter(...)` for `FeatureManager` and `ParamManagerArtifact`,
        `Run.filter(...)` for `ParamManagerRun`, and implicitly `None` for any
        other class.

    Raises:
        ValidationError: If a key does not validate against the `name` field.
        NotImplementedError: If the value for a categorical feature is not a
            string.
    """
    if cls is FeatureManager:
        model, value_model = Feature, FeatureValue
    else:
        model, value_model = Param, ParamValue
    keys_normalized = [key.split("__")[0] for key in expression]
    validated = model.validate(keys_normalized, field="name", mute=True)
    if sum(validated) != len(keys_normalized):
        raise ValidationError(
            f"Some keys in the filter expression are not registered as features: {np.array(keys_normalized)[~validated]}"
        )
    new_expression = {}
    features = model.filter(name__in=keys_normalized).all().distinct()
    feature_param = "param" if model is Param else "feature"
    # NOTE(review): the target keys below are fixed per kind, so when several
    # non-categorical (or several categorical) keys are passed, the last one
    # overwrites the earlier ones
    for key, value in expression.items():
        parts = key.split("__")
        # a comparator is only recognized for exactly one "__" separator
        comparator = f"__{parts[1]}" if len(parts) == 2 else ""
        feature = features.get(name=parts[0])
        if feature.dtype.startswith("cat"):
            if not isinstance(value, str):
                raise NotImplementedError
            label_lookup = {f"name{comparator}": value}
            new_expression["ulabels"] = ULabel.get(**label_lookup)
            continue
        value_lookup = {feature_param: feature, f"value{comparator}": value}
        new_expression[f"_{feature_param}_values__in"] = value_model.filter(
            **value_lookup
        )
    if cls in (FeatureManager, ParamManagerArtifact):
        return Artifact.filter(**new_expression)
    # a similar branch for collections might be re-enabled in the future
    elif cls == ParamManagerRun:
        return Run.filter(**new_expression)
|
526
|
+
|
527
|
+
|
528
|
+
@classmethod  # type: ignore
def filter(cls, **expression) -> QuerySet:
    """Query artifacts by features.

    The keyword expression is handed to `filter_base` together with the
    calling manager class.
    """
    query_set = filter_base(cls, **expression)
    return query_set
|
532
|
+
|
533
|
+
|
534
|
+
@classmethod  # type: ignore
def get(cls, **expression) -> Record:
    """Query a single artifact by feature.

    Runs the same query as `filter_base` and calls `.one()` on its result.
    """
    matches = filter_base(cls, **expression)
    return matches.one()
|
538
|
+
|
539
|
+
|
540
|
+
@property  # type: ignore
def _feature_set_by_slot(self):
    """Feature sets by slot, looked up on first access and cached afterwards."""
    cached = self._feature_set_by_slot_
    if cached is None:
        cached = get_feature_set_by_slot_(self._host)
        self._feature_set_by_slot_ = cached
    return cached
|
546
|
+
|
547
|
+
|
548
|
+
@property  # type: ignore
def _accessor_by_registry(self):
    """Accessor by ORM, looked up on first access and cached afterwards."""
    cached = self._accessor_by_registry_
    if cached is None:
        cached = get_accessor_by_registry_(self._host)
        self._accessor_by_registry_ = cached
    return cached
|
554
|
+
|
555
|
+
|
556
|
+
def _add_values(
    self,
    values: dict[str, str | int | float | bool],
    feature_param_field: FieldAttr,
    str_as_ulabel: bool = True,
) -> None:
    """Curate artifact with features & values.

    Keys are validated against the registry of `feature_param_field`. Values
    of non-categorical features are stored as value records and linked to the
    host; values of categorical features are linked as label records.

    Args:
        values: A dictionary of keys (features) & values (labels, numbers, booleans).
        feature_param_field: The field of a reference registry to map keys of the
            dictionary.
        str_as_ulabel: Forwarded to `infer_feature_type_convert_json` when the
            dtype of each value is inferred.

    Raises:
        ValidationError: If the host artifact has the wrong type, if keys or
            ulabel names cannot be validated, if a record passed as a value is
            unsaved, or if an inferred dtype does not match the registered one.
        TypeError: If a value for a `number` or categorical feature has an
            incompatible inferred type.
    """
    # rename to distinguish from the values inside the dict
    features_values = values
    keys = features_values.keys()
    if isinstance(keys, DICT_KEYS_TYPE):
        keys = list(keys)  # type: ignore
    # deal with other cases later
    assert all(isinstance(key, str) for key in keys)  # noqa: S101
    registry = feature_param_field.field.model
    is_param = registry == Param
    model = Param if is_param else Feature
    value_model = ParamValue if is_param else FeatureValue
    model_name = "Param" if is_param else "Feature"
    if is_param:
        if self._host.__class__ == Artifact:
            if self._host.type != "model":
                raise ValidationError("Can only set params for model-like artifacts.")
    else:
        if self._host.__class__ == Artifact:
            if self._host.type != "dataset" and self._host.type is not None:
                raise ValidationError(
                    "Can only set features for dataset-like artifacts."
                )
    validated = registry.validate(keys, field=feature_param_field, mute=True)
    keys_array = np.array(keys)
    validated_keys = keys_array[validated]
    if validated.sum() != len(keys):
        not_validated_keys = keys_array[~validated]
        hint = "\n".join(
            [
                f" ln.{model_name}(name='{key}', dtype='{infer_feature_type_convert_json(features_values[key], str_as_ulabel=str_as_ulabel)[0]}').save()"
                for key in not_validated_keys
            ]
        )
        msg = (
            f"These keys could not be validated: {not_validated_keys.tolist()}\n"
            f"Here is how to create a {model_name.lower()}:\n\n{hint}"
        )
        raise ValidationError(msg)
    registry.from_values(
        validated_keys,
        field=feature_param_field,
    )
    # figure out which of the values go where
    features_labels = defaultdict(list)
    _feature_values = []
    not_validated_values = []
    for key, value in features_values.items():
        feature = model.get(name=key)
        inferred_type, converted_value = infer_feature_type_convert_json(
            value,
            mute=True,
            str_as_ulabel=str_as_ulabel,
        )
        if feature.dtype == "number":
            if inferred_type not in {"int", "float"}:
                raise TypeError(
                    f"Value for feature '{key}' with type {feature.dtype} must be a number"
                )
        elif feature.dtype.startswith("cat"):
            if inferred_type != "?":
                if not (inferred_type.startswith("cat") or isinstance(value, Record)):
                    raise TypeError(
                        f"Value for feature '{key}' with type '{feature.dtype}' must be a string or record."
                    )
        elif not inferred_type == feature.dtype:
            raise ValidationError(
                f"Expected dtype for '{key}' is '{feature.dtype}', got '{inferred_type}'"
            )
        if not feature.dtype.startswith("cat"):
            # can remove the query once we have the unique constraint
            filter_kwargs = {model_name.lower(): feature, "value": converted_value}
            feature_value = value_model.filter(**filter_kwargs).one_or_none()
            if feature_value is None:
                feature_value = value_model(**filter_kwargs)
            _feature_values.append(feature_value)
        else:
            value_is_record = isinstance(value, Record)
            # peek at the first element with a default: an empty iterable
            # (e.g. "" or []) must fall through to the ULabel branch instead
            # of escaping as a bare StopIteration
            holds_records = (
                not value_is_record
                and isinstance(value, Iterable)
                and isinstance(next(iter(value), None), Record)
            )
            if value_is_record or holds_records:
                label_records = [value] if value_is_record else value  # type: ignore
                for record in label_records:
                    if record._state.adding:
                        raise ValidationError(
                            f"Please save {record} before annotation."
                        )
                    features_labels[record.__class__.__get_name_with_schema__()].append(
                        (feature, record)
                    )
            else:
                # a distinct local name, so the `values` parameter is not shadowed
                label_names = [value] if isinstance(value, str) else value  # type: ignore
                if "ULabel" not in feature.dtype:
                    feature.dtype += "[ULabel]"
                    feature.save()
                labels_validated = ULabel.validate(label_names, field="name", mute=True)
                names_array = np.array(label_names)
                validated_values = names_array[labels_validated]
                if labels_validated.sum() != len(label_names):
                    # collected across all keys and reported together below
                    not_validated_values += names_array[~labels_validated].tolist()
                label_records = ULabel.from_values(validated_values, field="name")
                features_labels["ULabel"] += [
                    (feature, label_record) for label_record in label_records
                ]
    if not_validated_values:
        hint = (
            f" ulabels = ln.ULabel.from_values({not_validated_values}, create=True)\n"
            f" ln.save(ulabels)"
        )
        msg = (
            f"These values could not be validated: {not_validated_values}\n"
            f"Here is how to create ulabels for them:\n\n{hint}"
        )
        raise ValidationError(msg)
    # bulk add all links to ArtifactULabel
    if features_labels:
        if list(features_labels.keys()) != ["ULabel"]:
            related_names = dict_related_model_to_related_name(self._host.__class__)
        else:
            related_names = {"ULabel": "ulabels"}
        for class_name, registry_features_labels in features_labels.items():
            related_name = related_names[class_name]  # e.g., "ulabels"
            LinkORM = getattr(self._host, related_name).through
            field_name = f"{get_link_attr(LinkORM, self._host)}_id"  # e.g., ulabel_id
            links = [
                LinkORM(
                    **{
                        "artifact_id": self._host.id,
                        "feature_id": feature.id,
                        field_name: label.id,
                    }
                )
                for (feature, label) in registry_features_labels
            ]
            # a link might already exist: retry the save with conflicts ignored
            try:
                save(links, ignore_conflicts=False)
            except Exception:
                save(links, ignore_conflicts=True)
            # now deal with links that were previously saved without a feature_id
            links_saved = LinkORM.filter(
                **{
                    "artifact_id": self._host.id,
                    f"{field_name}__in": [
                        label.id for _, label in registry_features_labels
                    ],
                }
            )
            for link in links_saved.all():
                # TODO: also check for inconsistent features
                if link.feature_id is None:
                    link.feature_id = [
                        feat.id
                        for feat, label in registry_features_labels
                        if label.id == getattr(link, field_name)
                    ][0]
                    link.save()
    if _feature_values:
        save(_feature_values)
        if is_param:
            LinkORM = self._host._param_values.through
            valuefield_id = "paramvalue_id"
        else:
            LinkORM = self._host._feature_values.through
            valuefield_id = "featurevalue_id"
        links = [
            LinkORM(
                **{
                    f"{self._host.__class__.__get_name_with_schema__().lower()}_id": self._host.id,
                    valuefield_id: feature_value.id,
                }
            )
            for feature_value in _feature_values
        ]
        # a link might already exist, to avoid raising a unique constraint
        # error, ignore_conflicts
        save(links, ignore_conflicts=True)
|
750
|
+
|
751
|
+
|
752
|
+
def add_values_features(
    self,
    values: dict[str, str | int | float | bool],
    feature_field: FieldAttr = Feature.name,
    str_as_ulabel: bool = True,
) -> None:
    """Curate artifact with features & values.

    Args:
        values: A dictionary of keys (features) & values (labels, numbers, booleans).
        feature_field: The field of a reference registry to map keys of the
            dictionary.
        str_as_ulabel: Whether to interpret string values as ulabels.
    """
    _add_values(
        self,
        values=values,
        feature_param_field=feature_field,
        str_as_ulabel=str_as_ulabel,
    )
|
767
|
+
|
768
|
+
|
769
|
+
def add_values_params(
    self,
    values: dict[str, str | int | float | bool],
) -> None:
    """Curate the host with params & values.

    Keys are mapped on `Param.name` and `str_as_ulabel` is switched off.

    Args:
        values: A dictionary of keys (params) & values.
    """
    _add_values(
        self,
        values=values,
        feature_param_field=Param.name,
        str_as_ulabel=False,
    )
|
779
|
+
|
780
|
+
|
781
|
+
def add_feature_set(self, feature_set: FeatureSet, slot: str) -> None:
    """Curate artifact with a feature set.

    The feature set is saved to the host's database. A link record is only
    created, and the slot cache only updated, when no identical link
    (host, feature set, slot) exists yet.

    Args:
        feature_set: `FeatureSet` A feature set record.
        slot: `str` The slot that marks where the feature set is stored in
            the artifact.

    Raises:
        ValueError: If the host has not been saved yet.
    """
    host = self._host
    if host._state.adding:
        raise ValueError(
            "Please save the artifact or collection before adding a feature set!"
        )
    host_db = host._state.db
    feature_set.save(using=host_db)
    link_kwargs = {
        get_host_id_field(host): host.id,
        "featureset": feature_set,
        "slot": slot,
    }
    link_model = host.feature_sets.through
    existing_link = (
        link_model.objects.using(host_db).filter(**link_kwargs).one_or_none()
    )
    if existing_link is not None:
        return
    host.feature_sets.through(**link_kwargs).save(using=host_db)
    # going through the property makes sure the cache is loaded before it is
    # written to
    slot_cache = self._feature_set_by_slot
    if slot in slot_cache:
        logger.debug(f"replaced existing {slot} feature set")
    slot_cache[slot] = feature_set  # type: ignore
|
811
|
+
|
812
|
+
|
813
|
+
def _add_set_from_df(
    self, field: FieldAttr = Feature.name, organism: str | None = None
):
    """Add feature set corresponding to column names of DataFrame.

    Args:
        field: Registry field the column names are mapped on.
        organism: Passed through to the registry's `from_values`.
    """
    host = self._host
    if isinstance(host, Artifact):
        accessor = host._accessor
    else:
        # Collection
        accessor = host.artifact._accessor
    assert accessor == "DataFrame"  # noqa: S101

    # parse and register features
    registry = field.field.model
    columns = host.load().columns
    features = registry.from_values(columns, field=field, organism=organism)
    if len(features) == 0:
        logger.error(
            "no validated features found in DataFrame! please register features first!"
        )
        return

    # create and link feature sets
    host._feature_sets = {"columns": FeatureSet(features=features)}
    host.save()
|
838
|
+
|
839
|
+
|
840
|
+
def _add_set_from_anndata(
    self,
    var_field: FieldAttr,
    obs_field: FieldAttr | None = Feature.name,
    mute: bool = False,
    organism: str | Record | None = None,
):
    """Add features from AnnData.

    All arguments are passed through to `parse_feature_sets_from_anndata`;
    the parsed feature sets are attached to the host, which is then saved.

    Raises:
        NotImplementedError: If the host is not an `Artifact`.
    """
    host = self._host
    if not isinstance(host, Artifact):
        raise NotImplementedError()
    assert host._accessor == "AnnData"  # noqa: S101

    # parse and register features
    parsed_sets = parse_feature_sets_from_anndata(
        host.load(),
        var_field=var_field,
        obs_field=obs_field,
        mute=mute,
        organism=organism,
    )

    # link feature sets
    host._feature_sets = parsed_sets
    host.save()
|
866
|
+
|
867
|
+
|
868
|
+
def _add_set_from_mudata(
    self,
    var_fields: dict[str, FieldAttr],
    obs_fields: dict[str, FieldAttr] = None,
    mute: bool = False,
    organism: str | Record | None = None,
):
    """Add features from MuData.

    Args:
        var_fields: Registry field for the variables of each modality.
        obs_fields: Registry field for the observations of each modality;
            modalities without an entry use `Feature.name`.
        mute: Passed through to `parse_feature_sets_from_anndata`.
        organism: Passed through to `parse_feature_sets_from_anndata`.

    Raises:
        NotImplementedError: If the host is not an `Artifact`.
    """
    obs_fields = {} if obs_fields is None else obs_fields
    host = self._host
    if not isinstance(host, Artifact):
        raise NotImplementedError()
    assert host._accessor == "MuData"  # noqa: S101

    # parse and register features
    mdata = host.load()
    feature_sets = {}
    obs_features = Feature.from_values(mdata.obs.columns)
    if len(obs_features) > 0:
        feature_sets["obs"] = FeatureSet(features=obs_features)
    for modality, field in var_fields.items():
        modality_sets = parse_feature_sets_from_anndata(
            mdata[modality],
            var_field=field,
            obs_field=obs_fields.get(modality, Feature.name),
            mute=mute,
            organism=organism,
        )
        feature_sets.update(
            {f"['{modality}'].{k}": v for k, v in modality_sets.items()}
        )

    # slots whose feature sets share a hash all point to the first feature set
    # seen with that hash; only existing keys are reassigned, so the dict can
    # be updated while iterating
    first_by_hash = {}
    for slot_key, candidate in feature_sets.items():
        feature_sets[slot_key] = first_by_hash.setdefault(candidate.hash, candidate)

    # link feature sets
    host._feature_sets = feature_sets
    host.save()
|
915
|
+
|
916
|
+
|
917
|
+
def _add_from(self, data: Artifact | Collection, transfer_logs: dict = None):
    """Transfer features from a artifact or collection.

    For every slot of `data`, members missing from the default database are
    transferred and saved, then a feature set is rebuilt from the member
    identifiers and linked to the host under the same slot.

    Args:
        data: The record whose feature sets are transferred.
        transfer_logs: Log dictionary handed to the transfer helpers; a fresh
            one is created when omitted.
    """
    # only feature sets are covered here
    if transfer_logs is None:
        transfer_logs = {"mapped": [], "transferred": [], "run": None}
    using_key = settings._using_key
    source_sets = data.features._feature_set_by_slot
    for slot, feature_set in source_sets.items():
        members = feature_set.members
        if len(members) == 0:
            continue
        registry = members[0].__class__
        has_ontology_id = hasattr(registry, "_ontology_id_field")
        # members are matched on a unique field: the ontology id field when the
        # registry has one, otherwise the registry's entry in
        # REGISTRY_UNIQUE_FIELD (falling back to "uid")
        if has_ontology_id:
            field = registry._ontology_id_field
        else:
            field = REGISTRY_UNIQUE_FIELD.get(registry.__name__.lower(), "uid")
        # e.g. a list of ontology_ids or uids
        member_uids = list(members.values_list(field, flat=True))
        if has_ontology_id and len(member_uids) > 0:
            # create records from ontology_id (from bionty) and save the new ones
            ontology_records = registry.from_values(member_uids, field=field)
            save([r for r in ontology_records if r._state.adding])
        validated = registry.validate(member_uids, field=field, mute=True)
        missing_uids = list(compress(member_uids, ~validated))
        new_members = members.filter(**{f"{field}__in": missing_uids}).all()
        n_new_members = len(new_members)
        if n_new_members > 0:
            # foreign keys have to be transferred before the records themselves
            transfer_fk_to_default_db_bulk(
                new_members, using_key, transfer_logs=transfer_logs
            )
            for member in new_members:
                # no per-record save here because of the bulk save below;
                # transfer_fk is False because the foreign keys were already
                # handled by transfer_fk_to_default_db_bulk
                transfer_to_default_db(
                    member, using_key, transfer_fk=False, transfer_logs=transfer_logs
                )
            logger.info(f"saving {n_new_members} new {registry.__name__} records")
            save(new_members)

        # create a new feature set from feature values using the same uid
        unique_field = getattr(registry, field)
        feature_set_self = FeatureSet.from_values(member_uids, field=unique_field)
        if feature_set_self is None:
            if hasattr(registry, "organism_id"):
                logger.warning(
                    f"FeatureSet is not transferred, check if organism is set correctly: {feature_set}"
                )
            continue
        # make sure the uid matches if featureset is composed of same features
        if feature_set_self.hash == feature_set.hash:
            feature_set_self.uid = feature_set.uid
        logger.info(f"saving {slot} featureset: {feature_set_self}")
        self._host.features.add_feature_set(feature_set_self, slot)
|
974
|
+
|
975
|
+
|
976
|
+
# Attach the module-level functions above as methods of the manager classes.
# For an attribute set on both classes, the FeatureManager assignment still
# comes before the ParamManager one.

# FeatureManager
FeatureManager.__init__ = __init__
FeatureManager.__repr__ = __repr__
FeatureManager.__getitem__ = __getitem__
FeatureManager.get_values = get_values
FeatureManager._feature_set_by_slot = _feature_set_by_slot
FeatureManager._accessor_by_registry = _accessor_by_registry
FeatureManager.add_values = add_values_features
FeatureManager.add_feature_set = add_feature_set
FeatureManager._add_set_from_df = _add_set_from_df
FeatureManager._add_set_from_anndata = _add_set_from_anndata
FeatureManager._add_set_from_mudata = _add_set_from_mudata
FeatureManager._add_from = _add_from
FeatureManager.filter = filter
FeatureManager.get = get

# ParamManager
ParamManager.__init__ = __init__
ParamManager.__repr__ = __repr__
ParamManager.add_values = add_values_params
ParamManager.get_values = get_values
ParamManager.filter = filter
|