repare 0.0.2__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of repare might be problematic. Click here for more details.
- repare/main.py +9 -3
- repare/pedigree.py +491 -242
- repare/pedigree_reconstructor.py +213 -43
- repare-0.1.1.dist-info/METADATA +121 -0
- repare-0.1.1.dist-info/RECORD +10 -0
- {repare-0.0.2.dist-info → repare-0.1.1.dist-info}/WHEEL +1 -1
- repare-0.0.2.dist-info/METADATA +0 -35
- repare-0.0.2.dist-info/RECORD +0 -10
- {repare-0.0.2.dist-info → repare-0.1.1.dist-info}/entry_points.txt +0 -0
- {repare-0.0.2.dist-info → repare-0.1.1.dist-info}/licenses/LICENSE +0 -0
- {repare-0.0.2.dist-info → repare-0.1.1.dist-info}/top_level.txt +0 -0
repare/pedigree_reconstructor.py
CHANGED
|
@@ -47,12 +47,13 @@ class PedigreeReconstructor:
|
|
|
47
47
|
self._write_alternate_pedigrees = write_alternate_pedigrees
|
|
48
48
|
self._random_seed = random_seed
|
|
49
49
|
random.seed(self._random_seed)
|
|
50
|
+
self._validate_arguments()
|
|
50
51
|
|
|
51
52
|
# Maximum number of times to run the algorithm if no valid pedigree is found
|
|
52
53
|
self._MAX_RUNS = 10
|
|
53
54
|
self._candidate_pedigrees: list[Pedigree] = [self._get_initial_pedigree()]
|
|
54
55
|
self._pair_to_constraints: defaultdict[tuple[str, str], list[tuple[str, ...]]] = self._get_pair_to_constraints()
|
|
55
|
-
self.
|
|
56
|
+
self._final_pedigrees: list[Pedigree] = []
|
|
56
57
|
|
|
57
58
|
def _validate_node_data(self, nodes_path: str) -> None:
|
|
58
59
|
"""
|
|
@@ -71,6 +72,12 @@ class PedigreeReconstructor:
|
|
|
71
72
|
if self._node_data["id"].str.isnumeric().any():
|
|
72
73
|
raise ValueError("Sample IDs cannot be completely numeric.")
|
|
73
74
|
|
|
75
|
+
if self._node_data["id"].duplicated().any():
|
|
76
|
+
raise ValueError("Sample IDs must be unique.")
|
|
77
|
+
|
|
78
|
+
if self._node_data["id"].str.strip().eq("").any():
|
|
79
|
+
raise ValueError("Sample IDs cannot be empty.")
|
|
80
|
+
|
|
74
81
|
if not self._node_data["sex"].isin(["M", "F"]).all():
|
|
75
82
|
raise ValueError('Node sex must be "M" or "F".')
|
|
76
83
|
|
|
@@ -275,6 +282,15 @@ class PedigreeReconstructor:
|
|
|
275
282
|
[self._first_degree_relations, self._second_degree_relations, self._third_degree_relations]
|
|
276
283
|
).reset_index(drop=True)
|
|
277
284
|
|
|
285
|
+
def _validate_arguments(self) -> None:
|
|
286
|
+
"""
|
|
287
|
+
Validate constructor arguments.
|
|
288
|
+
"""
|
|
289
|
+
if not isinstance(self._max_candidate_pedigrees, int) or self._max_candidate_pedigrees <= 0:
|
|
290
|
+
raise ValueError("max_candidate_pedigrees must be a positive integer.")
|
|
291
|
+
if not (0 <= self._epsilon <= 1):
|
|
292
|
+
raise ValueError("epsilon must be between 0 and 1.")
|
|
293
|
+
|
|
278
294
|
def _shuffle_relations(self) -> None:
|
|
279
295
|
"""
|
|
280
296
|
Shuffle relation DataFrames (when we want to restart the algorithm).
|
|
@@ -333,11 +349,14 @@ class PedigreeReconstructor:
|
|
|
333
349
|
self._add_relation(
|
|
334
350
|
node1, node2, degree=degree, constraints=constraints, force_constraints=force_constraints
|
|
335
351
|
)
|
|
336
|
-
self.
|
|
352
|
+
self._clean_pedigree_data()
|
|
353
|
+
self._validate_pedigree_structures()
|
|
337
354
|
|
|
338
355
|
processed_relations = self._all_relations.iloc[: idx + 1]
|
|
339
356
|
pair_to_relations_so_far = self._get_pair_to_relations_so_far(processed_relations)
|
|
340
357
|
if degree == "1" and len(processed_relations) < len(self._first_and_second_degree_relations):
|
|
358
|
+
# Don't check for extraneous half-sibling relations because
|
|
359
|
+
# the 2 non-shared parents might be "merged" later
|
|
341
360
|
self._prune_pedigrees(pair_to_relations_so_far, check_half_siblings=False)
|
|
342
361
|
else:
|
|
343
362
|
self._prune_pedigrees(pair_to_relations_so_far, check_half_siblings=True)
|
|
@@ -358,6 +377,7 @@ class PedigreeReconstructor:
|
|
|
358
377
|
logger.error(f"No valid pedigree found after {self._MAX_RUNS} runs. Exiting.")
|
|
359
378
|
raise RuntimeError(f"No valid pedigree found after {self._MAX_RUNS} runs.")
|
|
360
379
|
|
|
380
|
+
self._clean_pedigree_data()
|
|
361
381
|
# Plot and write outputs of sample pedigree
|
|
362
382
|
sample_idx = random.randint(0, len(self._final_pedigrees) - 1)
|
|
363
383
|
self._sample_pedigree = self._final_pedigrees[sample_idx]
|
|
@@ -373,7 +393,7 @@ class PedigreeReconstructor:
|
|
|
373
393
|
)
|
|
374
394
|
if self._plot:
|
|
375
395
|
try:
|
|
376
|
-
self._sample_pedigree.plot(os.path.join(self._outputs_dir, "reconstructed_pedigree.
|
|
396
|
+
self._sample_pedigree.plot(os.path.join(self._outputs_dir, "reconstructed_pedigree.pdf"))
|
|
377
397
|
pygraphviz_found = True
|
|
378
398
|
except ImportError:
|
|
379
399
|
logger.warning(
|
|
@@ -386,7 +406,7 @@ class PedigreeReconstructor:
|
|
|
386
406
|
if self._write_alternate_pedigrees:
|
|
387
407
|
os.makedirs(os.path.join(self._outputs_dir, "alternate_pedigrees"), exist_ok=True)
|
|
388
408
|
for idx, (pedigree, strike_count, strike_log) in enumerate(
|
|
389
|
-
zip(self._final_pedigrees, self._final_strike_counts, self._final_strike_logs)
|
|
409
|
+
zip(self._final_pedigrees, self._final_strike_counts, self._final_strike_logs, strict=True)
|
|
390
410
|
):
|
|
391
411
|
self._write_corrected_input_relations(
|
|
392
412
|
strike_count,
|
|
@@ -402,6 +422,119 @@ class PedigreeReconstructor:
|
|
|
402
422
|
pedigree.plot(os.path.join(self._outputs_dir, "alternate_pedigrees", f"pedigree_{idx}.png"))
|
|
403
423
|
return self._sample_pedigree
|
|
404
424
|
|
|
425
|
+
@staticmethod
|
|
426
|
+
def _check_haplogroups(haplogroup1: str, haplogroup2: str) -> bool:
|
|
427
|
+
"""
|
|
428
|
+
Checks if two haplogroups are compatible. Same semantics as pedigree.validate_haplogroups().
|
|
429
|
+
"*" is wild card character.
|
|
430
|
+
"""
|
|
431
|
+
if not haplogroup1 or not haplogroup2: # empty OK
|
|
432
|
+
return True
|
|
433
|
+
haplogroup1_stripped, haplogroup2_stripped = haplogroup1.rstrip("*"), haplogroup2.rstrip("*")
|
|
434
|
+
return haplogroup1_stripped.startswith(haplogroup2_stripped) or haplogroup2_stripped.startswith(
|
|
435
|
+
haplogroup1_stripped
|
|
436
|
+
)
|
|
437
|
+
|
|
438
|
+
@staticmethod
|
|
439
|
+
def _check_parent_child_haplogroups(pedigree: Pedigree, parent: str, child: str) -> bool:
|
|
440
|
+
"""
|
|
441
|
+
Checks if the haplogroups of a parent and child are compatible.
|
|
442
|
+
"""
|
|
443
|
+
if pedigree.get_data(parent)["sex"] == "M" and pedigree.get_data(child)["sex"] == "M":
|
|
444
|
+
return PedigreeReconstructor._check_haplogroups(
|
|
445
|
+
pedigree.get_data(parent)["y_haplogroup"], pedigree.get_data(child)["y_haplogroup"]
|
|
446
|
+
)
|
|
447
|
+
if pedigree.get_data(parent)["sex"] == "F":
|
|
448
|
+
return PedigreeReconstructor._check_haplogroups(
|
|
449
|
+
pedigree.get_data(parent)["mt_haplogroup"], pedigree.get_data(child)["mt_haplogroup"]
|
|
450
|
+
)
|
|
451
|
+
return True
|
|
452
|
+
|
|
453
|
+
@staticmethod
|
|
454
|
+
def _check_sibling_haplogroups(pedigree: Pedigree, sibling1: str, sibling2: str) -> bool:
|
|
455
|
+
"""
|
|
456
|
+
Checks if the haplogroups of two full siblings are compatible.
|
|
457
|
+
"""
|
|
458
|
+
if pedigree.get_data(sibling1)["sex"] == "M" and pedigree.get_data(sibling2)["sex"] == "M":
|
|
459
|
+
# MT haplogroups still need to agree as well
|
|
460
|
+
if not PedigreeReconstructor._check_haplogroups(
|
|
461
|
+
pedigree.get_data(sibling1)["y_haplogroup"], pedigree.get_data(sibling2)["y_haplogroup"]
|
|
462
|
+
):
|
|
463
|
+
return False
|
|
464
|
+
# All full siblings should share MT haplogroups
|
|
465
|
+
return PedigreeReconstructor._check_haplogroups(
|
|
466
|
+
pedigree.get_data(sibling1)["mt_haplogroup"], pedigree.get_data(sibling2)["mt_haplogroup"]
|
|
467
|
+
)
|
|
468
|
+
|
|
469
|
+
@staticmethod
|
|
470
|
+
def _check_aunt_uncle_nephew_niece_haplogroups(
|
|
471
|
+
pedigree: Pedigree, aunt_uncle: str, nephew_niece: str, shared_relative_sex: str | None
|
|
472
|
+
) -> bool:
|
|
473
|
+
"""
|
|
474
|
+
Checks if the haplogroups of an aunt/uncle and nephew/niece are compatible.
|
|
475
|
+
"""
|
|
476
|
+
if not shared_relative_sex:
|
|
477
|
+
return True
|
|
478
|
+
|
|
479
|
+
if (
|
|
480
|
+
shared_relative_sex == "M"
|
|
481
|
+
and pedigree.get_data(aunt_uncle)["sex"] == "M"
|
|
482
|
+
and pedigree.get_data(nephew_niece)["sex"] == "M"
|
|
483
|
+
):
|
|
484
|
+
return PedigreeReconstructor._check_haplogroups(
|
|
485
|
+
pedigree.get_data(aunt_uncle)["y_haplogroup"], pedigree.get_data(nephew_niece)["y_haplogroup"]
|
|
486
|
+
)
|
|
487
|
+
if shared_relative_sex == "F":
|
|
488
|
+
return PedigreeReconstructor._check_haplogroups(
|
|
489
|
+
pedigree.get_data(aunt_uncle)["mt_haplogroup"], pedigree.get_data(nephew_niece)["mt_haplogroup"]
|
|
490
|
+
)
|
|
491
|
+
return True
|
|
492
|
+
|
|
493
|
+
@staticmethod
|
|
494
|
+
def _check_grandparent_grandchild_haplogroups(
|
|
495
|
+
pedigree: Pedigree, grandparent: str, grandchild: str, shared_relative_sex: str | None
|
|
496
|
+
) -> bool:
|
|
497
|
+
"""
|
|
498
|
+
Checks if the haplogroups of a grandparent and grandchild are compatible.
|
|
499
|
+
"""
|
|
500
|
+
if not shared_relative_sex:
|
|
501
|
+
return True
|
|
502
|
+
|
|
503
|
+
if (
|
|
504
|
+
shared_relative_sex == "M"
|
|
505
|
+
and pedigree.get_data(grandparent)["sex"] == "M"
|
|
506
|
+
and pedigree.get_data(grandchild)["sex"] == "M"
|
|
507
|
+
):
|
|
508
|
+
return PedigreeReconstructor._check_haplogroups(
|
|
509
|
+
pedigree.get_data(grandparent)["y_haplogroup"], pedigree.get_data(grandchild)["y_haplogroup"]
|
|
510
|
+
)
|
|
511
|
+
if shared_relative_sex == "F" and pedigree.get_data(grandparent)["sex"] == "F":
|
|
512
|
+
return PedigreeReconstructor._check_haplogroups(
|
|
513
|
+
pedigree.get_data(grandparent)["mt_haplogroup"], pedigree.get_data(grandchild)["mt_haplogroup"]
|
|
514
|
+
)
|
|
515
|
+
return True
|
|
516
|
+
|
|
517
|
+
@staticmethod
|
|
518
|
+
def _check_half_sibling_haplogroups(
|
|
519
|
+
pedigree: Pedigree, half_sibling1: str, half_sibling2: str, shared_relative_sex: str | None
|
|
520
|
+
) -> bool:
|
|
521
|
+
"""
|
|
522
|
+
Checks if the haplogroups of two half-siblings are compatible.
|
|
523
|
+
"""
|
|
524
|
+
if (
|
|
525
|
+
shared_relative_sex == "M"
|
|
526
|
+
and pedigree.get_data(half_sibling1)["sex"] == "M"
|
|
527
|
+
and pedigree.get_data(half_sibling2)["sex"] == "M"
|
|
528
|
+
):
|
|
529
|
+
return PedigreeReconstructor._check_haplogroups(
|
|
530
|
+
pedigree.get_data(half_sibling1)["y_haplogroup"], pedigree.get_data(half_sibling2)["y_haplogroup"]
|
|
531
|
+
)
|
|
532
|
+
if shared_relative_sex == "F":
|
|
533
|
+
return PedigreeReconstructor._check_haplogroups(
|
|
534
|
+
pedigree.get_data(half_sibling1)["mt_haplogroup"], pedigree.get_data(half_sibling2)["mt_haplogroup"]
|
|
535
|
+
)
|
|
536
|
+
return True
|
|
537
|
+
|
|
405
538
|
def _add_relation(self, node1: str, node2: str, degree: str, constraints: str, force_constraints: bool) -> None:
|
|
406
539
|
"""
|
|
407
540
|
Connects two nodes in every pedigree.
|
|
@@ -451,7 +584,7 @@ class PedigreeReconstructor:
|
|
|
451
584
|
self._candidate_pedigrees = new_pedigrees
|
|
452
585
|
|
|
453
586
|
@staticmethod
|
|
454
|
-
def _connect_first_degree_relation(pedigree: Pedigree, node1: str, node2: str, constraints: str) ->
|
|
587
|
+
def _connect_first_degree_relation(pedigree: Pedigree, node1: str, node2: str, constraints: str) -> list[Pedigree]:
|
|
455
588
|
"""
|
|
456
589
|
Update pedigree with a first-degree relation.
|
|
457
590
|
"""
|
|
@@ -470,7 +603,7 @@ class PedigreeReconstructor:
|
|
|
470
603
|
return new_pedigrees
|
|
471
604
|
|
|
472
605
|
@staticmethod
|
|
473
|
-
def _connect_second_degree_relation(pedigree: Pedigree, node1: str, node2: str, constraints: str) ->
|
|
606
|
+
def _connect_second_degree_relation(pedigree: Pedigree, node1: str, node2: str, constraints: str) -> list[Pedigree]:
|
|
474
607
|
"""
|
|
475
608
|
Update pedigree with a second-degree relation.
|
|
476
609
|
"""
|
|
@@ -536,16 +669,20 @@ class PedigreeReconstructor:
|
|
|
536
669
|
"""
|
|
537
670
|
assert node1 in pedigree.node_to_data and node2 in pedigree.node_to_data
|
|
538
671
|
|
|
672
|
+
# Pre-check invalid relations to avoid unnecessary deep-copying
|
|
673
|
+
if not PedigreeReconstructor._check_parent_child_haplogroups(pedigree, node1, node2):
|
|
674
|
+
return []
|
|
675
|
+
|
|
539
676
|
ret: list[Pedigree] = []
|
|
540
677
|
new_pedigree = copy.deepcopy(pedigree)
|
|
541
678
|
new_pedigree.fill_node_parents(node2)
|
|
542
679
|
original_parent: str
|
|
543
|
-
if new_pedigree.
|
|
544
|
-
original_parent = new_pedigree.
|
|
680
|
+
if new_pedigree.get_data(node1)["sex"] == "M":
|
|
681
|
+
original_parent = new_pedigree.get_father(node2)
|
|
545
682
|
else:
|
|
546
|
-
original_parent = new_pedigree.
|
|
683
|
+
original_parent = new_pedigree.get_mother(node2)
|
|
547
684
|
|
|
548
|
-
if
|
|
685
|
+
if new_pedigree.check_valid_merge(node1, original_parent):
|
|
549
686
|
new_pedigree.merge_nodes(node1, original_parent)
|
|
550
687
|
ret.append(new_pedigree)
|
|
551
688
|
return ret
|
|
@@ -558,18 +695,22 @@ class PedigreeReconstructor:
|
|
|
558
695
|
"""
|
|
559
696
|
assert node1 in pedigree.node_to_data and node2 in pedigree.node_to_data
|
|
560
697
|
|
|
698
|
+
# Pre-check invalid relations to avoid unnecessary deep-copying
|
|
699
|
+
if not PedigreeReconstructor._check_sibling_haplogroups(pedigree, node1, node2):
|
|
700
|
+
return []
|
|
701
|
+
|
|
561
702
|
ret: list[Pedigree] = []
|
|
562
703
|
new_pedigree = copy.deepcopy(pedigree)
|
|
563
704
|
new_pedigree.fill_node_parents(node1)
|
|
564
705
|
new_pedigree.fill_node_parents(node2)
|
|
565
706
|
|
|
566
|
-
father1 = new_pedigree.
|
|
567
|
-
father2 = new_pedigree.
|
|
568
|
-
if
|
|
707
|
+
father1 = new_pedigree.get_father(node1)
|
|
708
|
+
father2 = new_pedigree.get_father(node2)
|
|
709
|
+
if new_pedigree.check_valid_merge(father1, father2):
|
|
569
710
|
new_pedigree.merge_nodes(father1, father2)
|
|
570
|
-
mother1 = new_pedigree.
|
|
571
|
-
mother2 = new_pedigree.
|
|
572
|
-
if
|
|
711
|
+
mother1 = new_pedigree.get_mother(node1)
|
|
712
|
+
mother2 = new_pedigree.get_mother(node2)
|
|
713
|
+
if new_pedigree.check_valid_merge(mother1, mother2):
|
|
573
714
|
new_pedigree.merge_nodes(mother1, mother2)
|
|
574
715
|
new_pedigree.add_sibling_relation(node1, node2)
|
|
575
716
|
ret.append(new_pedigree)
|
|
@@ -586,17 +727,23 @@ class PedigreeReconstructor:
|
|
|
586
727
|
assert node1 in pedigree.node_to_data and node2 in pedigree.node_to_data
|
|
587
728
|
assert shared_relative_sex in ["M", "F", None]
|
|
588
729
|
|
|
730
|
+
# Pre-check invalid relations to avoid unnecessary deep-copying
|
|
731
|
+
if not PedigreeReconstructor._check_aunt_uncle_nephew_niece_haplogroups(
|
|
732
|
+
pedigree, node1, node2, shared_relative_sex
|
|
733
|
+
):
|
|
734
|
+
return []
|
|
735
|
+
|
|
589
736
|
ret: list[Pedigree] = []
|
|
590
737
|
new_pedigree = copy.deepcopy(pedigree)
|
|
591
738
|
new_pedigree.fill_node_parents(node2)
|
|
592
739
|
|
|
593
740
|
node2_parents: list[str]
|
|
594
741
|
if shared_relative_sex == "M":
|
|
595
|
-
node2_parents = [new_pedigree.
|
|
742
|
+
node2_parents = [new_pedigree.get_father(node2)]
|
|
596
743
|
elif shared_relative_sex == "F":
|
|
597
|
-
node2_parents = [new_pedigree.
|
|
744
|
+
node2_parents = [new_pedigree.get_mother(node2)]
|
|
598
745
|
else:
|
|
599
|
-
node2_parents = [new_pedigree.
|
|
746
|
+
node2_parents = [new_pedigree.get_father(node2), new_pedigree.get_mother(node2)]
|
|
600
747
|
|
|
601
748
|
for node2_parent in node2_parents:
|
|
602
749
|
if node1 != node2_parent:
|
|
@@ -614,17 +761,23 @@ class PedigreeReconstructor:
|
|
|
614
761
|
assert node1 in pedigree.node_to_data and node2 in pedigree.node_to_data
|
|
615
762
|
assert shared_relative_sex in ["M", "F", None]
|
|
616
763
|
|
|
764
|
+
# Pre-check invalid relations to avoid unnecessary deep-copying
|
|
765
|
+
if not PedigreeReconstructor._check_grandparent_grandchild_haplogroups(
|
|
766
|
+
pedigree, node1, node2, shared_relative_sex
|
|
767
|
+
):
|
|
768
|
+
return []
|
|
769
|
+
|
|
617
770
|
ret: list[Pedigree] = []
|
|
618
771
|
new_pedigree = copy.deepcopy(pedigree)
|
|
619
772
|
new_pedigree.fill_node_parents(node2)
|
|
620
773
|
|
|
621
774
|
node2_parents: list[str]
|
|
622
775
|
if shared_relative_sex == "M":
|
|
623
|
-
node2_parents = [new_pedigree.
|
|
776
|
+
node2_parents = [new_pedigree.get_father(node2)]
|
|
624
777
|
elif shared_relative_sex == "F":
|
|
625
|
-
node2_parents = [new_pedigree.
|
|
778
|
+
node2_parents = [new_pedigree.get_mother(node2)]
|
|
626
779
|
else:
|
|
627
|
-
node2_parents = [new_pedigree.
|
|
780
|
+
node2_parents = [new_pedigree.get_father(node2), new_pedigree.get_mother(node2)]
|
|
628
781
|
|
|
629
782
|
for node2_parent in node2_parents:
|
|
630
783
|
if node1 != node2_parent:
|
|
@@ -641,6 +794,10 @@ class PedigreeReconstructor:
|
|
|
641
794
|
"""
|
|
642
795
|
assert node1 in pedigree.node_to_data and node2 in pedigree.node_to_data
|
|
643
796
|
|
|
797
|
+
# Pre-check invalid relations to avoid unnecessary deep-copying
|
|
798
|
+
if not PedigreeReconstructor._check_half_sibling_haplogroups(pedigree, node1, node2, shared_relative_sex):
|
|
799
|
+
return []
|
|
800
|
+
|
|
644
801
|
ret: list[Pedigree] = []
|
|
645
802
|
new_pedigree = copy.deepcopy(pedigree)
|
|
646
803
|
new_pedigree.fill_node_parents(node1)
|
|
@@ -649,14 +806,14 @@ class PedigreeReconstructor:
|
|
|
649
806
|
node1_parents: list[str]
|
|
650
807
|
node2_parents: list[str]
|
|
651
808
|
if shared_relative_sex == "M":
|
|
652
|
-
node1_parents = [new_pedigree.
|
|
653
|
-
node2_parents = [new_pedigree.
|
|
809
|
+
node1_parents = [new_pedigree.get_father(node1)]
|
|
810
|
+
node2_parents = [new_pedigree.get_father(node2)]
|
|
654
811
|
elif shared_relative_sex == "F":
|
|
655
|
-
node1_parents = [new_pedigree.
|
|
656
|
-
node2_parents = [new_pedigree.
|
|
812
|
+
node1_parents = [new_pedigree.get_mother(node1)]
|
|
813
|
+
node2_parents = [new_pedigree.get_mother(node2)]
|
|
657
814
|
else:
|
|
658
|
-
node1_parents = [new_pedigree.
|
|
659
|
-
node2_parents = [new_pedigree.
|
|
815
|
+
node1_parents = [new_pedigree.get_father(node1), new_pedigree.get_mother(node1)]
|
|
816
|
+
node2_parents = [new_pedigree.get_father(node2), new_pedigree.get_mother(node2)]
|
|
660
817
|
|
|
661
818
|
# Node 1 and Node 2 are half-siblings via one of Node 1's parents
|
|
662
819
|
for node1_parent in node1_parents:
|
|
@@ -668,12 +825,22 @@ class PedigreeReconstructor:
|
|
|
668
825
|
ret.extend(PedigreeReconstructor._connect_parent_relation(new_pedigree, node2_parent, node1))
|
|
669
826
|
return ret
|
|
670
827
|
|
|
671
|
-
def
|
|
828
|
+
def _clean_pedigree_data(self) -> None:
|
|
672
829
|
"""
|
|
673
830
|
Remove unnecessary entries in Pedigree dicts.
|
|
674
831
|
"""
|
|
675
832
|
for pedigree in self._candidate_pedigrees:
|
|
676
|
-
pedigree.
|
|
833
|
+
pedigree.clean_data()
|
|
834
|
+
|
|
835
|
+
for pedigree in self._final_pedigrees:
|
|
836
|
+
pedigree.clean_data()
|
|
837
|
+
|
|
838
|
+
def _validate_pedigree_structures(self) -> None:
|
|
839
|
+
"""
|
|
840
|
+
Validate that all candidate pedigrees are consistent.
|
|
841
|
+
"""
|
|
842
|
+
for pedigree in self._candidate_pedigrees:
|
|
843
|
+
assert pedigree.validate_structure()
|
|
677
844
|
|
|
678
845
|
def _get_pair_to_constraints(self) -> defaultdict[tuple[str, str], list[tuple[str, ...]]]:
|
|
679
846
|
"""
|
|
@@ -730,12 +897,12 @@ class PedigreeReconstructor:
|
|
|
730
897
|
|
|
731
898
|
strikes = []
|
|
732
899
|
third_degree_strikes = []
|
|
733
|
-
counts = defaultdict(int)
|
|
900
|
+
counts: defaultdict[int, int] = defaultdict(int)
|
|
734
901
|
for pedigree in new_potential_pedigrees:
|
|
735
902
|
num_strikes, _ = pedigree.count_inconsistencies(
|
|
736
903
|
self._pair_to_constraints, pair_to_relations_so_far, check_half_siblings
|
|
737
904
|
)
|
|
738
|
-
num_third_degree_strikes = pedigree.
|
|
905
|
+
num_third_degree_strikes = pedigree.count_third_degree_inconsistencies(self._pair_to_constraints)
|
|
739
906
|
strikes.append(num_strikes)
|
|
740
907
|
third_degree_strikes.append(num_third_degree_strikes)
|
|
741
908
|
counts[num_strikes] += 1
|
|
@@ -754,7 +921,9 @@ class PedigreeReconstructor:
|
|
|
754
921
|
|
|
755
922
|
sorted_pedigrees = [
|
|
756
923
|
pedigree
|
|
757
|
-
for pedigree, _, _ in sorted(
|
|
924
|
+
for pedigree, _, _ in sorted(
|
|
925
|
+
zip(pedigrees, strikes, third_degree_strikes, strict=True), key=lambda x: (x[1], x[2])
|
|
926
|
+
)
|
|
758
927
|
]
|
|
759
928
|
exploitation_max_candidate_pedigrees = int((1 - epsilon) * max_candidate_pedigrees)
|
|
760
929
|
exploration_max_candidate_pedigrees = max_candidate_pedigrees - exploitation_max_candidate_pedigrees
|
|
@@ -778,19 +947,21 @@ class PedigreeReconstructor:
|
|
|
778
947
|
# Final iteration
|
|
779
948
|
best_pedigrees = [
|
|
780
949
|
pedigree
|
|
781
|
-
for pedigree, num_strikes in zip(new_potential_pedigrees, strikes)
|
|
950
|
+
for pedigree, num_strikes in zip(new_potential_pedigrees, strikes, strict=True)
|
|
782
951
|
if num_strikes == min(strikes)
|
|
783
952
|
]
|
|
784
953
|
# Use 3rd-degree strikes as tiebreaker
|
|
785
954
|
third_degree_strikes = [
|
|
786
|
-
pedigree.
|
|
955
|
+
pedigree.count_third_degree_inconsistencies(self._pair_to_constraints) for pedigree in best_pedigrees
|
|
787
956
|
]
|
|
788
957
|
|
|
789
|
-
self._final_pedigrees
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
958
|
+
self._final_pedigrees.extend(
|
|
959
|
+
[
|
|
960
|
+
pedigree
|
|
961
|
+
for pedigree, num_strikes in zip(best_pedigrees, third_degree_strikes, strict=True)
|
|
962
|
+
if num_strikes == min(third_degree_strikes)
|
|
963
|
+
]
|
|
964
|
+
)
|
|
794
965
|
self._final_strike_counts = []
|
|
795
966
|
self._final_strike_logs = []
|
|
796
967
|
for pedigree in self._final_pedigrees:
|
|
@@ -799,10 +970,9 @@ class PedigreeReconstructor:
|
|
|
799
970
|
)
|
|
800
971
|
self._final_strike_counts.append(strike_count)
|
|
801
972
|
self._final_strike_logs.append(strike_log)
|
|
802
|
-
pedigree.clean_up_relations()
|
|
803
973
|
|
|
804
974
|
def _write_corrected_input_relations(
|
|
805
|
-
self, strike_count: int, strike_log: list[tuple[str, str, str]], path: str
|
|
975
|
+
self, strike_count: int, strike_log: list[tuple[str, str, str, str]], path: str
|
|
806
976
|
) -> None:
|
|
807
977
|
"""
|
|
808
978
|
Write corrected input relations to file. Includes information about added/removed/changed input relations.
|
|
@@ -827,7 +997,7 @@ class PedigreeReconstructor:
|
|
|
827
997
|
|
|
828
998
|
with open(path, "w") as file:
|
|
829
999
|
file.write("id1,id2,degree,constraints\n") # Header line
|
|
830
|
-
file.write(f"# Final
|
|
1000
|
+
file.write(f"# Final inconsistency count: {strike_count}\n")
|
|
831
1001
|
|
|
832
1002
|
def write_relations_line(node1, node2, degree, constraints, commented=False):
|
|
833
1003
|
if constraints == self._DEFAULT_CONSTRAINTS[degree]:
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: repare
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Reconstruct (ancient) pedigrees from pairwise kinship relations.
|
|
5
|
+
Author-email: Edward Huang <edwardhuangc@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Source, https://github.com/ehuangc/repare
|
|
8
|
+
Requires-Python: >=3.10
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Requires-Dist: matplotlib
|
|
12
|
+
Requires-Dist: networkx
|
|
13
|
+
Requires-Dist: pandas
|
|
14
|
+
Requires-Dist: tqdm
|
|
15
|
+
Provides-Extra: benchmark
|
|
16
|
+
Requires-Dist: scikit-learn; extra == "benchmark"
|
|
17
|
+
Requires-Dist: seaborn; extra == "benchmark"
|
|
18
|
+
Provides-Extra: plot
|
|
19
|
+
Requires-Dist: pygraphviz; extra == "plot"
|
|
20
|
+
Dynamic: license-file
|
|
21
|
+
|
|
22
|
+
🌲 **repare** is a Python package for (ancient) pedigree reconstruction.
|
|
23
|
+
|
|
24
|
+
## Installation
|
|
25
|
+
|
|
26
|
+
### Recommended
|
|
27
|
+
```
|
|
28
|
+
conda create -n "repare" -c conda-forge python=3.13 pygraphviz matplotlib networkx pandas tqdm
|
|
29
|
+
conda activate repare
|
|
30
|
+
pip install repare
|
|
31
|
+
```
|
|
32
|
+
repare uses PyGraphviz to plot reconstructed pedigrees. Since PyGraphviz relies on Graphviz, which cannot be installed using `pip`, we recommend installing repare and its dependencies in a fresh conda environment, as shown above.
|
|
33
|
+
|
|
34
|
+
If you don't need to plot reconstructed pedigrees, you can install repare directly with `pip install repare`. If you need to plot reconstructed pedigrees and have your own Graphviz installation, you can install repare and PyGraphviz with `pip install repare[plot]`.
|
|
35
|
+
|
|
36
|
+
To install conda, see [this page](https://www.anaconda.com/docs/getting-started/miniconda/install). To install PyGraphviz and Graphviz (yourself), see [this page](https://pygraphviz.github.io/documentation/stable/install.html).
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
## Usage
|
|
40
|
+
|
|
41
|
+
We recommend running repare through its command-line interface.
|
|
42
|
+
```
|
|
43
|
+
repare -n NODES -r RELATIONS [-o OUTPUT] [-m MAX_CANDIDATE_PEDIGREES] [-e EPSILON] [-s SEED] [-d] [-w] [-v]
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
> [!NOTE]
|
|
47
|
+
> Minimal command:
|
|
48
|
+
> ```
|
|
49
|
+
> repare -n nodes.csv -r relations.csv
|
|
50
|
+
> ```
|
|
51
|
+
> For example data inputs, see [examples/nodes.csv](examples/nodes.csv) and [examples/relations.csv](examples/relations.csv).
|
|
52
|
+
|
|
53
|
+
### Inputs
|
|
54
|
+
**Nodes** (-n) (*required*): Path to a CSV file that contains information about the individuals to be analyzed by repare.
|
|
55
|
+
|
|
56
|
+
<dl>
|
|
57
|
+
<dd>
|
|
58
|
+
<details open>
|
|
59
|
+
<summary><ins>Nodes CSV file columns</ins></summary>
|
|
60
|
+
|
|
61
|
+
- **id** *(required)*: ID of individual. Cannot be fully numeric, as numeric IDs are reserved for placeholder nodes.
|
|
62
|
+
- **sex** *(required)*: Genetic sex of individual. Value must be "M" or "F".
|
|
63
|
+
- **y_haplogroup** *(required)*: Y chromosome haplogroup of individual. Can include "*" as a wildcard expansion character at the end if haplogroup is not fully inferred.
|
|
64
|
+
- **mt_haplogroup** *(required)*: Mitochondrial haplogroup of individual. Can include "*" as a wildcard expansion character at the end if haplogroup is not fully inferred.
|
|
65
|
+
- **can_have_children** *(optional)*: Whether the individual *can* have offspring (e.g., as indicated by age of death). If provided, value must be "True" or "False". Defaults to "True".
|
|
66
|
+
- **can_be_inbred** *(optional)*: Whether the individual *can* have parents related at the 3rd-degree or closer (e.g., as indicated by ROH). If provided, value must be "True" or "False". Defaults to "True".
|
|
67
|
+
- **years_before_present** *(optional)*: (Approximate) date of birth of individual, in years before present. If provided, will be used to prune temporally invalid pedigrees. *This column should only be used when backed by strong dating evidence.*
|
|
68
|
+
</details>
|
|
69
|
+
</dd>
|
|
70
|
+
</dl>
|
|
71
|
+
|
|
72
|
+
**Relations** (-r) (*required*): Path to a CSV file that contains information about inferred pairwise kinship relations. Methods to infer these kinship relations include [KIN](https://doi.org/10.1186/s13059-023-02847-7) and [READv2](https://doi.org/10.1186/s13059-024-03350-3). All individuals included in this file must be specified in the nodes CSV.
|
|
73
|
+
|
|
74
|
+
<dl>
|
|
75
|
+
<dd>
|
|
76
|
+
<details open>
|
|
77
|
+
<summary><ins>Relations CSV file columns</ins></summary>
|
|
78
|
+
|
|
79
|
+
- **id1** *(required)*: ID of individual 1.
|
|
80
|
+
- **id2** *(required)*: ID of individual 2.
|
|
81
|
+
- **degree** *(required)*: Degree of (inferred) kinship relation between individual 1 and individual 2. Value must be "1", "2", or "3". Higher-degree relatives are considered unrelated.
|
|
82
|
+
- **constraints** *(optional)*: Semicolon-delimited list of possible configurations of the kinship relation. For example, a parental 1st-degree relation can be constrained with "parent-child;child-parent". Many kinship inference methods will classify 1st-degree relation types, which can be used as relation constraints. Valid constraints: "parent-child", "child-parent", "siblings", "maternal aunt/uncle-nephew/niece", "maternal nephew/niece-aunt/uncle", "paternal aunt/uncle-nephew/niece", "paternal nephew/niece-aunt/uncle", "maternal grandparent-grandchild", "maternal grandchild-grandparent", "paternal grandparent-grandchild", "paternal grandchild-grandparent", "maternal half-siblings", "paternal half-siblings".
|
|
83
|
+
- **force_constraints** *(optional)*: Whether the corresponding constraint should be forced. If provided, value must be "True" or "False". If "True", the constraint must be followed. If "False", breaking the constraint counts as one inconsistency. Defaults to "False".
|
|
84
|
+
</details>
|
|
85
|
+
</dd>
|
|
86
|
+
</dl>
|
|
87
|
+
|
|
88
|
+
**Output** (-o) (*optional*): Path to directory for saving repare outputs. Defaults to the current working directory.
|
|
89
|
+
|
|
90
|
+
**Max Candidate Pedigrees** (-m) (*optional*): Maximum number of candidate pedigrees to keep after each algorithm iteration. Defaults to 1000.
|
|
91
|
+
|
|
92
|
+
**Epsilon** (-e) (*optional*): Parameter for adapted epsilon-greedy sampling at the end of each algorithm iteration. Defaults to 0.2.
|
|
93
|
+
|
|
94
|
+
**Seed** (-s) (*optional*): Random seed for reproducibility. Defaults to 42.
|
|
95
|
+
|
|
96
|
+
**Do Not Plot** (-d) (*flag*): If set, do not plot reconstructed pedigree(s).
|
|
97
|
+
|
|
98
|
+
**Write Alternate Pedigrees** (-w) (*flag*): If set, write outputs for alternate reconstructed pedigrees to disk.
|
|
99
|
+
|
|
100
|
+
**Verbose** (-v) (*flag*): If set, enable verbose output (INFO-level logging).
|
|
101
|
+
|
|
102
|
+
<p align="center">
|
|
103
|
+
<img src="https://raw.githubusercontent.com/ehuangc/repare/main/examples/algorithm_diagram.svg" alt="Reconstruction Process Diagram" />
|
|
104
|
+
<br>
|
|
105
|
+
<em>Diagram of repare's pedigree reconstruction process</em>
|
|
106
|
+
</p>
|
|
107
|
+
|
|
108
|
+
## Reproducibility
|
|
109
|
+
We recommend using [pixi](https://pixi.sh/) to reproduce the results in this repo.
|
|
110
|
+
```
|
|
111
|
+
git clone https://github.com/ehuangc/repare.git
|
|
112
|
+
cd repare
|
|
113
|
+
pixi shell
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Once in the pixi shell, you can run the script(s) corresponding to the results you'd like to reproduce. For example:
|
|
117
|
+
```
|
|
118
|
+
python benchmarks/published/run_parameter_experiment.py
|
|
119
|
+
exit
|
|
120
|
+
```
|
|
121
|
+
To install pixi, see [this page](https://pixi.sh/latest/installation/).
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
repare/__init__.py,sha256=esYDcCYXwJUJVVpWQFFpocPwCZzL5xo_Hxihbknfunc,138
|
|
2
|
+
repare/main.py,sha256=Jmdzzc2_XxBzQ5vU5l0Q0Sh_M714-EWpYCBPOqnng_k,2628
|
|
3
|
+
repare/pedigree.py,sha256=0YXNz2qeML63pgOyX6izaf0zcYIm61N5GU29816P1kg,70547
|
|
4
|
+
repare/pedigree_reconstructor.py,sha256=zbkc9uyd5uqKM9HCZVg6rvvEEo28i65PN-RybrQxILE,49359
|
|
5
|
+
repare-0.1.1.dist-info/licenses/LICENSE,sha256=uqhB_C7lgd3rOQU5SLtWeu_tVc_L0zGGdN488GCrtmY,1063
|
|
6
|
+
repare-0.1.1.dist-info/METADATA,sha256=Rwk4p3XZDOCPYsnbHMuinKM1juRbSf1iB-C1ClFwYZg,6608
|
|
7
|
+
repare-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
8
|
+
repare-0.1.1.dist-info/entry_points.txt,sha256=tWRppCTqmNN8n4hJ_ShCgO8dJFU4PKTQsexMZS-PFHw,44
|
|
9
|
+
repare-0.1.1.dist-info/top_level.txt,sha256=MBgnP6OarsEmlqLXjKcPqKFIMIdpwADg5vt6eMPVA0M,7
|
|
10
|
+
repare-0.1.1.dist-info/RECORD,,
|
repare-0.0.2.dist-info/METADATA
DELETED
|
@@ -1,35 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: repare
|
|
3
|
-
Version: 0.0.2
|
|
4
|
-
Summary: Reconstruct ancient pedigrees.
|
|
5
|
-
Author-email: Edward Huang <edwardhuang02@gmail.com>
|
|
6
|
-
License-Expression: MIT
|
|
7
|
-
Requires-Python: >=3.10
|
|
8
|
-
Description-Content-Type: text/markdown
|
|
9
|
-
License-File: LICENSE
|
|
10
|
-
Requires-Dist: matplotlib
|
|
11
|
-
Requires-Dist: networkx
|
|
12
|
-
Requires-Dist: pandas
|
|
13
|
-
Requires-Dist: tqdm
|
|
14
|
-
Provides-Extra: benchmark
|
|
15
|
-
Requires-Dist: scikit-learn; extra == "benchmark"
|
|
16
|
-
Requires-Dist: seaborn; extra == "benchmark"
|
|
17
|
-
Provides-Extra: plot
|
|
18
|
-
Requires-Dist: pygraphviz; extra == "plot"
|
|
19
|
-
Dynamic: license-file
|
|
20
|
-
|
|
21
|
-
**repare** is a Python package for (ancient) pedigree reconstruction.
|
|
22
|
-
|
|
23
|
-
## Installation
|
|
24
|
-
|
|
25
|
-
### Recommended
|
|
26
|
-
```
|
|
27
|
-
conda create -n "repare" -c conda-forge python=3.13 pygraphviz
|
|
28
|
-
conda activate repare
|
|
29
|
-
pip install repare
|
|
30
|
-
```
|
|
31
|
-
repare uses PyGraphviz to plot reconstructed pedigrees. Since PyGraphviz relies on Graphviz which cannot be installed using `pip`, we recommend installing repare and its dependencies in a fresh conda environment.
|
|
32
|
-
|
|
33
|
-
If you don't need to plot reconstructed pedigrees, you can install repare directly with `pip install repare`. If you need to plot reconstructed pedigrees and have your own Graphviz installation, you can install repare and Pygraphviz with `pip install repare[plot]`.
|
|
34
|
-
|
|
35
|
-
To install conda, see [this page](https://www.anaconda.com/docs/getting-started/miniconda/install). To install PyGraphviz and Graphviz (yourself), see [this page](https://pygraphviz.github.io/documentation/stable/install.html).
|
repare-0.0.2.dist-info/RECORD
DELETED
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
repare/__init__.py,sha256=esYDcCYXwJUJVVpWQFFpocPwCZzL5xo_Hxihbknfunc,138
|
|
2
|
-
repare/main.py,sha256=N33DO2NggJIuJ1-LrZZNiyd--niou7DJqiq30MFT-OY,2387
|
|
3
|
-
repare/pedigree.py,sha256=zPWmyzE1SheAnRxz_0ZWFMvm0fCcV1ng0fHED3aqtdU,60347
|
|
4
|
-
repare/pedigree_reconstructor.py,sha256=6SGdHyyI8uLsA1xmRg4L9Jb-h79R1JLQFRHUiY8R3DQ,42047
|
|
5
|
-
repare-0.0.2.dist-info/licenses/LICENSE,sha256=uqhB_C7lgd3rOQU5SLtWeu_tVc_L0zGGdN488GCrtmY,1063
|
|
6
|
-
repare-0.0.2.dist-info/METADATA,sha256=fDGofIvoNkr1tweFsI1NkzqxNZsBRBvYFVHVkX40yM8,1478
|
|
7
|
-
repare-0.0.2.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
|
8
|
-
repare-0.0.2.dist-info/entry_points.txt,sha256=tWRppCTqmNN8n4hJ_ShCgO8dJFU4PKTQsexMZS-PFHw,44
|
|
9
|
-
repare-0.0.2.dist-info/top_level.txt,sha256=MBgnP6OarsEmlqLXjKcPqKFIMIdpwADg5vt6eMPVA0M,7
|
|
10
|
-
repare-0.0.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|