repare 0.0.2__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of repare might be problematic. Click here for more details.

@@ -47,12 +47,13 @@ class PedigreeReconstructor:
47
47
  self._write_alternate_pedigrees = write_alternate_pedigrees
48
48
  self._random_seed = random_seed
49
49
  random.seed(self._random_seed)
50
+ self._validate_arguments()
50
51
 
51
52
  # Maximum number of times to run the algorithm if no valid pedigree is found
52
53
  self._MAX_RUNS = 10
53
54
  self._candidate_pedigrees: list[Pedigree] = [self._get_initial_pedigree()]
54
55
  self._pair_to_constraints: defaultdict[tuple[str, str], list[tuple[str, ...]]] = self._get_pair_to_constraints()
55
- self._final_pedigree: Pedigree | None = None
56
+ self._final_pedigrees: list[Pedigree] = []
56
57
 
57
58
  def _validate_node_data(self, nodes_path: str) -> None:
58
59
  """
@@ -71,6 +72,12 @@ class PedigreeReconstructor:
71
72
  if self._node_data["id"].str.isnumeric().any():
72
73
  raise ValueError("Sample IDs cannot be completely numeric.")
73
74
 
75
+ if self._node_data["id"].duplicated().any():
76
+ raise ValueError("Sample IDs must be unique.")
77
+
78
+ if self._node_data["id"].str.strip().eq("").any():
79
+ raise ValueError("Sample IDs cannot be empty.")
80
+
74
81
  if not self._node_data["sex"].isin(["M", "F"]).all():
75
82
  raise ValueError('Node sex must be "M" or "F".')
76
83
 
@@ -275,6 +282,15 @@ class PedigreeReconstructor:
275
282
  [self._first_degree_relations, self._second_degree_relations, self._third_degree_relations]
276
283
  ).reset_index(drop=True)
277
284
 
285
+ def _validate_arguments(self) -> None:
286
+ """
287
+ Validate constructor arguments.
288
+ """
289
+ if not isinstance(self._max_candidate_pedigrees, int) or self._max_candidate_pedigrees <= 0:
290
+ raise ValueError("max_candidate_pedigrees must be a positive integer.")
291
+ if not (0 <= self._epsilon <= 1):
292
+ raise ValueError("epsilon must be between 0 and 1.")
293
+
278
294
  def _shuffle_relations(self) -> None:
279
295
  """
280
296
  Shuffle relation DataFrames (when we want to restart the algorithm).
@@ -333,11 +349,14 @@ class PedigreeReconstructor:
333
349
  self._add_relation(
334
350
  node1, node2, degree=degree, constraints=constraints, force_constraints=force_constraints
335
351
  )
336
- self._clean_relation_dicts()
352
+ self._clean_pedigree_data()
353
+ self._validate_pedigree_structures()
337
354
 
338
355
  processed_relations = self._all_relations.iloc[: idx + 1]
339
356
  pair_to_relations_so_far = self._get_pair_to_relations_so_far(processed_relations)
340
357
  if degree == "1" and len(processed_relations) < len(self._first_and_second_degree_relations):
358
+ # Don't check for extraneous half-sibling relations because
359
+ # the 2 non-shared parents might be "merged" later
341
360
  self._prune_pedigrees(pair_to_relations_so_far, check_half_siblings=False)
342
361
  else:
343
362
  self._prune_pedigrees(pair_to_relations_so_far, check_half_siblings=True)
@@ -358,6 +377,7 @@ class PedigreeReconstructor:
358
377
  logger.error(f"No valid pedigree found after {self._MAX_RUNS} runs. Exiting.")
359
378
  raise RuntimeError(f"No valid pedigree found after {self._MAX_RUNS} runs.")
360
379
 
380
+ self._clean_pedigree_data()
361
381
  # Plot and write outputs of sample pedigree
362
382
  sample_idx = random.randint(0, len(self._final_pedigrees) - 1)
363
383
  self._sample_pedigree = self._final_pedigrees[sample_idx]
@@ -373,7 +393,7 @@ class PedigreeReconstructor:
373
393
  )
374
394
  if self._plot:
375
395
  try:
376
- self._sample_pedigree.plot(os.path.join(self._outputs_dir, "reconstructed_pedigree.png"))
396
+ self._sample_pedigree.plot(os.path.join(self._outputs_dir, "reconstructed_pedigree.pdf"))
377
397
  pygraphviz_found = True
378
398
  except ImportError:
379
399
  logger.warning(
@@ -386,7 +406,7 @@ class PedigreeReconstructor:
386
406
  if self._write_alternate_pedigrees:
387
407
  os.makedirs(os.path.join(self._outputs_dir, "alternate_pedigrees"), exist_ok=True)
388
408
  for idx, (pedigree, strike_count, strike_log) in enumerate(
389
- zip(self._final_pedigrees, self._final_strike_counts, self._final_strike_logs)
409
+ zip(self._final_pedigrees, self._final_strike_counts, self._final_strike_logs, strict=True)
390
410
  ):
391
411
  self._write_corrected_input_relations(
392
412
  strike_count,
@@ -402,6 +422,119 @@ class PedigreeReconstructor:
402
422
  pedigree.plot(os.path.join(self._outputs_dir, "alternate_pedigrees", f"pedigree_{idx}.png"))
403
423
  return self._sample_pedigree
404
424
 
425
+ @staticmethod
426
+ def _check_haplogroups(haplogroup1: str, haplogroup2: str) -> bool:
427
+ """
428
+ Checks if two haplogroups are compatible. Same semantics as pedigree.validate_haplogroups().
429
+ "*" is wild card character.
430
+ """
431
+ if not haplogroup1 or not haplogroup2: # empty OK
432
+ return True
433
+ haplogroup1_stripped, haplogroup2_stripped = haplogroup1.rstrip("*"), haplogroup2.rstrip("*")
434
+ return haplogroup1_stripped.startswith(haplogroup2_stripped) or haplogroup2_stripped.startswith(
435
+ haplogroup1_stripped
436
+ )
437
+
438
+ @staticmethod
439
+ def _check_parent_child_haplogroups(pedigree: Pedigree, parent: str, child: str) -> bool:
440
+ """
441
+ Checks if the haplogroups of a parent and child are compatible.
442
+ """
443
+ if pedigree.get_data(parent)["sex"] == "M" and pedigree.get_data(child)["sex"] == "M":
444
+ return PedigreeReconstructor._check_haplogroups(
445
+ pedigree.get_data(parent)["y_haplogroup"], pedigree.get_data(child)["y_haplogroup"]
446
+ )
447
+ if pedigree.get_data(parent)["sex"] == "F":
448
+ return PedigreeReconstructor._check_haplogroups(
449
+ pedigree.get_data(parent)["mt_haplogroup"], pedigree.get_data(child)["mt_haplogroup"]
450
+ )
451
+ return True
452
+
453
+ @staticmethod
454
+ def _check_sibling_haplogroups(pedigree: Pedigree, sibling1: str, sibling2: str) -> bool:
455
+ """
456
+ Checks if the haplogroups of two full siblings are compatible.
457
+ """
458
+ if pedigree.get_data(sibling1)["sex"] == "M" and pedigree.get_data(sibling2)["sex"] == "M":
459
+ # MT haplogroups still need to agree as well
460
+ if not PedigreeReconstructor._check_haplogroups(
461
+ pedigree.get_data(sibling1)["y_haplogroup"], pedigree.get_data(sibling2)["y_haplogroup"]
462
+ ):
463
+ return False
464
+ # All full siblings should share MT haplogroups
465
+ return PedigreeReconstructor._check_haplogroups(
466
+ pedigree.get_data(sibling1)["mt_haplogroup"], pedigree.get_data(sibling2)["mt_haplogroup"]
467
+ )
468
+
469
+ @staticmethod
470
+ def _check_aunt_uncle_nephew_niece_haplogroups(
471
+ pedigree: Pedigree, aunt_uncle: str, nephew_niece: str, shared_relative_sex: str | None
472
+ ) -> bool:
473
+ """
474
+ Checks if the haplogroups of an aunt/uncle and nephew/niece are compatible.
475
+ """
476
+ if not shared_relative_sex:
477
+ return True
478
+
479
+ if (
480
+ shared_relative_sex == "M"
481
+ and pedigree.get_data(aunt_uncle)["sex"] == "M"
482
+ and pedigree.get_data(nephew_niece)["sex"] == "M"
483
+ ):
484
+ return PedigreeReconstructor._check_haplogroups(
485
+ pedigree.get_data(aunt_uncle)["y_haplogroup"], pedigree.get_data(nephew_niece)["y_haplogroup"]
486
+ )
487
+ if shared_relative_sex == "F":
488
+ return PedigreeReconstructor._check_haplogroups(
489
+ pedigree.get_data(aunt_uncle)["mt_haplogroup"], pedigree.get_data(nephew_niece)["mt_haplogroup"]
490
+ )
491
+ return True
492
+
493
+ @staticmethod
494
+ def _check_grandparent_grandchild_haplogroups(
495
+ pedigree: Pedigree, grandparent: str, grandchild: str, shared_relative_sex: str | None
496
+ ) -> bool:
497
+ """
498
+ Checks if the haplogroups of a grandparent and grandchild are compatible.
499
+ """
500
+ if not shared_relative_sex:
501
+ return True
502
+
503
+ if (
504
+ shared_relative_sex == "M"
505
+ and pedigree.get_data(grandparent)["sex"] == "M"
506
+ and pedigree.get_data(grandchild)["sex"] == "M"
507
+ ):
508
+ return PedigreeReconstructor._check_haplogroups(
509
+ pedigree.get_data(grandparent)["y_haplogroup"], pedigree.get_data(grandchild)["y_haplogroup"]
510
+ )
511
+ if shared_relative_sex == "F" and pedigree.get_data(grandparent)["sex"] == "F":
512
+ return PedigreeReconstructor._check_haplogroups(
513
+ pedigree.get_data(grandparent)["mt_haplogroup"], pedigree.get_data(grandchild)["mt_haplogroup"]
514
+ )
515
+ return True
516
+
517
+ @staticmethod
518
+ def _check_half_sibling_haplogroups(
519
+ pedigree: Pedigree, half_sibling1: str, half_sibling2: str, shared_relative_sex: str | None
520
+ ) -> bool:
521
+ """
522
+ Checks if the haplogroups of two half-siblings are compatible.
523
+ """
524
+ if (
525
+ shared_relative_sex == "M"
526
+ and pedigree.get_data(half_sibling1)["sex"] == "M"
527
+ and pedigree.get_data(half_sibling2)["sex"] == "M"
528
+ ):
529
+ return PedigreeReconstructor._check_haplogroups(
530
+ pedigree.get_data(half_sibling1)["y_haplogroup"], pedigree.get_data(half_sibling2)["y_haplogroup"]
531
+ )
532
+ if shared_relative_sex == "F":
533
+ return PedigreeReconstructor._check_haplogroups(
534
+ pedigree.get_data(half_sibling1)["mt_haplogroup"], pedigree.get_data(half_sibling2)["mt_haplogroup"]
535
+ )
536
+ return True
537
+
405
538
  def _add_relation(self, node1: str, node2: str, degree: str, constraints: str, force_constraints: bool) -> None:
406
539
  """
407
540
  Connects two nodes in every pedigree.
@@ -451,7 +584,7 @@ class PedigreeReconstructor:
451
584
  self._candidate_pedigrees = new_pedigrees
452
585
 
453
586
  @staticmethod
454
- def _connect_first_degree_relation(pedigree: Pedigree, node1: str, node2: str, constraints: str) -> None:
587
+ def _connect_first_degree_relation(pedigree: Pedigree, node1: str, node2: str, constraints: str) -> list[Pedigree]:
455
588
  """
456
589
  Update pedigree with a first-degree relation.
457
590
  """
@@ -470,7 +603,7 @@ class PedigreeReconstructor:
470
603
  return new_pedigrees
471
604
 
472
605
  @staticmethod
473
- def _connect_second_degree_relation(pedigree: Pedigree, node1: str, node2: str, constraints: str) -> None:
606
+ def _connect_second_degree_relation(pedigree: Pedigree, node1: str, node2: str, constraints: str) -> list[Pedigree]:
474
607
  """
475
608
  Update pedigree with a second-degree relation.
476
609
  """
@@ -536,16 +669,20 @@ class PedigreeReconstructor:
536
669
  """
537
670
  assert node1 in pedigree.node_to_data and node2 in pedigree.node_to_data
538
671
 
672
+ # Pre-check invalid relations to avoid unnecessary deep-copying
673
+ if not PedigreeReconstructor._check_parent_child_haplogroups(pedigree, node1, node2):
674
+ return []
675
+
539
676
  ret: list[Pedigree] = []
540
677
  new_pedigree = copy.deepcopy(pedigree)
541
678
  new_pedigree.fill_node_parents(node2)
542
679
  original_parent: str
543
- if new_pedigree.node_to_data[node1]["sex"] == "M":
544
- original_parent = new_pedigree.node_to_father[node2]
680
+ if new_pedigree.get_data(node1)["sex"] == "M":
681
+ original_parent = new_pedigree.get_father(node2)
545
682
  else:
546
- original_parent = new_pedigree.node_to_mother[node2]
683
+ original_parent = new_pedigree.get_mother(node2)
547
684
 
548
- if not new_pedigree.check_cycles_if_merged(node1, original_parent):
685
+ if new_pedigree.check_valid_merge(node1, original_parent):
549
686
  new_pedigree.merge_nodes(node1, original_parent)
550
687
  ret.append(new_pedigree)
551
688
  return ret
@@ -558,18 +695,22 @@ class PedigreeReconstructor:
558
695
  """
559
696
  assert node1 in pedigree.node_to_data and node2 in pedigree.node_to_data
560
697
 
698
+ # Pre-check invalid relations to avoid unnecessary deep-copying
699
+ if not PedigreeReconstructor._check_sibling_haplogroups(pedigree, node1, node2):
700
+ return []
701
+
561
702
  ret: list[Pedigree] = []
562
703
  new_pedigree = copy.deepcopy(pedigree)
563
704
  new_pedigree.fill_node_parents(node1)
564
705
  new_pedigree.fill_node_parents(node2)
565
706
 
566
- father1 = new_pedigree.node_to_father[node1]
567
- father2 = new_pedigree.node_to_father[node2]
568
- if not new_pedigree.check_cycles_if_merged(father1, father2):
707
+ father1 = new_pedigree.get_father(node1)
708
+ father2 = new_pedigree.get_father(node2)
709
+ if new_pedigree.check_valid_merge(father1, father2):
569
710
  new_pedigree.merge_nodes(father1, father2)
570
- mother1 = new_pedigree.node_to_mother[node1]
571
- mother2 = new_pedigree.node_to_mother[node2]
572
- if not new_pedigree.check_cycles_if_merged(mother1, mother2):
711
+ mother1 = new_pedigree.get_mother(node1)
712
+ mother2 = new_pedigree.get_mother(node2)
713
+ if new_pedigree.check_valid_merge(mother1, mother2):
573
714
  new_pedigree.merge_nodes(mother1, mother2)
574
715
  new_pedigree.add_sibling_relation(node1, node2)
575
716
  ret.append(new_pedigree)
@@ -586,17 +727,23 @@ class PedigreeReconstructor:
586
727
  assert node1 in pedigree.node_to_data and node2 in pedigree.node_to_data
587
728
  assert shared_relative_sex in ["M", "F", None]
588
729
 
730
+ # Pre-check invalid relations to avoid unnecessary deep-copying
731
+ if not PedigreeReconstructor._check_aunt_uncle_nephew_niece_haplogroups(
732
+ pedigree, node1, node2, shared_relative_sex
733
+ ):
734
+ return []
735
+
589
736
  ret: list[Pedigree] = []
590
737
  new_pedigree = copy.deepcopy(pedigree)
591
738
  new_pedigree.fill_node_parents(node2)
592
739
 
593
740
  node2_parents: list[str]
594
741
  if shared_relative_sex == "M":
595
- node2_parents = [new_pedigree.node_to_father[node2]]
742
+ node2_parents = [new_pedigree.get_father(node2)]
596
743
  elif shared_relative_sex == "F":
597
- node2_parents = [new_pedigree.node_to_mother[node2]]
744
+ node2_parents = [new_pedigree.get_mother(node2)]
598
745
  else:
599
- node2_parents = [new_pedigree.node_to_father[node2], new_pedigree.node_to_mother[node2]]
746
+ node2_parents = [new_pedigree.get_father(node2), new_pedigree.get_mother(node2)]
600
747
 
601
748
  for node2_parent in node2_parents:
602
749
  if node1 != node2_parent:
@@ -614,17 +761,23 @@ class PedigreeReconstructor:
614
761
  assert node1 in pedigree.node_to_data and node2 in pedigree.node_to_data
615
762
  assert shared_relative_sex in ["M", "F", None]
616
763
 
764
+ # Pre-check invalid relations to avoid unnecessary deep-copying
765
+ if not PedigreeReconstructor._check_grandparent_grandchild_haplogroups(
766
+ pedigree, node1, node2, shared_relative_sex
767
+ ):
768
+ return []
769
+
617
770
  ret: list[Pedigree] = []
618
771
  new_pedigree = copy.deepcopy(pedigree)
619
772
  new_pedigree.fill_node_parents(node2)
620
773
 
621
774
  node2_parents: list[str]
622
775
  if shared_relative_sex == "M":
623
- node2_parents = [new_pedigree.node_to_father[node2]]
776
+ node2_parents = [new_pedigree.get_father(node2)]
624
777
  elif shared_relative_sex == "F":
625
- node2_parents = [new_pedigree.node_to_mother[node2]]
778
+ node2_parents = [new_pedigree.get_mother(node2)]
626
779
  else:
627
- node2_parents = [new_pedigree.node_to_father[node2], new_pedigree.node_to_mother[node2]]
780
+ node2_parents = [new_pedigree.get_father(node2), new_pedigree.get_mother(node2)]
628
781
 
629
782
  for node2_parent in node2_parents:
630
783
  if node1 != node2_parent:
@@ -641,6 +794,10 @@ class PedigreeReconstructor:
641
794
  """
642
795
  assert node1 in pedigree.node_to_data and node2 in pedigree.node_to_data
643
796
 
797
+ # Pre-check invalid relations to avoid unnecessary deep-copying
798
+ if not PedigreeReconstructor._check_half_sibling_haplogroups(pedigree, node1, node2, shared_relative_sex):
799
+ return []
800
+
644
801
  ret: list[Pedigree] = []
645
802
  new_pedigree = copy.deepcopy(pedigree)
646
803
  new_pedigree.fill_node_parents(node1)
@@ -649,14 +806,14 @@ class PedigreeReconstructor:
649
806
  node1_parents: list[str]
650
807
  node2_parents: list[str]
651
808
  if shared_relative_sex == "M":
652
- node1_parents = [new_pedigree.node_to_father[node1]]
653
- node2_parents = [new_pedigree.node_to_father[node2]]
809
+ node1_parents = [new_pedigree.get_father(node1)]
810
+ node2_parents = [new_pedigree.get_father(node2)]
654
811
  elif shared_relative_sex == "F":
655
- node1_parents = [new_pedigree.node_to_mother[node1]]
656
- node2_parents = [new_pedigree.node_to_mother[node2]]
812
+ node1_parents = [new_pedigree.get_mother(node1)]
813
+ node2_parents = [new_pedigree.get_mother(node2)]
657
814
  else:
658
- node1_parents = [new_pedigree.node_to_father[node1], new_pedigree.node_to_mother[node1]]
659
- node2_parents = [new_pedigree.node_to_father[node2], new_pedigree.node_to_mother[node2]]
815
+ node1_parents = [new_pedigree.get_father(node1), new_pedigree.get_mother(node1)]
816
+ node2_parents = [new_pedigree.get_father(node2), new_pedigree.get_mother(node2)]
660
817
 
661
818
  # Node 1 and Node 2 are half-siblings via one of Node 1's parents
662
819
  for node1_parent in node1_parents:
@@ -668,12 +825,22 @@ class PedigreeReconstructor:
668
825
  ret.extend(PedigreeReconstructor._connect_parent_relation(new_pedigree, node2_parent, node1))
669
826
  return ret
670
827
 
671
- def _clean_relation_dicts(self) -> None:
828
+ def _clean_pedigree_data(self) -> None:
672
829
  """
673
830
  Remove unnecessary entries in Pedigree dicts.
674
831
  """
675
832
  for pedigree in self._candidate_pedigrees:
676
- pedigree.clean_up_relations()
833
+ pedigree.clean_data()
834
+
835
+ for pedigree in self._final_pedigrees:
836
+ pedigree.clean_data()
837
+
838
+ def _validate_pedigree_structures(self) -> None:
839
+ """
840
+ Validate that all candidate pedigrees are consistent.
841
+ """
842
+ for pedigree in self._candidate_pedigrees:
843
+ assert pedigree.validate_structure()
677
844
 
678
845
  def _get_pair_to_constraints(self) -> defaultdict[tuple[str, str], list[tuple[str, ...]]]:
679
846
  """
@@ -730,12 +897,12 @@ class PedigreeReconstructor:
730
897
 
731
898
  strikes = []
732
899
  third_degree_strikes = []
733
- counts = defaultdict(int)
900
+ counts: defaultdict[int, int] = defaultdict(int)
734
901
  for pedigree in new_potential_pedigrees:
735
902
  num_strikes, _ = pedigree.count_inconsistencies(
736
903
  self._pair_to_constraints, pair_to_relations_so_far, check_half_siblings
737
904
  )
738
- num_third_degree_strikes = pedigree.count_third_degree_inconcistencies(self._pair_to_constraints)
905
+ num_third_degree_strikes = pedigree.count_third_degree_inconsistencies(self._pair_to_constraints)
739
906
  strikes.append(num_strikes)
740
907
  third_degree_strikes.append(num_third_degree_strikes)
741
908
  counts[num_strikes] += 1
@@ -754,7 +921,9 @@ class PedigreeReconstructor:
754
921
 
755
922
  sorted_pedigrees = [
756
923
  pedigree
757
- for pedigree, _, _ in sorted(zip(pedigrees, strikes, third_degree_strikes), key=lambda x: (x[1], x[2]))
924
+ for pedigree, _, _ in sorted(
925
+ zip(pedigrees, strikes, third_degree_strikes, strict=True), key=lambda x: (x[1], x[2])
926
+ )
758
927
  ]
759
928
  exploitation_max_candidate_pedigrees = int((1 - epsilon) * max_candidate_pedigrees)
760
929
  exploration_max_candidate_pedigrees = max_candidate_pedigrees - exploitation_max_candidate_pedigrees
@@ -778,19 +947,21 @@ class PedigreeReconstructor:
778
947
  # Final iteration
779
948
  best_pedigrees = [
780
949
  pedigree
781
- for pedigree, num_strikes in zip(new_potential_pedigrees, strikes)
950
+ for pedigree, num_strikes in zip(new_potential_pedigrees, strikes, strict=True)
782
951
  if num_strikes == min(strikes)
783
952
  ]
784
953
  # Use 3rd-degree strikes as tiebreaker
785
954
  third_degree_strikes = [
786
- pedigree.count_third_degree_inconcistencies(self._pair_to_constraints) for pedigree in best_pedigrees
955
+ pedigree.count_third_degree_inconsistencies(self._pair_to_constraints) for pedigree in best_pedigrees
787
956
  ]
788
957
 
789
- self._final_pedigrees = [
790
- pedigree
791
- for pedigree, num_strikes in zip(best_pedigrees, third_degree_strikes)
792
- if num_strikes == min(third_degree_strikes)
793
- ]
958
+ self._final_pedigrees.extend(
959
+ [
960
+ pedigree
961
+ for pedigree, num_strikes in zip(best_pedigrees, third_degree_strikes, strict=True)
962
+ if num_strikes == min(third_degree_strikes)
963
+ ]
964
+ )
794
965
  self._final_strike_counts = []
795
966
  self._final_strike_logs = []
796
967
  for pedigree in self._final_pedigrees:
@@ -799,10 +970,9 @@ class PedigreeReconstructor:
799
970
  )
800
971
  self._final_strike_counts.append(strike_count)
801
972
  self._final_strike_logs.append(strike_log)
802
- pedigree.clean_up_relations()
803
973
 
804
974
  def _write_corrected_input_relations(
805
- self, strike_count: int, strike_log: list[tuple[str, str, str]], path: str
975
+ self, strike_count: int, strike_log: list[tuple[str, str, str, str]], path: str
806
976
  ) -> None:
807
977
  """
808
978
  Write corrected input relations to file. Includes information about added/removed/changed input relations.
@@ -827,7 +997,7 @@ class PedigreeReconstructor:
827
997
 
828
998
  with open(path, "w") as file:
829
999
  file.write("id1,id2,degree,constraints\n") # Header line
830
- file.write(f"# Final strike count: {strike_count}\n")
1000
+ file.write(f"# Final inconsistency count: {strike_count}\n")
831
1001
 
832
1002
  def write_relations_line(node1, node2, degree, constraints, commented=False):
833
1003
  if constraints == self._DEFAULT_CONSTRAINTS[degree]:
@@ -0,0 +1,120 @@
1
+ Metadata-Version: 2.4
2
+ Name: repare
3
+ Version: 0.1.0
4
+ Summary: Reconstruct (ancient) pedigrees from pairwise kinship relations.
5
+ Author-email: Edward Huang <edwardhuangc@gmail.com>
6
+ License-Expression: MIT
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: matplotlib
11
+ Requires-Dist: networkx
12
+ Requires-Dist: pandas
13
+ Requires-Dist: tqdm
14
+ Provides-Extra: benchmark
15
+ Requires-Dist: scikit-learn; extra == "benchmark"
16
+ Requires-Dist: seaborn; extra == "benchmark"
17
+ Provides-Extra: plot
18
+ Requires-Dist: pygraphviz; extra == "plot"
19
+ Dynamic: license-file
20
+
21
+ :evergreen_tree: **repare** is a Python package for (ancient) pedigree reconstruction.
22
+
23
+ ## Installation
24
+
25
+ ### Recommended
26
+ ```
27
+ conda create -n "repare" -c conda-forge python=3.13 pygraphviz matplotlib networkx pandas tqdm
28
+ conda activate repare
29
+ pip install repare
30
+ ```
31
+ repare uses PyGraphviz to plot reconstructed pedigrees. Since PyGraphviz relies on Graphviz, which cannot be installed using `pip`, we recommend installing repare and its dependencies in a fresh conda environment, as shown above.
32
+
33
+ If you don't need to plot reconstructed pedigrees, you can install repare directly with `pip install repare`. If you need to plot reconstructed pedigrees and have your own Graphviz installation, you can install repare and PyGraphviz with `pip install repare[plot]`.
34
+
35
+ To install conda, see [this page](https://www.anaconda.com/docs/getting-started/miniconda/install). To install PyGraphviz and Graphviz (yourself), see [this page](https://pygraphviz.github.io/documentation/stable/install.html).
36
+
37
+
38
+ ## Usage
39
+
40
+ We recommend running repare through its command-line interface.
41
+ ```
42
+ repare -n NODES -r RELATIONS [-o OUTPUT] [-m MAX_CANDIDATE_PEDIGREES] [-e EPSILON] [-s SEED] [-d] [-w] [-v]
43
+ ```
44
+
45
+ > [!NOTE]
46
+ > Minimal command:
47
+ > ```
48
+ > repare -n nodes.csv -r relations.csv
49
+ > ```
50
+ > For example data inputs, see [examples/nodes.csv](examples/nodes.csv) and [examples/relations.csv](examples/relations.csv).
51
+
52
+ ### Inputs
53
+ **Nodes** (-n) (*required*): Path to a CSV file that contains information about the individuals to be analyzed by repare.
54
+
55
+ <dl>
56
+ <dd>
57
+ <details open>
58
+ <summary><ins>Nodes CSV file columns</ins></summary>
59
+
60
+ - **id** *(required)*: ID of individual. Cannot be fully numeric, as numeric IDs are reserved for placeholder nodes.
61
+ - **sex** *(required)*: Genetic sex of individual. Value must be "M" or "F".
62
+ - **y_haplogroup** *(required)*: Y chromosome haplogroup of individual. Can include "*" as a wildcard expansion character at the end if haplogroup is not fully inferred.
63
+ - **mt_haplogroup** *(required)*: Mitochondrial haplogroup of individual. Can include "*" as a wildcard expansion character at the end if haplogroup is not fully inferred.
64
+ - **can_have_children** *(optional)*: Whether the individual *can* have offspring (e.g., as indicated by age of death). If provided, value must be "True" or "False". Defaults to "True".
65
+ - **can_be_inbred** *(optional)*: Whether the individual *can* have parents related at the 3rd-degree or closer (e.g., as indicated by ROH). If provided, value must be "True" or "False". Defaults to "True".
66
+ - **years_before_present** *(optional)*: (Approximate) date of birth of individual, in years before present. If provided, will be used to prune temporally invalid pedigrees. *This column should only be used when backed by strong dating evidence.*
67
+ </details>
68
+ </dd>
69
+ </dl>
70
+
71
+ **Relations** (-r) (*required*): Path to a CSV file that contains information about inferred pairwise kinship relations. Methods to infer these kinship relations include [KIN](https://doi.org/10.1186/s13059-023-02847-7) and [READv2](https://doi.org/10.1186/s13059-024-03350-3). All individuals included in this file must be specified in the nodes CSV.
72
+
73
+ <dl>
74
+ <dd>
75
+ <details open>
76
+ <summary><ins>Relations CSV file columns</ins></summary>
77
+
78
+ - **id1** *(required)*: ID of individual 1.
79
+ - **id2** *(required)*: ID of individual 2.
80
+ - **degree** *(required)*: Degree of (inferred) kinship relation between individual 1 and individual 2. Value must be "1", "2", or "3". Higher-degree relatives are considered unrelated.
81
+ - **constraints** *(optional)*: Semicolon-delimited list of possible configurations of the kinship relation. For example, a parental 1st-degree relation can be constrained with "parent-child;child-parent". Many kinship inference methods will classify 1st-degree relation types, which can be used as relation constraints. Valid constraints: "parent-child", "child-parent", "siblings", "maternal aunt/uncle-nephew/niece", "maternal nephew/niece-aunt/uncle", "paternal aunt/uncle-nephew/niece", "paternal nephew/niece-aunt/uncle", "maternal grandparent-grandchild", "maternal grandchild-grandparent", "paternal grandparent-grandchild", "paternal grandchild-grandparent", "maternal half-siblings", "paternal half-siblings".
82
+ - **force_constraints** *(optional)*: Whether the corresponding constraint should be forced. If provided, value must be "True" or "False". If "True", the constraint must be followed. If "False", breaking the constraint counts as one inconsistency. Defaults to "False".
83
+ </details>
84
+ </dd>
85
+ </dl>
86
+
87
+ **Output** (-o) (*optional*): Path to directory for saving repare outputs. Defaults to the current working directory.
88
+
89
+ **Max Candidate Pedigrees** (-m) (*optional*): Maximum number of candidate pedigrees to keep after each algorithm iteration. Defaults to 1000.
90
+
91
+ **Epsilon** (-e) (*optional*): Parameter for adapted epsilon-greedy sampling at the end of each algorithm iteration. Defaults to 0.2.
92
+
93
+ **Seed** (-s) (*optional*): Random seed for reproducibility. Defaults to 42.
94
+
95
+ **Do Not Plot** (-d) (*flag*): If set, do not plot reconstructed pedigree(s).
96
+
97
+ **Write Alternate Pedigrees** (-w) (*flag*): If set, write outputs for alternate reconstructed pedigrees to disk.
98
+
99
+ **Verbose** (-v) (*flag*): If set, enable verbose output (INFO-level logging).
100
+
101
+ <p align="center">
102
+ <img src="examples/algorithm_diagram.svg" alt="Reconstruction Process Diagram" width="600" />
103
+ <br>
104
+ <em>Diagram of repare's pedigree reconstruction process</em>
105
+ </p>
106
+
107
+ ## Reproducibility
108
+ We recommend using [pixi](https://pixi.sh/) to reproduce the results in this repo.
109
+ ```
110
+ git clone https://github.com/ehuangc/repare.git
111
+ cd repare
112
+ pixi shell
113
+ ```
114
+
115
+ Once in the pixi shell, you can run the script(s) corresponding to the results you'd like to reproduce. For example:
116
+ ```
117
+ python benchmarks/published/run_parameter_experiment.py
118
+ exit
119
+ ```
120
+ To install pixi, see [this page](https://pixi.sh/latest/installation/).
@@ -0,0 +1,10 @@
1
+ repare/__init__.py,sha256=esYDcCYXwJUJVVpWQFFpocPwCZzL5xo_Hxihbknfunc,138
2
+ repare/main.py,sha256=Jmdzzc2_XxBzQ5vU5l0Q0Sh_M714-EWpYCBPOqnng_k,2628
3
+ repare/pedigree.py,sha256=0YXNz2qeML63pgOyX6izaf0zcYIm61N5GU29816P1kg,70547
4
+ repare/pedigree_reconstructor.py,sha256=zbkc9uyd5uqKM9HCZVg6rvvEEo28i65PN-RybrQxILE,49359
5
+ repare-0.1.0.dist-info/licenses/LICENSE,sha256=uqhB_C7lgd3rOQU5SLtWeu_tVc_L0zGGdN488GCrtmY,1063
6
+ repare-0.1.0.dist-info/METADATA,sha256=-tPVXVOv-jdHobgF5kXa4MaibmwgYjQdPgf1MpIxHz4,6523
7
+ repare-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
8
+ repare-0.1.0.dist-info/entry_points.txt,sha256=tWRppCTqmNN8n4hJ_ShCgO8dJFU4PKTQsexMZS-PFHw,44
9
+ repare-0.1.0.dist-info/top_level.txt,sha256=MBgnP6OarsEmlqLXjKcPqKFIMIdpwADg5vt6eMPVA0M,7
10
+ repare-0.1.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (78.1.0)
2
+ Generator: setuptools (80.9.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,35 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: repare
3
- Version: 0.0.2
4
- Summary: Reconstruct ancient pedigrees.
5
- Author-email: Edward Huang <edwardhuang02@gmail.com>
6
- License-Expression: MIT
7
- Requires-Python: >=3.10
8
- Description-Content-Type: text/markdown
9
- License-File: LICENSE
10
- Requires-Dist: matplotlib
11
- Requires-Dist: networkx
12
- Requires-Dist: pandas
13
- Requires-Dist: tqdm
14
- Provides-Extra: benchmark
15
- Requires-Dist: scikit-learn; extra == "benchmark"
16
- Requires-Dist: seaborn; extra == "benchmark"
17
- Provides-Extra: plot
18
- Requires-Dist: pygraphviz; extra == "plot"
19
- Dynamic: license-file
20
-
21
- **repare** is a Python package for (ancient) pedigree reconstruction.
22
-
23
- ## Installation
24
-
25
- ### Recommended
26
- ```
27
- conda create -n "repare" -c conda-forge python=3.13 pygraphviz
28
- conda activate repare
29
- pip install repare
30
- ```
31
- repare uses PyGraphviz to plot reconstructed pedigrees. Since PyGraphviz relies on Graphviz which cannot be installed using `pip`, we recommend installing repare and its dependencies in a fresh conda environment.
32
-
33
- If you don't need to plot reconstructed pedigrees, you can install repare directly with `pip install repare`. If you need to plot reconstructed pedigrees and have your own Graphviz installation, you can install repare and Pygraphviz with `pip install repare[plot]`.
34
-
35
- To install conda, see [this page](https://www.anaconda.com/docs/getting-started/miniconda/install). To install PyGraphviz and Graphviz (yourself), see [this page](https://pygraphviz.github.io/documentation/stable/install.html).
@@ -1,10 +0,0 @@
1
- repare/__init__.py,sha256=esYDcCYXwJUJVVpWQFFpocPwCZzL5xo_Hxihbknfunc,138
2
- repare/main.py,sha256=N33DO2NggJIuJ1-LrZZNiyd--niou7DJqiq30MFT-OY,2387
3
- repare/pedigree.py,sha256=zPWmyzE1SheAnRxz_0ZWFMvm0fCcV1ng0fHED3aqtdU,60347
4
- repare/pedigree_reconstructor.py,sha256=6SGdHyyI8uLsA1xmRg4L9Jb-h79R1JLQFRHUiY8R3DQ,42047
5
- repare-0.0.2.dist-info/licenses/LICENSE,sha256=uqhB_C7lgd3rOQU5SLtWeu_tVc_L0zGGdN488GCrtmY,1063
6
- repare-0.0.2.dist-info/METADATA,sha256=fDGofIvoNkr1tweFsI1NkzqxNZsBRBvYFVHVkX40yM8,1478
7
- repare-0.0.2.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
8
- repare-0.0.2.dist-info/entry_points.txt,sha256=tWRppCTqmNN8n4hJ_ShCgO8dJFU4PKTQsexMZS-PFHw,44
9
- repare-0.0.2.dist-info/top_level.txt,sha256=MBgnP6OarsEmlqLXjKcPqKFIMIdpwADg5vt6eMPVA0M,7
10
- repare-0.0.2.dist-info/RECORD,,