repare 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
repare/pedigree.py ADDED
@@ -0,0 +1,1484 @@
1
+ import importlib
2
+ import math
3
+ from collections import defaultdict, deque
4
+
5
+ import matplotlib.pyplot as plt
6
+ import networkx as nx
7
+ from matplotlib.colors import to_rgba
8
+
9
+
10
+ class Pedigree:
11
+ """
12
+ Describes a pedigree configuration for a set of nodes.
13
+ """
14
+
15
def __init__(self) -> None:
    """
    Create an empty pedigree: no nodes, no relations, no placeholders.
    """
    # Relation maps, all keyed by node ID
    self.node_to_father: dict[str, str] = {}
    self.node_to_mother: dict[str, str] = {}
    self.node_to_children: dict[str, set[str]] = {}
    self.node_to_siblings: dict[str, set[str]] = {}
    # Per-node attribute dictionaries (filled in by add_node)
    self.node_to_data: dict[str, dict[str, str | bool | float]] = {}
    # Running count of placeholder nodes; fill_node_parents uses it as the next placeholder ID
    self.num_placeholders: int = 0
22
+
23
def __deepcopy__(self, memo: dict) -> "Pedigree":
    """
    Custom (faster) deepcopy implementation.

    Builds a new instance without running __init__, registers it in `memo`, and copies
    each relation map one level deep (the per-node dicts and sets are copied, their
    immutable contents are shared).
    """
    clone = self.__class__.__new__(self.__class__)
    memo[id(self)] = clone

    clone.num_placeholders = self.num_placeholders
    clone.node_to_data = {name: attrs.copy() for name, attrs in self.node_to_data.items()}

    clone.node_to_father = dict(self.node_to_father)
    clone.node_to_mother = dict(self.node_to_mother)
    clone.node_to_children = {name: kids.copy() for name, kids in self.node_to_children.items()}
    clone.node_to_siblings = {name: sibs.copy() for name, sibs in self.node_to_siblings.items()}
    return clone
45
+
46
def get_topo_sort(self) -> tuple[str, ...]:
    """
    Gets pedigree topological sort of the Pedigree. See https://doi.org/10.1089/cmb.2011.0254.

    For each childless node (in sorted order) the node's ancestry is emitted ancestors-first
    (father's line, then mother's line, then the node), followed by an empty-string separator.
    Placeholder (numeric) IDs are then replaced by a label built from sex, order of first
    appearance, and haplogroups, so the result does not depend on the raw placeholder IDs.
    """
    ordering: list[str] = []

    def visit(current: str) -> None:
        father = self.get_father(current)
        if father:
            # Father's line is always emitted before the mother's
            visit(father)
            visit(self.get_mother(current))
        else:
            # No father == no mother
            assert not self.get_mother(current)
        ordering.append(current)

    for leaf in sorted(name for name in self.node_to_data if not self.get_children(name)):
        visit(leaf)
        # Break between each node's path
        ordering.append("")

    # Re-label placeholder nodes by order of first appearance
    placeholder_rank: dict[str, int] = {}
    labelled: list[str] = []
    for name in ordering:
        if not name.isnumeric():
            labelled.append(name)
            continue

        rank = placeholder_rank.setdefault(name, len(placeholder_rank))
        attrs = self.get_data(name)
        if attrs["sex"] == "M":
            # Unique identifier for placeholder
            labelled.append(f"M {rank} {attrs['mt_haplogroup']} {attrs['y_haplogroup']}")
        else:
            labelled.append(f"F {rank} {attrs['mt_haplogroup']}")
    return tuple(labelled)
85
+
86
+ def get_data(self, node: str) -> dict[str, str | bool | float]:
87
+ """
88
+ Returns the data for the given node.
89
+ """
90
+ assert node in self.node_to_data
91
+ return self.node_to_data[node]
92
+
93
def get_father(self, node: str) -> str:
    """
    Returns the father of the given node.
    If the node has no father, returns "".
    """
    assert node in self.node_to_data
    fathers = self.node_to_father
    if node in fathers:
        return fathers[node]
    return ""
100
+
101
def get_mother(self, node: str) -> str:
    """
    Returns the mother of the given node.
    If the node has no mother, returns "".
    """
    assert node in self.node_to_data
    mothers = self.node_to_mother
    if node in mothers:
        return mothers[node]
    return ""
108
+
109
def get_children(self, node: str) -> set[str]:
    """
    Returns the children of the given node.
    If the node has no children, returns an empty set.

    When the node has a children entry, the stored set itself is returned (not a copy).
    """
    assert node in self.node_to_data
    try:
        return self.node_to_children[node]
    except KeyError:
        return set()
116
+
117
def get_siblings(self, node: str) -> set[str]:
    """
    Returns the siblings of the given node.
    If the node has no siblings, returns an empty set.

    When the node has a siblings entry, the stored set itself is returned (not a copy).
    """
    assert node in self.node_to_data
    try:
        return self.node_to_siblings[node]
    except KeyError:
        return set()
124
+
125
def add_node(
    self,
    node_id: str,
    sex: str,
    y_haplogroup: str,
    mt_haplogroup: str,
    can_have_children: bool,
    can_be_inbred: bool,
    years_before_present: float,
) -> None:
    """
    Add a node to the pedigree. If haplogroup is unknown, set argument to empty string ("").

    An existing node with the same ID has its data replaced.

    Raises:
        ValueError: if a Y haplogroup is given for a female node. The check runs before
            anything is stored, so a rejected call leaves the pedigree untouched.
    """
    # Validate first so an invalid node never ends up half-registered in node_to_data
    if y_haplogroup and sex == "F":
        raise ValueError("Only males can have y_haplogroup values.")

    self.node_to_data[node_id] = {
        "sex": sex,
        "y_haplogroup": y_haplogroup,
        "mt_haplogroup": mt_haplogroup,
        "can_have_children": can_have_children,
        "can_be_inbred": can_be_inbred,
        "years_before_present": years_before_present,
    }
147
+
148
def add_parent_relation(self, node1: str, node2: str) -> None:
    """
    Adds a parent-child relationship to the tree.
    Node 1 is the parent and Node 2 is the child.
    Note: Overwrites existing parent, does not merge.

    Whether Node 1 becomes the father or the mother is decided by its "sex" value ("M" means
    father, anything else means mother). If Node 2 ends up with two known parents, it is linked as a
    sibling to every other child of Node 1 that has the same two parents.
    """
    assert node1 != node2
    assert node1 in self.node_to_data and node2 in self.node_to_data

    parent_is_male = self.get_data(node1)["sex"] == "M"
    if parent_is_male:
        parent_map = self.node_to_father
        previous_parent = self.get_father(node2)
    else:
        parent_map = self.node_to_mother
        previous_parent = self.get_mother(node2)

    if previous_parent:
        # Remove child from original parent
        self.node_to_children[previous_parent].remove(node2)
        # The child no longer shares both parents with its former siblings
        for former_sibling in self.get_siblings(node2):
            self.node_to_siblings[former_sibling].remove(node2)
        self.node_to_siblings.pop(node2, None)
    parent_map[node2] = node1

    self.node_to_children.setdefault(node1, set()).add(node2)

    # Add any sibling relations that are created (only possible once Node 2 has 2 known parents)
    child_father = self.get_father(node2)
    child_mother = self.get_mother(node2)
    if not (child_father and child_mother):
        return
    for other_child in self.get_children(node1):
        if other_child == node2:
            continue
        if self.get_father(other_child) == child_father and self.get_mother(other_child) == child_mother:
            self.add_sibling_relation(other_child, node2)
189
+
190
def add_sibling_relation(self, node1: str, node2: str) -> None:
    """
    Adds a sibling relationship to the tree.
    Note: Does not merge parents.

    The relation is kept transitive: Node 1, Node 2 and all of their current siblings end up
    as mutual siblings. This includes linking Node 1's existing siblings with Node 2's
    existing siblings, which matters when both nodes already belong to a sibling group.
    """
    assert node1 != node2
    assert node1 in self.node_to_data and node2 in self.node_to_data

    siblings = self.node_to_siblings
    # Everyone who must be mutual siblings once the relation is added
    group = {node1, node2} | siblings.get(node1, set()) | siblings.get(node2, set())
    for member in group:
        siblings.setdefault(member, set()).update(group - {member})
214
+
215
def merge_nodes(self, node1: str, node2: str) -> bool:
    """
    Merge the two nodes as if they were one person. Note this involves merging the nodes' ancestors.
    Returns True if the merge was successful, False if it was invalid.

    The merge is processed as a queue of node pairs: merging a pair may enqueue the pair of their
    fathers and the pair of their mothers. Within a pair, the named (non-numeric) node is kept and
    the placeholder is discarded; if both are placeholders, Node 2 of the pair is kept.

    Note: the pedigree is modified in place as pairs are processed, so when False is returned
    after earlier pairs were already merged, those earlier changes are not undone here.
    """
    assert node1 in self.node_to_data and node2 in self.node_to_data

    # Pairs of nodes to merge
    pair_queue: deque[tuple[str, str]] = deque([(node1, node2)])
    while pair_queue:
        node1, node2 = pair_queue.popleft()
        # Nothing to do if an earlier merge already made both names refer to the same node
        if node1 == node2:
            continue

        # Cannot merge two named nodes
        if not node1.isnumeric() and not node2.isnumeric():
            return False

        # Cannot merge nodes if it will create a node that is the parent and sibling of another
        # For example, don't merge a parent and child if the child has siblings,
        # and don't merge a node's sibling and child
        # (missing parents show up as "" here, which never matches a sibling name)
        combined_parents = set(
            [
                self.get_father(node1),
                self.get_mother(node1),
                self.get_father(node2),
                self.get_mother(node2),
            ]
        )
        combined_children = self.get_children(node1) | self.get_children(node2)
        combined_siblings = self.get_siblings(node1) | self.get_siblings(node2)
        if (combined_parents & combined_siblings) or (combined_children & combined_siblings):
            return False

        # Keep the named node when there is one; otherwise keep node2
        name_to_keep = node1 if not node1.isnumeric() else node2
        name_to_discard = node2 if name_to_keep == node1 else node1

        # Update relations for relatives of the discarded node
        name_to_discard_father = self.get_father(name_to_discard)
        name_to_discard_mother = self.get_mother(name_to_discard)
        if name_to_discard_father:
            assert self.get_children(name_to_discard_father)
            self.node_to_children[name_to_discard_father].remove(name_to_discard)
        if name_to_discard_mother:
            assert self.get_children(name_to_discard_mother)
            self.node_to_children[name_to_discard_mother].remove(name_to_discard)

        # Collect the children to re-parent first: add_parent_relation below removes each child
        # from the discarded node's children set, which must not happen while iterating that set
        name_to_discard_children = set()
        for child in self.get_children(name_to_discard):
            # Merging a parent and child - we will see this when there is inbreeding
            # Note: can't merge a parent and a child if the child has siblings,
            # because then the child becomes the parent of its siblings (this is
            # handled by check_invalid_parent_child_merge)
            if name_to_keep == child:
                # The kept node would otherwise become its own parent, so drop that link
                if self.get_data(name_to_keep)["sex"] == "M":
                    del self.node_to_father[name_to_keep]
                else:
                    del self.node_to_mother[name_to_keep]
            else:
                name_to_discard_children.add(child)

        for child in name_to_discard_children:
            # This step also handles having the correct name of the merged parents from last loop iteration
            self.add_parent_relation(name_to_keep, child)

        # Remove all occurrences of name_to_discard in its sibling's sibling sets first
        # so that add_sibling_relation does not add it back in.
        for sibling in self.get_siblings(name_to_discard):
            self.node_to_siblings[sibling].remove(name_to_discard)
        for sibling in self.get_siblings(name_to_discard):
            if sibling != name_to_keep:
                self.add_sibling_relation(sibling, name_to_keep)

        # Recursively merge parent relations of Node 1 and Node 2
        father1 = self.get_father(name_to_keep)
        father2 = self.get_father(name_to_discard)
        mother1 = self.get_mother(name_to_keep)
        mother2 = self.get_mother(name_to_discard)
        if father1 and father2:
            pair_queue.append((father1, father2))
        elif father2 and father2 != name_to_keep:
            # Set name_to_keep's father to name_to_discard's father
            self.add_parent_relation(father2, name_to_keep)

        if mother1 and mother2:
            pair_queue.append((mother1, mother2))
        elif mother2 and mother2 != name_to_keep:
            # Set name_to_keep's mother to name_to_discard's mother
            self.add_parent_relation(mother2, name_to_keep)

        # Update any nodes in the queue whose names might have been changed
        # (note: this rebinds node1/node2; they are reassigned by popleft at the top of the loop)
        for idx, (node1, node2) in enumerate(pair_queue):
            if node1 == name_to_discard and node2 == name_to_discard:
                pair_queue[idx] = (name_to_keep, name_to_keep)
            elif node1 == name_to_discard:
                pair_queue[idx] = (name_to_keep, node2)
            elif node2 == name_to_discard:
                pair_queue[idx] = (node1, name_to_keep)

        # Finally erase every record keyed by the discarded node
        for data_dict in [
            self.node_to_data,
            self.node_to_father,
            self.node_to_mother,
            self.node_to_children,
            self.node_to_siblings,
        ]:
            if name_to_discard in data_dict:
                del data_dict[name_to_discard]
    return True
324
+
325
def check_valid_merge(self, node1: str, node2: str) -> bool:
    """
    Returns True if merging Node 1 and Node 2 (and their ancestors) is a valid operation.

    Collects the groups of nodes that would collapse into one node (the pair itself, then the
    groups of their fathers and of their mothers, and so on) and reports whether collapsing
    them would create a cycle, as decided by check_cycle_merge.
    """
    assert node1 in self.node_to_data and node2 in self.node_to_data
    # Groups are sets rather than pairs because of potential inbreeding,
    # for example if one node is both a parent and grandparent of another node
    merge_sets: list[set[str]] = []
    already_grouped: set[str] = set()
    pending: deque[set[str]] = deque([{node1, node2}])

    while pending:
        group = pending.popleft()
        # A single node has nothing to merge with
        if len(group) == 1:
            continue

        # Fold the group into the first existing merge set it matches or overlaps
        for existing in merge_sets:
            if existing == group:
                break
            if not existing.isdisjoint(group):
                existing.update(group)
                # Continue with every node of the enlarged merge set
                group = existing
                break
        else:
            merge_sets.append(set(group))
        already_grouped.update(group)

        # Parents of merged nodes must be merged too
        fathers = {self.get_father(member) for member in group if self.get_father(member)}
        mothers = {self.get_mother(member) for member in group if self.get_mother(member)}
        for parents in (fathers, mothers):
            if len(parents) > 1 and not parents.issubset(already_grouped):
                pending.append(parents)

    return not self.check_cycle_merge(merge_sets)
371
+
372
def check_cycle_merge(self, merge_sets: list[set[str]]) -> bool:
    """
    Returns True if merging Node 1 and Node 2 (and their ancestors) would result in a cycle.
    merge_sets is a list of node sets that would be merged if Node 1 and Node 2 were merged.

    The pedigree itself is not modified: the check walks parent-to-child edges and treats all
    members of a merge set as if they were a single node.
    """
    finished: set[str] = set()
    on_path: set[str] = set()

    def reaches_cycle(node) -> bool:
        # All nodes that act as one node together with `node` (just itself if it is in no merge set)
        group: set[str] = {node}
        for merge_set in merge_sets:
            if node in merge_set:
                group = merge_set
                break

        if node in on_path:
            return True
        if node in finished:
            return False

        on_path.update(group)
        for member in group:
            for child in self.get_children(member):
                if reaches_cycle(child):
                    return True
        on_path.difference_update(group)
        finished.update(group)
        return False

    # Check for cycles starting from each node
    return any(reaches_cycle(start) for start in self.node_to_data)
418
+
419
def fill_node_parents(self, node: str) -> None:
    """
    If the given node doesn't have parents, add placeholder parents.
    If it does, do nothing.

    Each missing parent is created as a new placeholder whose ID is the current value of
    num_placeholders, with haplogroups set to "*" (the mother's Y haplogroup is ""), and
    is attached to the node and to every current sibling of the node.
    """
    assert node in self.node_to_data

    def attach_placeholder(sex: str, y_haplogroup: str) -> None:
        # Create one placeholder parent and attach it to the node and its siblings
        placeholder_id = str(self.num_placeholders)
        self.add_node(
            node_id=placeholder_id,
            sex=sex,
            y_haplogroup=y_haplogroup,
            mt_haplogroup="*",
            can_have_children=True,
            can_be_inbred=True,
            years_before_present=math.nan,
        )

        self.add_parent_relation(placeholder_id, node)
        # Iterate over a snapshot: add_parent_relation can re-link siblings and thereby
        # change the node's live sibling set, which must not happen mid-iteration
        for sibling in tuple(self.get_siblings(node)):
            self.add_parent_relation(placeholder_id, sibling)
        self.num_placeholders += 1

    # Decide both before adding anything, based on the node's state on entry
    needs_father = not self.get_father(node)
    needs_mother = not self.get_mother(node)

    if needs_father:
        attach_placeholder("M", "*")
    if needs_mother:
        attach_placeholder("F", "")
462
+
463
def update_haplogroups(self) -> None:
    """
    Update haplogroups of placeholder nodes.

    For every node, its Y haplogroup is propagated along the paternal line (father and male
    children) and its mitochondrial haplogroup along the maternal line (mother, and children of
    female nodes). A relative's haplogroup is only rewritten when it contains a "*" and its
    "*"-stripped value is a prefix of the propagated haplogroup; the rewritten value always
    ends in "*". Propagation continues outward only through relatives that were rewritten.
    """
    for node in self.node_to_data:
        y_haplogroup: str = self.get_data(node)["y_haplogroup"]
        # Start from the node's immediate Y-line relatives; a missing father is "" and is skipped below
        y_lineage: deque[str] = deque(
            [self.get_father(node)]
            + [child for child in self.get_children(node) if self.get_data(child)["sex"] == "M"]
        )

        while y_lineage:
            curr_node = y_lineage.popleft()
            # Skip missing relatives, relatives whose haplogroup has no "*",
            # and relatives that already match the propagated haplogroup (this also stops revisits)
            if (
                not curr_node
                or "*" not in self.get_data(curr_node)["y_haplogroup"]
                or self.get_data(curr_node)["y_haplogroup"].rstrip("*") == y_haplogroup.rstrip("*")
            ):
                continue
            # Overwrite/extend Y haplogroup if it contains a "*" and is a strict subset of the "leaf" haplogroup
            if y_haplogroup.startswith(self.get_data(curr_node)["y_haplogroup"].rstrip("*")):
                self.node_to_data[curr_node]["y_haplogroup"] = (
                    y_haplogroup if y_haplogroup[-1] == "*" else y_haplogroup + "*"
                )
                y_lineage.append(self.get_father(curr_node))
                for curr_node_child in self.get_children(curr_node):
                    # Only males have Y chromosome
                    if self.get_data(curr_node_child)["sex"] == "M":
                        y_lineage.append(curr_node_child)

        mt_haplogroup: str = self.get_data(node)["mt_haplogroup"]
        mt_lineage: deque[str] = deque([self.get_mother(node)])
        # Only females pass on mitochondrial DNA to children
        if self.get_data(node)["sex"] == "F":
            mt_lineage.extend(self.get_children(node))

        while mt_lineage:
            curr_node = mt_lineage.popleft()
            # Same skip rules as for the Y line, applied to the mitochondrial haplogroup
            if (
                not curr_node
                or "*" not in self.get_data(curr_node)["mt_haplogroup"]
                or self.get_data(curr_node)["mt_haplogroup"].rstrip("*") == mt_haplogroup.rstrip("*")
            ):
                continue
            # Overwrite/extend mitochondrial haplogroup if it contains a "*"
            # and is a strict subset of the "leaf" haplogroup
            if mt_haplogroup.startswith(self.get_data(curr_node)["mt_haplogroup"].rstrip("*")):
                self.node_to_data[curr_node]["mt_haplogroup"] = (
                    mt_haplogroup if mt_haplogroup[-1] == "*" else mt_haplogroup + "*"
                )
                mt_lineage.append(self.get_mother(curr_node))
                if self.get_data(curr_node)["sex"] == "F":
                    mt_lineage.extend(self.get_children(curr_node))
516
+
517
def validate_structure(self) -> bool:
    """
    Validates pedigree structure and consistency of internal data.

    Checks that:
      * every father/mother link has a matching entry in the parent's children set,
      * every children entry has a matching father/mother link (by the parent's sex),
      * sibling links are mutual and siblings share the same father and the same mother
        (two siblings with no recorded father/mother are considered to agree),
      * no node is its own parent, child, or sibling.

    Returns False for any inconsistency, including a missing entry on the other side of a
    relation; it does not raise KeyError for missing entries.
    """
    fathers = self.node_to_father
    mothers = self.node_to_mother
    children_of = self.node_to_children
    siblings_of = self.node_to_siblings

    # Parent links must be mirrored in the parent's children set
    for parent_map in (fathers, mothers):
        for child, parent in parent_map.items():
            if child == parent or child not in children_of.get(parent, ()):
                return False

    # Children sets must be mirrored by the child's father/mother link
    for parent, children in children_of.items():
        for child in children:
            if parent == child:
                return False
            expected_map = fathers if self.get_data(parent)["sex"] == "M" else mothers
            if expected_map.get(child) != parent:
                return False

    # Sibling links must be mutual and siblings must share both parents
    for node, siblings in siblings_of.items():
        for sibling in siblings:
            if node == sibling or node not in siblings_of.get(sibling, ()):
                return False
            if fathers.get(node, "") != fathers.get(sibling, "") or mothers.get(node, "") != mothers.get(
                sibling, ""
            ):
                return False
    return True
556
+
557
def validate_members(self, members: set[str]) -> bool:
    """
    Validates this tree based on the member nodes it should contain.

    Returns the result of comparing the pedigree's non-placeholder nodes with `members`;
    a mismatch means nodes were lost (because of invalid merging) or unexpected ones exist.
    """
    return self.get_non_placeholder_nodes() == members
564
+
565
def validate_haplogroups(self) -> bool:
    """
    Validates that all haplogroups are consistent.

    For every parent-child pair, a mother's mitochondrial haplogroup must agree with her
    child's, and a father's Y haplogroup must agree with his son's. A haplogroup containing
    "*" agrees with another when its "*"-stripped value is a prefix of the other; two
    haplogroups without "*" must be equal.
    """

    def compatible(first: str, second: str) -> bool:
        first_open = "*" in first
        second_open = "*" in second
        if first_open and second_open:
            # Either one may extend the other
            return first.startswith(second.rstrip("*")) or second.startswith(first.rstrip("*"))
        if first_open:
            return second.startswith(first.rstrip("*"))
        if second_open:
            return first.startswith(second.rstrip("*"))
        return first == second

    for parent, child in self.get_parent_child_pairs():
        parent_sex = self.get_data(parent)["sex"]
        if parent_sex == "F":
            key = "mt_haplogroup"
        elif parent_sex == "M" and self.get_data(child)["sex"] == "M":
            key = "y_haplogroup"
        else:
            # Father-daughter pairs share neither marker
            continue
        if not compatible(self.get_data(parent)[key], self.get_data(child)[key]):
            return False
    return True
590
+
591
def validate_can_have_children(self) -> bool:
    """
    Validates that nodes that cannot have children do not have children.
    Only non-placeholder nodes are checked.
    """
    return not any(
        self.get_children(person) and not self.get_data(person)["can_have_children"]
        for person in self.get_non_placeholder_nodes()
    )
599
+
600
def validate_inbreeding(self) -> bool:
    """
    Validates that nodes that are known to be not inbred are not inbred.

    A non-placeholder node flagged as not inbred fails validation when its father and mother
    appear (in either order) among the pairs returned by get_related_pairs.
    """
    related = self.get_related_pairs()
    for person in self.get_non_placeholder_nodes():
        if self.get_data(person)["can_be_inbred"]:
            continue
        parents = (self.get_father(person), self.get_mother(person))
        if parents in related or parents[::-1] in related:
            return False
    return True
612
+
613
def validate_years_before_present(self) -> bool:
    """
    Validates that nodes do not postdate their descendants.

    Starting from every childless node, walks up through fathers and mothers and requires each
    dated ancestor to be at least as old (years before present) as the most recent dated node
    below it on that path. Undated nodes (NaN) are passed through without constraint.
    """

    def ancestors_predate(current: str, youngest_allowed: float) -> bool:
        age = self.get_data(current)["years_before_present"]
        if not math.isnan(age):
            # Node postdates its descendants
            if age < youngest_allowed:
                return False
            youngest_allowed = age

        father = self.get_father(current)
        if not father:
            return True
        assert self.get_mother(current)
        return ancestors_predate(father, youngest_allowed) and ancestors_predate(
            self.get_mother(current), youngest_allowed
        )

    leaves = [name for name in self.node_to_data if not self.get_children(name)]
    return all(ancestors_predate(leaf, float("-inf")) for leaf in leaves)
640
+
641
def validate_forced_constraints(
    self, pair_to_relations_so_far: defaultdict[tuple[str, str], list[tuple[str, str, bool]]]
) -> bool:
    """
    Validates that forced constraints so far are present in the pedigree.
    Note: Additional relations between two nodes are allowed as long as the forced constraints are present.

    Each entry is (degree, ";"-separated constraints, force flag); only entries whose force
    flag is set are checked, via is_relation_in_pedigree.
    """
    for (first, second), relations in pair_to_relations_so_far.items():
        for _degree, constraints, forced in relations:
            if not forced:
                continue
            if not self.is_relation_in_pedigree(first, second, constraints.split(";")):
                return False
    return True
653
+
654
def count_inconsistencies(
    self,
    pair_to_constraints: defaultdict[tuple[str, str], list[tuple[str, ...]]],
    pair_to_relations_so_far: defaultdict[tuple[str, str], list[tuple[str, str, bool]]],
    check_half_siblings: bool,
) -> tuple[int, list[tuple[str, str, str, str]]]:
    """
    Validates this tree based on the input relation data.
    If check_half_siblings is False, don't check for extraneous half-sibling relations
    because the 2 non-shared parents might be merged later.
    Returns count of inconsistencies with the input data as well as a log of the inconsistencies.
    Note: pair_to_constraints values must be sorted by increasing length
    so that specific constraints are checked first.

    Log entries are (node1, node2, "+<degree>" or "-<degree>", constraints): "+" marks a relation
    that is in the pedigree but not in the input data, "-" marks an input relation that is
    missing from the pedigree.

    Each entry of pair_to_constraints can account for at most one pedigree relation: once an
    entry has been matched it is marked as used, and a later occurrence of the same relation
    between the same pair consumes the next unused entry (or is logged as extraneous).
    """
    for node1, node2 in pair_to_constraints:
        # Ensure no duplicate/symmetric entries
        assert (node2, node1) not in pair_to_constraints
    # Marks which entries in pair_to_constraints have been seen/used
    pair_to_constraints_seen_entries: defaultdict[tuple[str, str], set[int]] = defaultdict(set)

    # Lookup tables are built once here rather than on every validate_relation call
    relation_to_degree = {
        "parent-child": "1",
        "child-parent": "1",
        "siblings": "1",
        "maternal aunt/uncle-nephew/niece": "2",
        "maternal nephew/niece-aunt/uncle": "2",
        "paternal aunt/uncle-nephew/niece": "2",
        "paternal nephew/niece-aunt/uncle": "2",
        "maternal grandparent-grandchild": "2",
        "maternal grandchild-grandparent": "2",
        "paternal grandparent-grandchild": "2",
        "paternal grandchild-grandparent": "2",
        "maternal half-siblings": "2",
        "paternal half-siblings": "2",
        "double cousins": "2",
    }
    flipped_relations = {
        "parent-child": "child-parent",
        "child-parent": "parent-child",
        "maternal aunt/uncle-nephew/niece": "maternal nephew/niece-aunt/uncle",
        "paternal aunt/uncle-nephew/niece": "paternal nephew/niece-aunt/uncle",
        "maternal nephew/niece-aunt/uncle": "maternal aunt/uncle-nephew/niece",
        "paternal nephew/niece-aunt/uncle": "paternal aunt/uncle-nephew/niece",
        "maternal grandparent-grandchild": "maternal grandchild-grandparent",
        "paternal grandparent-grandchild": "paternal grandchild-grandparent",
        "maternal grandchild-grandparent": "maternal grandparent-grandchild",
        "paternal grandchild-grandparent": "paternal grandparent-grandchild",
        "siblings": "siblings",  # Symmetric
        "maternal half-siblings": "maternal half-siblings",  # Symmetric
        "paternal half-siblings": "paternal half-siblings",  # Symmetric
        "double cousins": "double cousins",  # Symmetric
    }

    def is_relation_in_input_data(node1: str, node2: str, relation: str) -> bool:
        # True if some not-yet-used input entry for this ordered pair allows the relation
        if (node1, node2) in pair_to_constraints:
            for idx, constraints in enumerate(pair_to_constraints[(node1, node2)]):
                if idx not in pair_to_constraints_seen_entries[(node1, node2)] and relation in constraints:
                    return True
        return False

    def remove_relation_from_input_data(node1: str, node2: str, relation: str) -> None:
        # Mark the first *unused* entry allowing the relation as used. Skipping already-used
        # entries matters when several entries allow the same relation: otherwise the first
        # one would be re-marked forever and the others could be matched any number of times.
        if (node1, node2) in pair_to_constraints:
            for idx, constraints in enumerate(pair_to_constraints[(node1, node2)]):
                if idx not in pair_to_constraints_seen_entries[(node1, node2)] and relation in constraints:
                    pair_to_constraints_seen_entries[(node1, node2)].add(idx)
                    break

    def validate_relation(
        node1: str, node2: str, relation: str, strike_log: list[tuple[str, str, str, str]]
    ) -> None:
        # Log a "+" strike if the pedigree relation is not backed by the input data (in either
        # orientation), then consume the matching input entry if there is one
        if not is_relation_in_input_data(node1, node2, relation) and not is_relation_in_input_data(
            node2, node1, flipped_relations[relation]
        ):
            if node1 < node2:
                strike_log.append((node1, node2, f"+{relation_to_degree[relation]}", ""))
            else:
                strike_log.append((node2, node1, f"+{relation_to_degree[relation]}", ""))
        remove_relation_from_input_data(node1, node2, relation)
        remove_relation_from_input_data(node2, node1, flipped_relations[relation])

    strike_log: list[tuple[str, str, str, str]] = []  # (node1, node2, +/- relation degree, constraints)
    # Check that relations in the pedigree are present in the input data
    for parent, child in self.get_parent_child_pairs(include_placeholders=False):
        validate_relation(parent, child, "parent-child", strike_log)
    for sibling1, sibling2 in self.get_sibling_pairs(include_placeholders=False):
        validate_relation(sibling1, sibling2, "siblings", strike_log)

    for aunt_uncle, nephew_niece in self.get_aunt_uncle_nephew_niece_pairs(
        include_placeholders=False, shared_relative_sex="F"
    ):
        validate_relation(aunt_uncle, nephew_niece, "maternal aunt/uncle-nephew/niece", strike_log)
    for aunt_uncle, nephew_niece in self.get_aunt_uncle_nephew_niece_pairs(
        include_placeholders=False, shared_relative_sex="M"
    ):
        validate_relation(aunt_uncle, nephew_niece, "paternal aunt/uncle-nephew/niece", strike_log)

    for grandparent, grandchild in self.get_grandparent_grandchild_pairs(
        include_placeholders=False, shared_relative_sex="F"
    ):
        validate_relation(grandparent, grandchild, "maternal grandparent-grandchild", strike_log)
    for grandparent, grandchild in self.get_grandparent_grandchild_pairs(
        include_placeholders=False, shared_relative_sex="M"
    ):
        validate_relation(grandparent, grandchild, "paternal grandparent-grandchild", strike_log)
    for double_cousin1, double_cousin2 in self.get_double_cousin_pairs(include_placeholders=False):
        validate_relation(double_cousin1, double_cousin2, "double cousins", strike_log)

    if check_half_siblings:
        for half_sibling1, half_sibling2 in self.get_half_sibling_pairs(
            include_placeholders=False, shared_relative_sex="F"
        ):
            validate_relation(half_sibling1, half_sibling2, "maternal half-siblings", strike_log)
        for half_sibling1, half_sibling2 in self.get_half_sibling_pairs(
            include_placeholders=False, shared_relative_sex="M"
        ):
            validate_relation(half_sibling1, half_sibling2, "paternal half-siblings", strike_log)

    # Check for "dropped" input relations
    # Note: We use constrained relations instead of all relations because we want to catch half-siblings
    # that explicitly should be some other relation even when check_half_siblings is False
    # The purpose of check_half_siblings is to avoid marking *incidental* half-siblings,
    # not half-siblings that should be something else
    for (node1, node2), degrees_constraints in pair_to_relations_so_far.items():
        # If only one input relation between these two nodes, simple check is much faster
        if len(degrees_constraints) == 1:
            degree, constraints, _ = degrees_constraints[0]
            if not self.is_relation_in_pedigree(node1, node2, constraints.split(";")):
                strike_log.append((node1, node2, f"-{degree}", constraints))
        else:
            pedigree_shared_relations: defaultdict[str, int] = self.get_relations_between_nodes(
                node1, node2, include_maternal_paternal=True
            )
            for degree, constraints, _ in degrees_constraints:
                present_flag = False
                for constraint in constraints.split(";"):
                    if constraint in pedigree_shared_relations:
                        present_flag = True
                        # Each pedigree relation can satisfy only one input relation
                        pedigree_shared_relations[constraint] -= 1
                        if pedigree_shared_relations[constraint] == 0:
                            del pedigree_shared_relations[constraint]
                        break
                if not present_flag:
                    strike_log.append((node1, node2, f"-{degree}", constraints))

    # Count # of strikes (will not equal len(strike_log) because we don't want to double-count *changed* relations)
    # A "+" that offsets an earlier "-" for the same pair (or vice versa) is not counted again
    strike_count: int = 0
    node_pair_strike_balances: defaultdict[tuple[str, str], int] = defaultdict(int)
    for node1, node2, strike, _ in strike_log:
        if strike[0] == "+":
            if node_pair_strike_balances[(node1, node2)] >= 0:
                strike_count += 1
            node_pair_strike_balances[(node1, node2)] += 1
            node_pair_strike_balances[(node2, node1)] += 1

        elif strike[0] == "-":
            if node_pair_strike_balances[(node1, node2)] <= 0:
                strike_count += 1
            node_pair_strike_balances[(node1, node2)] -= 1
            node_pair_strike_balances[(node2, node1)] -= 1
    return strike_count, strike_log
813
+
814
def count_third_degree_inconsistencies(
    self, pair_to_constraints: defaultdict[tuple[str, str], list[tuple[str, ...]]]
) -> int:
    """
    Counts only one-sided inconsistencies in third-degree relations.
    Used as a "tie-breaker" for 1st- and 2nd-degree inconsistencies.

    A strike is counted for every third-degree relation present in the pedigree
    (between non-placeholder nodes) that cannot be matched to a not-yet-used
    entry of pair_to_constraints. Each input entry can explain at most one
    pedigree relation.

    pair_to_constraints maps a node pair to a list of entries, where each entry
    is a tuple of the relation names allowed by that entry. A pair must not be
    present in both orientations (checked with an assert).
    """
    for node1, node2 in pair_to_constraints:
        # Ensure no duplicate/symmetric entries
        assert (node2, node1) not in pair_to_constraints
    # Marks which entries in pair_to_constraints have been seen/used
    pair_to_constraints_seen_entries: defaultdict[tuple[str, str], set[int]] = defaultdict(set)

    # Relation name as seen from the opposite orientation of the pair
    flipped_relations = {
        "half aunt/uncle-half nephew/niece": "half nephew/niece-half aunt/uncle",
        "half nephew/niece-half aunt/uncle": "half aunt/uncle-half nephew/niece",
        "greatgrandparent-greatgrandchild": "greatgrandchild-greatgrandparent",
        "greatgrandchild-greatgrandparent": "greatgrandparent-greatgrandchild",
        "grandaunt/granduncle-grandnephew/grandniece": "grandnephew/grandniece-grandaunt/granduncle",
        "grandnephew/grandniece-grandaunt/granduncle": "grandaunt/granduncle-grandnephew/grandniece",
        "first cousins": "first cousins",  # Symmetric
    }

    def consume_input_relation(node1: str, node2: str, relation: str) -> bool:
        """
        Marks the first *unused* input entry for (node1, node2) that allows relation as used.
        Returns True if such an entry existed.
        """
        # Membership test first so the defaultdict is not grown by lookups
        if (node1, node2) not in pair_to_constraints:
            return False
        used_indices = pair_to_constraints_seen_entries[(node1, node2)]
        for idx, constraints in enumerate(pair_to_constraints[(node1, node2)]):
            # Skipping already-used entries is essential: otherwise the same entry would be
            # marked repeatedly and later entries could explain unlimited pedigree relations
            if idx not in used_indices and relation in constraints:
                used_indices.add(idx)
                return True
        return False

    def validate_relation(node1: str, node2: str, relation: str) -> bool:
        """
        Returns True (a strike) if the pedigree relation is not explained by the input data.
        """
        # Only one orientation of the pair can be a key (asserted above), so at most one call consumes
        explained_forward = consume_input_relation(node1, node2, relation)
        explained_backward = consume_input_relation(node2, node1, flipped_relations[relation])
        return not (explained_forward or explained_backward)

    # Double cousins are also twice-first cousins, so don't count the first cousin relations separately
    accounted_cousin_pairs: defaultdict[tuple[str, str], int] = defaultdict(int)
    for double_cousin1, double_cousin2 in self.get_double_cousin_pairs(include_placeholders=False):
        accounted_cousin_pairs[(double_cousin1, double_cousin2)] += 2
        accounted_cousin_pairs[(double_cousin2, double_cousin1)] += 2

    strike_count: int = 0
    for half_aunt_uncle, half_nephew_niece in self.get_half_aunt_uncle_nephew_niece_pairs(
        include_placeholders=False
    ):
        strike_count += validate_relation(half_aunt_uncle, half_nephew_niece, "half aunt/uncle-half nephew/niece")
    for greatgrandparent, greatgrandchild in self.get_greatgrandparent_greatgrandchild_pairs(
        include_placeholders=False
    ):
        strike_count += validate_relation(greatgrandparent, greatgrandchild, "greatgrandparent-greatgrandchild")
    for grandaunt_granduncle, grandnephew_grandniece in self.get_grandaunt_granduncle_grandnephew_grandniece_pairs(
        include_placeholders=False
    ):
        strike_count += validate_relation(
            grandaunt_granduncle, grandnephew_grandniece, "grandaunt/granduncle-grandnephew/grandniece"
        )
    for first_cousin1, first_cousin2 in self.get_first_cousin_pairs(include_placeholders=False):
        # Only count cousin relation if not already accounted for by a double cousin relation
        if accounted_cousin_pairs[(first_cousin1, first_cousin2)] > 0:
            accounted_cousin_pairs[(first_cousin1, first_cousin2)] -= 1
            accounted_cousin_pairs[(first_cousin2, first_cousin1)] -= 1
        else:
            strike_count += validate_relation(first_cousin1, first_cousin2, "first cousins")
    return strike_count
889
+
890
+ def is_relation_in_pedigree(self, node1: str, node2: str, relations_list: list[str]) -> bool:
891
+ """
892
+ Returns True if *any* of the relations in relations_list are present between node1 and node2 in the pedigree.
893
+ """
894
+ assert node1 in self.node_to_data and node2 in self.node_to_data
895
+
896
+ for relation in relations_list:
897
+ if relation == "parent-child":
898
+ if node2 in self.get_children(node1):
899
+ return True
900
+ if relation == "child-parent":
901
+ if node1 in self.get_children(node2):
902
+ return True
903
+ if relation == "siblings":
904
+ if node2 in self.get_siblings(node1):
905
+ assert node1 in self.get_siblings(node2)
906
+ return True
907
+
908
+ if relation == "aunt/uncle-nephew/niece":
909
+ for sibling in self.get_siblings(node1):
910
+ if node2 in self.get_children(sibling):
911
+ return True
912
+ if relation == "nephew/niece-aunt/uncle":
913
+ for sibling in self.get_siblings(node2):
914
+ if node1 in self.get_children(sibling):
915
+ return True
916
+ if relation == "grandparent-grandchild":
917
+ for child in self.get_children(node1):
918
+ if node2 in self.get_children(child):
919
+ return True
920
+ if relation == "grandchild-grandparent":
921
+ for child in self.get_children(node2):
922
+ if node1 in self.get_children(child):
923
+ return True
924
+ if relation == "half-siblings":
925
+ if self.get_father(node2):
926
+ if node1 in self.get_children(self.get_father(node2)) and self.get_mother(node1) != self.get_mother(
927
+ node2
928
+ ):
929
+ return True
930
+ if self.get_mother(node2):
931
+ if node1 in self.get_children(self.get_mother(node2)) and self.get_father(node1) != self.get_father(
932
+ node2
933
+ ):
934
+ return True
935
+ if relation == "double cousins":
936
+ father1 = self.get_father(node1)
937
+ mother1 = self.get_mother(node1)
938
+ father2 = self.get_father(node2)
939
+ mother2 = self.get_mother(node2)
940
+ if not father1 or not mother1 or not father2 or not mother2:
941
+ continue
942
+ fathers_are_siblings = father2 in self.get_siblings(father1)
943
+ mothers_are_siblings = mother1 in self.get_siblings(mother2)
944
+ cross_parents_are_siblings = father2 in self.get_siblings(mother1) and father1 in self.get_siblings(
945
+ mother2
946
+ )
947
+ if (fathers_are_siblings and mothers_are_siblings) or cross_parents_are_siblings:
948
+ return True
949
+
950
+ if relation == "maternal aunt/uncle-nephew/niece":
951
+ for sibling in self.get_siblings(node1):
952
+ if self.get_data(sibling)["sex"] == "F" and node2 in self.get_children(sibling):
953
+ return True
954
+ if relation == "paternal aunt/uncle-nephew/niece":
955
+ for sibling in self.get_siblings(node1):
956
+ if self.get_data(sibling)["sex"] == "M" and node2 in self.get_children(sibling):
957
+ return True
958
+ if relation == "maternal nephew/niece-aunt/uncle":
959
+ for sibling in self.get_siblings(node2):
960
+ if self.get_data(sibling)["sex"] == "F" and node1 in self.get_children(sibling):
961
+ return True
962
+ if relation == "paternal nephew/niece-aunt/uncle":
963
+ for sibling in self.get_siblings(node2):
964
+ if self.get_data(sibling)["sex"] == "M" and node1 in self.get_children(sibling):
965
+ return True
966
+
967
+ if relation == "maternal grandparent-grandchild":
968
+ for child in self.get_children(node1):
969
+ if self.get_data(child)["sex"] == "F" and node2 in self.get_children(child):
970
+ return True
971
+ if relation == "paternal grandparent-grandchild":
972
+ for child in self.get_children(node1):
973
+ if self.get_data(child)["sex"] == "M" and node2 in self.get_children(child):
974
+ return True
975
+ if relation == "maternal grandchild-grandparent":
976
+ for child in self.get_children(node2):
977
+ if self.get_data(child)["sex"] == "F" and node1 in self.get_children(child):
978
+ return True
979
+ if relation == "paternal grandchild-grandparent":
980
+ for child in self.get_children(node2):
981
+ if self.get_data(child)["sex"] == "M" and node1 in self.get_children(child):
982
+ return True
983
+
984
+ if relation == "paternal half-siblings":
985
+ if self.get_father(node2):
986
+ if node1 in self.get_children(self.get_father(node2)) and self.get_mother(node1) != self.get_mother(
987
+ node2
988
+ ):
989
+ return True
990
+ if relation == "maternal half-siblings":
991
+ if self.get_mother(node2):
992
+ if node1 in self.get_children(self.get_mother(node2)) and self.get_father(node1) != self.get_father(
993
+ node2
994
+ ):
995
+ return True
996
+ return False
997
+
998
def get_relations_between_nodes(
    self, node1: str, node2: str, include_maternal_paternal: bool = False
) -> defaultdict[str, int]:
    """
    Returns a dictionary of the *1st- and 2nd-degree* relations between node1 and node2.

    Keys are relation names and values are how many times the relation holds; relations that
    do not hold are omitted. With include_maternal_paternal=True the aunt/uncle, grandparent
    and half-sibling relations are reported separately as "maternal ..." and "paternal ..."
    entries; otherwise the two sides are summed under the unqualified relation name.
    """
    direct_relations = ("parent-child", "child-parent", "siblings")
    sided_relations = (
        "aunt/uncle-nephew/niece",
        "nephew/niece-aunt/uncle",
        "grandparent-grandchild",
        "grandchild-grandparent",
        "half-siblings",
    )

    def occurrences(name: str) -> int:
        return 1 if self.is_relation_in_pedigree(node1, node2, [name]) else 0

    # Evaluate every relation first, then assemble the result in the required key order
    direct_counts = [(name, occurrences(name)) for name in direct_relations]
    sided_counts = [
        (base, occurrences(f"maternal {base}"), occurrences(f"paternal {base}")) for base in sided_relations
    ]
    double_cousin_count = occurrences("double cousins")

    ordered_counts: list[tuple[str, int]] = list(direct_counts)
    if include_maternal_paternal:
        for base, maternal, paternal in sided_counts:
            ordered_counts.append((f"maternal {base}", maternal))
            ordered_counts.append((f"paternal {base}", paternal))
        ordered_counts.append(("double cousins", double_cousin_count))
    else:
        ordered_counts.append(("double cousins", double_cousin_count))
        for base, maternal, paternal in sided_counts:
            ordered_counts.append((base, maternal + paternal))

    relations: defaultdict[str, int] = defaultdict(int)
    for name, count in ordered_counts:
        # Relations that are absent are left out entirely
        if count != 0:
            relations[name] = count
    return relations
1073
+
1074
def get_parent_child_pairs(self, include_placeholders: bool = True) -> list[tuple[str, str]]:
    """
    Gets all (parent, child) pairs in the tree.
    If include_placeholders is False, pairs involving a placeholder (numeric-named) node are left out.
    """
    return [
        (parent, child)
        for parent in self.node_to_children
        for child in self.get_children(parent)
        if include_placeholders or not (parent.isnumeric() or child.isnumeric())
    ]
1084
+
1085
def get_sibling_pairs(self, include_placeholders: bool = True) -> list[tuple[str, str]]:
    """
    Gets all (sibling, sibling) pairs in the tree.
    Note: Only gets *full* siblings. See self.get_half_sibling_pairs().

    Each pair is reported in one orientation only. If include_placeholders is False,
    pairs involving a placeholder (numeric-named) node are left out.
    """
    sibling_pairs: list[tuple[str, str]] = []
    # Set mirror of sibling_pairs so the symmetric-duplicate check is O(1) instead of a list scan
    recorded_pairs: set[tuple[str, str]] = set()
    for sibling1 in self.node_to_siblings:
        for sibling2 in self.get_siblings(sibling1):
            if not include_placeholders and (sibling1.isnumeric() or sibling2.isnumeric()):
                continue
            # Don't add symmetric duplicates
            if (sibling2, sibling1) not in recorded_pairs:
                sibling_pairs.append((sibling1, sibling2))
                recorded_pairs.add((sibling1, sibling2))
    return sibling_pairs
1098
+
1099
+ def get_aunt_uncle_nephew_niece_pairs(
1100
+ self, include_placeholders: bool = True, shared_relative_sex: str | None = None
1101
+ ) -> list[tuple[str, str]]:
1102
+ """
1103
+ Gets all (aunt/uncle, nephew/niece) pairs in the tree.
1104
+ Includes duplicates if, for example, shared_relative_sex=None and an aunt is
1105
+ both a maternal and paternal aunt to a nephew (i.e., full-sib mating).
1106
+ """
1107
+ aunt_uncle_nephew_niece_pairs: list[tuple[str, str]] = []
1108
+ for parent, child in self.get_parent_child_pairs():
1109
+ for parent_sibling in self.get_siblings(parent):
1110
+ if not shared_relative_sex or self.get_data(parent)["sex"] == shared_relative_sex:
1111
+ if include_placeholders or (not parent_sibling.isnumeric() and not child.isnumeric()):
1112
+ aunt_uncle_nephew_niece_pairs.append((parent_sibling, child))
1113
+ return aunt_uncle_nephew_niece_pairs
1114
+
1115
+ def get_grandparent_grandchild_pairs(
1116
+ self, include_placeholders: bool = True, shared_relative_sex: str | None = None
1117
+ ) -> list[tuple[str, str]]:
1118
+ """
1119
+ Gets all (grandparent, grandchild) pairs in the tree.
1120
+ Includes duplicates if, for example, a grandparent is both a maternal and paternal grandparent to a grandchild.
1121
+ """
1122
+ grandparent_grandchild_pairs: list[tuple[str, str]] = []
1123
+ for parent, child in self.get_parent_child_pairs():
1124
+ for child_child in self.get_children(child):
1125
+ if not shared_relative_sex or self.get_data(child)["sex"] == shared_relative_sex:
1126
+ if include_placeholders or (not parent.isnumeric() and not child_child.isnumeric()):
1127
+ grandparent_grandchild_pairs.append((parent, child_child))
1128
+ return grandparent_grandchild_pairs
1129
+
1130
+ def get_half_sibling_pairs(
1131
+ self, include_placeholders: bool = True, shared_relative_sex: str | None = None
1132
+ ) -> list[tuple[str, str]]:
1133
+ """
1134
+ Gets all (half-sibling, half-sibling) pairs in the tree.
1135
+ """
1136
+ half_sibling_pairs: list[tuple[str, str]] = []
1137
+ for parent, child in self.get_parent_child_pairs():
1138
+ for other_child in self.get_children(parent):
1139
+ if child != other_child and other_child not in self.get_siblings(child):
1140
+ if not shared_relative_sex or self.get_data(parent)["sex"] == shared_relative_sex:
1141
+ if include_placeholders or (not child.isnumeric() and not other_child.isnumeric()):
1142
+ # Don't add symmetric duplicates
1143
+ if (other_child, child) not in half_sibling_pairs:
1144
+ half_sibling_pairs.append((child, other_child))
1145
+ return half_sibling_pairs
1146
+
1147
def get_double_cousin_pairs(self, include_placeholders: bool = True) -> list[tuple[str, str]]:
    """
    Gets all (double cousin, double cousin) pairs in the tree.

    Candidates come from self.get_first_cousin_pairs(); a candidate is a double cousin pair when
    either both fathers are siblings and both mothers are siblings, or each father is a sibling
    of the other cousin's mother.
    NOTE(review): a pair is appended once per matching candidate/condition, so the same pair can
    appear more than once in the result — confirm callers rely on this.
    """
    found_pairs: list[tuple[str, str]] = []
    for cousin1, cousin2 in self.get_first_cousin_pairs(include_placeholders=include_placeholders):
        parents1 = (self.get_father(cousin1), self.get_mother(cousin1))
        parents2 = (self.get_father(cousin2), self.get_mother(cousin2))
        # Need both parents to be known to determine double cousins
        if not all(parents1 + parents2):
            continue
        father1, mother1 = parents1
        father2, mother2 = parents2

        parallel_link = father2 in self.get_siblings(father1) and mother2 in self.get_siblings(mother1)
        cross_link = mother2 in self.get_siblings(father1) and father2 in self.get_siblings(mother1)

        for linked in (parallel_link, cross_link):
            # Skip if the pair was already recorded in the opposite orientation
            if linked and (cousin2, cousin1) not in found_pairs:
                found_pairs.append((cousin1, cousin2))
    return found_pairs
1171
+
1172
def get_half_aunt_uncle_nephew_niece_pairs(self, include_placeholders: bool = True) -> list[tuple[str, str]]:
    """
    Gets all (half-aunt/half-uncle, half-nephew/half-niece) pairs in the tree.
    For every half-sibling pair, each one is the half-aunt/uncle of the other's children.
    """
    pairs: list[tuple[str, str]] = []
    for half_sibling1, half_sibling2 in self.get_half_sibling_pairs():
        # First the children of half_sibling1 (with half_sibling2 as aunt/uncle), then the reverse
        for parent, half_aunt_uncle in ((half_sibling1, half_sibling2), (half_sibling2, half_sibling1)):
            for half_nephew_niece in self.get_children(parent):
                if half_nephew_niece == half_aunt_uncle:
                    continue
                if not include_placeholders and (half_aunt_uncle.isnumeric() or half_nephew_niece.isnumeric()):
                    continue
                pairs.append((half_aunt_uncle, half_nephew_niece))
    return pairs
1188
+
1189
def get_greatgrandparent_greatgrandchild_pairs(self, include_placeholders: bool = True) -> list[tuple[str, str]]:
    """
    Gets all (greatgrandparent, greatgrandchild) pairs in the tree.
    Built by extending every (grandparent, grandchild) pair by one more generation.
    """
    return [
        (greatgrandparent, greatgrandchild)
        for greatgrandparent, grandchild in self.get_grandparent_grandchild_pairs()
        for greatgrandchild in self.get_children(grandchild)
        if include_placeholders or not (greatgrandparent.isnumeric() or greatgrandchild.isnumeric())
    ]
1199
+
1200
def get_grandaunt_granduncle_grandnephew_grandniece_pairs(
    self, include_placeholders: bool = True
) -> list[tuple[str, str]]:
    """
    Gets all (grandaunt/uncle, grandnephew/niece) pairs in the tree.
    Built by pairing every aunt/uncle with the children of their nephews/nieces.
    """
    return [
        (grandaunt_granduncle, grandnephew_grandniece)
        for grandaunt_granduncle, nephew_niece in self.get_aunt_uncle_nephew_niece_pairs()
        for grandnephew_grandniece in self.get_children(nephew_niece)
        if include_placeholders or not (grandaunt_granduncle.isnumeric() or grandnephew_grandniece.isnumeric())
    ]
1212
+
1213
def get_first_cousin_pairs(self, include_placeholders: bool = True) -> list[tuple[str, str]]:
    """
    Gets all (first cousin, first cousin) pairs in the tree.

    A pair is never reported in both orientations, but the same orientation can appear
    more than once when two nodes are cousins through more than one aunt/uncle.
    If include_placeholders is False, pairs involving a placeholder (numeric-named) node are left out.
    """
    cousin_pairs: list[tuple[str, str]] = []
    # Set mirror of cousin_pairs so the symmetric-duplicate check is O(1) instead of a list scan
    recorded_pairs: set[tuple[str, str]] = set()
    for aunt_uncle, child in self.get_aunt_uncle_nephew_niece_pairs():
        for aunt_uncle_child in self.get_children(aunt_uncle):
            if not include_placeholders and (child.isnumeric() or aunt_uncle_child.isnumeric()):
                continue
            # Don't add symmetric duplicates
            if aunt_uncle_child != child and (aunt_uncle_child, child) not in recorded_pairs:
                cousin_pairs.append((child, aunt_uncle_child))
                recorded_pairs.add((child, aunt_uncle_child))
    return cousin_pairs
1225
+
1226
def get_related_pairs(self, include_placeholders: bool = True) -> set[tuple[str, str]]:
    """
    Gets all related pairs (up to and including 3rd-degree relations) in the pedigree.
    The result is the union of the outputs of the individual pair getters.
    """
    pair_getters = (
        self.get_parent_child_pairs,
        self.get_sibling_pairs,
        self.get_aunt_uncle_nephew_niece_pairs,
        self.get_grandparent_grandchild_pairs,
        self.get_half_sibling_pairs,
        self.get_half_aunt_uncle_nephew_niece_pairs,
        self.get_greatgrandparent_greatgrandchild_pairs,
        self.get_grandaunt_granduncle_grandnephew_grandniece_pairs,
        self.get_first_cousin_pairs,
    )
    related_pairs: set[tuple[str, str]] = set()
    for get_pairs in pair_getters:
        related_pairs |= set(get_pairs(include_placeholders=include_placeholders))
    return related_pairs
1243
+
1244
def get_non_placeholder_nodes(self) -> set[str]:
    """
    Gets all non-placeholder nodes in the tree.
    Nodes whose name is purely numeric are treated as placeholders.
    """
    return {name for name in self.node_to_data if not name.isnumeric()}
1249
+
1250
def clean_data(self) -> None:
    """
    Remove any empty entries in the relation dictionaries.
    Also remove unnecessary placeholder nodes to standardize topological sort output.

    A placeholder parent couple is considered unnecessary when both parents of a node are
    placeholders, each has exactly one child, and neither has a recorded parent of its own.
    """
    removable_placeholders: set[str] = set()
    for node in self.node_to_data:
        mother = self.get_mother(node)
        father = self.get_father(node)
        if not (mother.isnumeric() and father.isnumeric()):
            continue
        if len(self.get_children(mother)) != 1 or len(self.get_children(father)) != 1:
            continue
        has_grandparent = (
            self.get_mother(mother)
            or self.get_father(mother)
            or self.get_mother(father)
            or self.get_father(father)
        )
        if not has_grandparent:
            removable_placeholders.update((mother, father))

    # Drop the placeholders' own entries from every dictionary
    all_dicts = (
        self.node_to_data,
        self.node_to_father,
        self.node_to_mother,
        self.node_to_children,
        self.node_to_siblings,
    )
    for placeholder in removable_placeholders:
        for data_dict in all_dicts:
            data_dict.pop(placeholder, None)

    # Drop the parent links that pointed at the removed placeholders
    for node in self.node_to_data:
        assert node not in self.get_siblings(node) and node not in self.get_children(node)
        if self.get_father(node) in removable_placeholders:
            del self.node_to_father[node]
        if self.get_mother(node) in removable_placeholders:
            del self.node_to_mother[node]

    # Finally purge keys whose value is empty
    for relation_dict in (self.node_to_father, self.node_to_mother, self.node_to_children, self.node_to_siblings):
        empty_keys = [key for key, value in relation_dict.items() if not value]
        for key in empty_keys:
            del relation_dict[key]
1295
+
1296
def plot(
    self,
    path: str,
    mt_haplogroup_to_color: dict[str, str] | dict[str, tuple[float, float, float, float]] | None = None,
    nodes_to_remove: list[str] | None = None,
    edges_to_remove: list[tuple[str, str]] | None = None,
    dotted_edges_to_add: list[tuple[str, str]] | None = None,
    plot_haplogroups: bool = True,
    font_size: float | None = None,
) -> None:
    """
    Plot the pedigree to the given path. Optionally takes a custom mapping of mt_haplogroups to colors.
    Also optionally takes arguments to plot uncertain relations.
    nodes_to_remove is a list of nodes to remove from the plot.
    edges_to_remove is a list of parent-child edges to remove from the plot.
    dotted_edges_to_add is a list of node pairs to plot as dotted lines.
    These arguments can be used in conjunction to replace uncertain relations with dotted lines.

    plot_haplogroups controls whether haplogroups are included in the node labels.
    font_size overrides the label font size, which is otherwise derived from the node count.
    Raises ImportError if PyGraphviz is not installed.
    """
    # "import importlib" alone does not guarantee that the importlib.util submodule is loaded,
    # so import it explicitly before using find_spec
    import importlib.util

    if not importlib.util.find_spec("pygraphviz"):
        raise ImportError("Plotting pedigree requires PyGraphviz (https://pygraphviz.github.io/).")

    tree = nx.from_dict_of_lists(self.node_to_children, create_using=nx.DiGraph)
    # Add childless nodes
    for node in self.node_to_data:
        if node not in tree.nodes:
            tree.add_node(node)

    # Replace relations with dotted edges
    if nodes_to_remove:
        tree.remove_nodes_from(nodes_to_remove)
    if edges_to_remove:
        tree.remove_edges_from(edges_to_remove)
    if dotted_edges_to_add:
        tree.add_edges_from(dotted_edges_to_add, style="dotted")
    # Edges without an explicit "style" attribute are regular parent-child edges
    parent_child_edges = [
        (u, v) for u, v, style in tree.edges.data("style", default="parent_child") if style == "parent_child"
    ]
    dotted_edges = [(u, v) for u, v, style in tree.edges.data("style", default="parent_child") if style == "dotted"]

    # Split nodes by sex and placeholder status; each group is drawn with its own shape/color
    male_named_nodes = [node for node in tree.nodes if self.get_data(node)["sex"] == "M" and not node.isnumeric()]
    male_placeholder_nodes = [node for node in tree.nodes if self.get_data(node)["sex"] == "M" and node.isnumeric()]
    female_named_nodes = [node for node in tree.nodes if self.get_data(node)["sex"] == "F" and not node.isnumeric()]
    female_placeholder_nodes = [
        node for node in tree.nodes if self.get_data(node)["sex"] == "F" and node.isnumeric()
    ]

    node_labels = dict()
    for node in tree.nodes:
        # Labels show at most the first three characters of each haplogroup
        mt_haplogroup = self.get_data(node)["mt_haplogroup"].replace("*", "")[:3]
        y_haplogroup = self.get_data(node)["y_haplogroup"].replace("*", "")[:3]
        if node.isnumeric():
            # Placeholder nodes are never labeled with their (numeric) name
            if not plot_haplogroups:
                node_labels[node] = ""
            elif y_haplogroup:
                node_labels[node] = f"MT: {mt_haplogroup}\nY: {y_haplogroup}"
            else:
                node_labels[node] = f"MT: {mt_haplogroup}"
        else:
            if not plot_haplogroups:
                node_labels[node] = node
            elif y_haplogroup:
                node_labels[node] = f"{node}\nMT: {mt_haplogroup}\nY: {y_haplogroup}"
            else:
                node_labels[node] = f"{node}\nMT: {mt_haplogroup}"

    # Create colormap for MT haplogroups
    if not mt_haplogroup_to_color:
        cmap = plt.get_cmap("tab20")
        mt_haplogroups = sorted(
            {self.get_data(node)["mt_haplogroup"].replace("*", "") for node in self.node_to_data if not node.isnumeric()}
        )
        mt_haplogroup_to_color = {
            haplogroup: cmap(i / len(mt_haplogroups)) for i, haplogroup in enumerate(mt_haplogroups)
        }

    # Specify alpha here instead of in nx.draw_networkx_nodes so node borders stay opaque
    face_alpha = 0.5
    male_named_node_colors = [
        to_rgba(mt_haplogroup_to_color[self.get_data(node)["mt_haplogroup"].replace("*", "")], face_alpha)
        for node in male_named_nodes
    ]
    female_named_node_colors = [
        to_rgba(mt_haplogroup_to_color[self.get_data(node)["mt_haplogroup"].replace("*", "")], face_alpha)
        for node in female_named_nodes
    ]
    male_placeholder_node_colors = [to_rgba("#e5e5e5", face_alpha) for node in male_placeholder_nodes]
    female_placeholder_node_colors = [to_rgba("#e5e5e5", face_alpha) for node in female_placeholder_nodes]

    plt.figure(figsize=(12, 4.8), dpi=1200)
    # Scale sizes based on pedigree node count
    node_size = min(1000, 9000 / len(tree.nodes))
    # Matplotlib doesn't allow font size less than 1
    if font_size is None and plot_haplogroups:
        font_size = max(math.sqrt(node_size) / 5, 1)
    elif font_size is None and not plot_haplogroups:
        font_size = max(math.sqrt(node_size) / 4.25, 1)
    line_width = math.sqrt(node_size) / 100

    pos = nx.nx_agraph.graphviz_layout(tree, prog="dot")
    # Males are drawn as squares and females as circles
    nx.draw_networkx_nodes(
        tree,
        pos=pos,
        nodelist=male_named_nodes,
        node_shape="s",
        node_size=node_size,
        node_color=male_named_node_colors,
        edgecolors="black",
        linewidths=line_width,
    )
    nx.draw_networkx_nodes(
        tree,
        pos=pos,
        nodelist=female_named_nodes,
        node_shape="o",
        node_size=node_size,
        node_color=female_named_node_colors,
        edgecolors="black",
        linewidths=line_width,
    )
    nx.draw_networkx_nodes(
        tree,
        pos=pos,
        nodelist=male_placeholder_nodes,
        node_shape="s",
        node_size=node_size,
        node_color=male_placeholder_node_colors,
        edgecolors="black",
        linewidths=line_width,
    )
    nx.draw_networkx_nodes(
        tree,
        pos=pos,
        nodelist=female_placeholder_nodes,
        node_shape="o",
        node_size=node_size,
        node_color=female_placeholder_node_colors,
        edgecolors="black",
        linewidths=line_width,
    )
    nx.draw_networkx_labels(tree, pos=pos, labels=node_labels, font_size=font_size)
    nx.draw_networkx_edges(
        tree,
        edgelist=parent_child_edges,
        pos=pos,
        node_shape="s",
        node_size=node_size,
        width=line_width,
        arrowsize=line_width * 30,
        edge_color="black",
    )
    # Setting arrows=False causes edges to overlap their associated nodes for some reason
    nx.draw_networkx_edges(
        tree,
        edgelist=dotted_edges,
        pos=pos,
        node_shape="s",
        node_size=node_size,
        width=line_width * 1.5,
        arrowstyle="-",
        style=(0, (3, 3)),
        edge_color="blue",
    )

    plt.axis("off")
    plt.savefig(path, bbox_inches="tight")
    plt.close()
1468
+
1469
def write_exact_relations(self, path: str) -> None:
    """
    Write the exact relations in the pedigree to a file.

    The output is a CSV with header "id1,id2,relation". Every unordered pair of
    non-placeholder nodes is visited once (ids in sorted order), and a relation that
    holds more than once between a pair is written once per occurrence.
    """
    named_nodes = sorted(self.get_non_placeholder_nodes())
    with open(path, "w") as file:
        file.write("id1,id2,relation\n")
        for position, node1 in enumerate(named_nodes):
            # Only pair node1 with nodes that come after it in sorted order
            for node2 in named_nodes[position + 1 :]:
                shared_relations = self.get_relations_between_nodes(node1, node2, include_maternal_paternal=True)
                for relation, count in shared_relations.items():
                    # One output row per occurrence of the relation
                    file.write(f"{node1},{node2},{relation}\n" * count)