repare 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of repare might be problematic. Click here for more details.

repare/pedigree.py ADDED
@@ -0,0 +1,1214 @@
1
+ import importlib
2
+ import math
3
+ from collections import defaultdict, deque
4
+
5
+ import matplotlib.pyplot as plt
6
+ import networkx as nx
7
+
8
+
9
+ class Pedigree:
10
+ """
11
+ Describes a pedigree configuration for a set of nodes.
12
+ """
13
+
14
def __init__(self) -> None:
    """
    Create an empty pedigree: no nodes, no relations, no placeholders.
    """
    # Counter used to mint placeholder IDs (see fill_node_parents, which stringifies it)
    self.num_placeholders = 0
    # Node ID -> attribute dict (the keys are written by add_node)
    self.node_to_data: dict[str, dict[str, str | bool | float]] = {}
    # Node ID -> parent ID; a missing entry reads as "" (no known parent)
    self.node_to_father: defaultdict[str, str] = defaultdict(str)
    self.node_to_mother: defaultdict[str, str] = defaultdict(str)
    # Node ID -> set of related node IDs; a missing entry reads as an empty set
    self.node_to_children: defaultdict[str, set[str]] = defaultdict(set)
    self.node_to_siblings: defaultdict[str, set[str]] = defaultdict(set)
21
+
22
def __deepcopy__(self, memo: dict) -> "Pedigree":
    """
    Custom (faster) deepcopy implementation.

    Each per-node data dict and each children/sibling set is copied with its own .copy(),
    and the father/mother maps are copied as a whole; the values stored inside are not copied further.
    """
    clone = self.__class__.__new__(self.__class__)
    # Register the copy under the original's id before populating it
    memo[id(self)] = clone

    clone.num_placeholders = self.num_placeholders
    clone.node_to_data = {node: data.copy() for node, data in self.node_to_data.items()}

    clone.node_to_father = self.node_to_father.copy()
    clone.node_to_mother = self.node_to_mother.copy()
    clone.node_to_children = defaultdict(
        set, ((node, children.copy()) for node, children in self.node_to_children.items())
    )
    clone.node_to_siblings = defaultdict(
        set, ((node, siblings.copy()) for node, siblings in self.node_to_siblings.items())
    )
    return clone
44
+
45
def get_topo_sort(self) -> tuple[str, ...]:
    """
    Gets pedigree topological sort of the Pedigree. See https://doi.org/10.1089/cmb.2011.0254.

    Every childless node is taken in sorted order; its ancestors are emitted father's side first,
    then mother's side, then the node itself, and each such path is closed with an empty string.
    Placeholder nodes (numeric IDs) are finally replaced by a label built from their sex,
    their order of first appearance and their haplogroups.
    """
    ordering: list[str] = []

    def emit_after_ancestors(node: str) -> None:
        father = self.node_to_father[node]
        if father:
            # Father's lineage first, then mother's
            emit_after_ancestors(father)
            emit_after_ancestors(self.node_to_mother[node])
        else:
            # No father == no mother
            assert not self.node_to_mother[node]
        ordering.append(node)

    childless = sorted(node for node in self.node_to_data if not self.node_to_children[node])
    for leaf in childless:
        emit_after_ancestors(leaf)
        # Break between each node's path
        ordering.append("")

    # Re-label placeholder nodes by order of first appearance
    placeholder_rank: dict[str, int] = {}
    labelled: list[str] = []
    for node in ordering:
        if not node.isnumeric():
            labelled.append(node)
            continue
        rank = placeholder_rank.setdefault(node, len(placeholder_rank))
        data = self.node_to_data[node]
        mt_haplogroup = data["mt_haplogroup"]
        if data["sex"] == "M":
            # Unique identifier for placeholder
            labelled.append(f"M {rank} {mt_haplogroup} {data['y_haplogroup']}")
        else:
            labelled.append(f"F {rank} {mt_haplogroup}")
    return tuple(labelled)
83
+
84
def add_node(
    self,
    node_id: str,
    sex: str,
    y_haplogroup: str,
    mt_haplogroup: str,
    can_have_children: bool,
    can_be_inbred: bool,
    years_before_present: float,
) -> None:
    """
    Add a node to the pedigree. If haplogroup is unknown, set argument to empty string ("").

    Any existing entry for node_id is replaced.

    Raises:
        ValueError: if a Y haplogroup is supplied for a node whose sex is "F".
            In that case the pedigree is left untouched.
    """
    # Validate before writing so a rejected node does not leave a half-filled entry behind
    if y_haplogroup and sex == "F":
        raise ValueError("Only males can have y_haplogroup values.")

    self.node_to_data[node_id] = {
        "sex": sex,
        "y_haplogroup": y_haplogroup,
        "mt_haplogroup": mt_haplogroup,
        "can_have_children": can_have_children,
        "can_be_inbred": can_be_inbred,
        "years_before_present": years_before_present,
    }
106
+
107
def add_parent_relation(self, node1: str, node2: str) -> None:
    """
    Adds a parent-child relationship to the tree.
    Node 1 is the parent and Node 2 is the child.
    Note: Overwrites existing parent, does not merge.

    Node 1 is stored as the father when its sex is "M" and as the mother otherwise.
    Once Node 2 has both parents, it is registered as a sibling of every other child of
    Node 1 that has the same father and mother.
    """
    assert node1 != node2
    assert node1 in self.node_to_data and node2 in self.node_to_data

    parent_map = self.node_to_father if self.node_to_data[node1]["sex"] == "M" else self.node_to_mother
    parent_map[node2] = node1
    self.node_to_children[node1].add(node2)

    # Sibling relations can only be derived once Node 2 has 2 known parents
    parents_of_node2 = (self.node_to_father[node2], self.node_to_mother[node2])
    if not (parents_of_node2[0] and parents_of_node2[1]):
        return

    for other_child in self.node_to_children[node1]:
        if other_child == node2:
            continue
        if (self.node_to_father[other_child], self.node_to_mother[other_child]) == parents_of_node2:
            self.add_sibling_relation(other_child, node2)
132
+
133
def add_sibling_relation(self, node1: str, node2: str) -> None:
    """
    Adds a sibling relationship to the tree.
    Note: Does not merge parents.

    The relation is propagated to the whole sibling group: Node 1, Node 2 and every node already
    recorded as a sibling of either one all become siblings of each other. This also covers the
    case where both nodes already had siblings of their own, so two existing sibling groups are
    joined completely rather than only through Node 1 and Node 2.
    """
    assert node1 != node2
    assert node1 in self.node_to_data and node2 in self.node_to_data

    sibling_group = {node1, node2} | self.node_to_siblings[node1] | self.node_to_siblings[node2]
    for member in sibling_group:
        # In-place union keeps the existing set objects; a node is never its own sibling
        self.node_to_siblings[member] |= sibling_group - {member}
153
+
154
def merge_nodes(self, node1: str, node2: str) -> None:
    """
    Merge the two nodes as if they were one person. Note this involves merging the nodes' ancestors.
    When merging nodes, if one is named and one is a placeholder, keep the named one's name.
    If both are placeholders (doesn't matter) or both are named (will be pruned), just keep any name.

    The merge is driven by a queue of node pairs. For each pair, the discarded node's children,
    siblings and parents are re-attached to the kept node; if both nodes have a father (or a mother),
    those two parents are queued as a further pair to merge. The discarded node is then removed
    from every lookup table.
    """
    assert node1 in self.node_to_data and node2 in self.node_to_data

    # Pairs of nodes to merge
    pair_queue: deque[tuple[str, str]] = deque([(node1, node2)])
    while pair_queue:
        node1, node2 = pair_queue.popleft()
        # Queued pairs are renamed after each merge (see below), so a pair may have collapsed to (x, x)
        if node1 != node2:
            # Placeholder IDs are numeric strings; a non-numeric node1 is kept, otherwise node2 is kept
            name_to_keep = node1 if not node1.isnumeric() else node2
            name_to_discard = node2 if name_to_keep == node1 else node1

            # Update relations for relatives of the discarded node
            name_to_discard_father = self.node_to_father[name_to_discard]
            name_to_discard_mother = self.node_to_mother[name_to_discard]
            if name_to_discard_father:
                self.node_to_children[name_to_discard_father].remove(name_to_discard)
            if name_to_discard_mother:
                self.node_to_children[name_to_discard_mother].remove(name_to_discard)

            for child in self.node_to_children[name_to_discard]:
                # Merging a parent and child - we will see this when there is inbreeding
                if name_to_keep == child:
                    # Drop the kept node's link to the parent it is being merged with.
                    # NOTE(review): the link is chosen by the kept node's sex, which presumably
                    # equals the discarded node's sex - confirm against callers.
                    if self.node_to_data[name_to_keep]["sex"] == "M":
                        del self.node_to_father[name_to_keep]
                    else:
                        del self.node_to_mother[name_to_keep]
                else:
                    # This also handles having the correct name of the merged parents from last loop iteration
                    self.add_parent_relation(name_to_keep, child)

            # Remove all occurrences of name_to_discard in its sibling's sibling sets first
            # so that self.add_sibling_relation() does not add it back in
            for sibling in self.node_to_siblings[name_to_discard]:
                self.node_to_siblings[sibling].remove(name_to_discard)
            for sibling in self.node_to_siblings[name_to_discard]:
                if sibling != name_to_keep:
                    self.add_sibling_relation(sibling, name_to_keep)

            # Recursively merge parent relations of Node 1 and Node 2
            father1 = self.node_to_father[name_to_keep]
            father2 = self.node_to_father[name_to_discard]
            mother1 = self.node_to_mother[name_to_keep]
            mother2 = self.node_to_mother[name_to_discard]
            if father1 and father2:
                # Both have a father: the two fathers must themselves be merged later
                pair_queue.append((father1, father2))
            elif father2:
                if father2 == name_to_keep:
                    # The discarded node's father is the kept node itself; no self-parent link is created
                    del self.node_to_father[name_to_keep]
                else:
                    # Switch name_to_keep's father to name_to_discard's father
                    self.add_parent_relation(father2, name_to_keep)
            if mother1 and mother2:
                # Both have a mother: the two mothers must themselves be merged later
                pair_queue.append((mother1, mother2))
            elif mother2:
                if mother2 == name_to_keep:
                    # The discarded node's mother is the kept node itself; no self-parent link is created
                    del self.node_to_mother[name_to_keep]
                else:
                    # Switch name_to_keep's mother to name_to_discard's mother
                    self.add_parent_relation(mother2, name_to_keep)

            # Update any nodes in the queue whose names might have been changed
            for idx, (node1, node2) in enumerate(pair_queue):
                if node1 == name_to_discard and node2 == name_to_discard:
                    pair_queue[idx] = (name_to_keep, name_to_keep)
                elif node1 == name_to_discard:
                    pair_queue[idx] = (name_to_keep, node2)
                elif node2 == name_to_discard:
                    pair_queue[idx] = (node1, name_to_keep)

            # The lookups above went through the defaultdicts, so every key deleted here exists
            del self.node_to_data[name_to_discard]
            del self.node_to_father[name_to_discard]
            del self.node_to_mother[name_to_discard]
            del self.node_to_children[name_to_discard]
            del self.node_to_siblings[name_to_discard]
233
+
234
def check_cycles_if_merged(self, node1: str, node2: str) -> bool:
    """
    Returns True if merging Node 1 and Node 2 (and their ancestors) would result in a cycle.

    The pedigree itself is not merged. The method first collects the groups of nodes that a merge
    would fuse, then runs a depth-first search over parent -> child edges in which every member
    of a group is treated as the same node.
    """
    # Get sets of nodes that would be merged if Node 1 and Node 2 were merged (i.e. ancestors of Node 1 and Node 2)
    # Note that we get sets and not just pairs because of potential inbreeding,
    # for example if one node is both a parent and grandparent of another node
    merge_sets: list[set[str]] = []
    merge_queue = deque([(node1, node2)])
    while merge_queue:
        curr_node1, curr_node2 = merge_queue.popleft()
        # Update merge sets
        if curr_node1 != curr_node2:
            updated = False
            for merge_set in merge_sets:
                # NOTE(review): only the first set containing either node is extended; if the two
                # nodes already sit in different sets, those sets are not joined - confirm intended
                if curr_node1 in merge_set or curr_node2 in merge_set:
                    merge_set.update([curr_node1, curr_node2])
                    updated = True
                    break
            if not updated:
                merge_sets.append({curr_node1, curr_node2})
        # Add parents to the queue
        curr_father1 = self.node_to_father[curr_node1]
        curr_father2 = self.node_to_father[curr_node2]
        curr_mother1 = self.node_to_mother[curr_node1]
        curr_mother2 = self.node_to_mother[curr_node2]
        # Parents are only paired up when both nodes have that parent
        if curr_father1 and curr_father2:
            merge_queue.append((curr_father1, curr_father2))
        if curr_mother1 and curr_mother2:
            merge_queue.append((curr_mother1, curr_mother2))

    # DFS cycle detection
    def dfs(node):
        # Find the merge set (if any) this node belongs to; the first match wins
        merged_nodes: set[str] | None = None
        for merge_set in merge_sets:
            if node in merge_set:
                merged_nodes = merge_set
                break

        if merged_nodes:
            # Reaching a node that is still on the current DFS path means a cycle
            if node in in_progress:
                return True
            if node in explored:
                return False
            # The whole merge set is entered at once, as if it were a single node
            in_progress.update(merged_nodes)
            # Children of every member of the set are children of the merged node
            for child in [child for node in merged_nodes for child in self.node_to_children[node]]:
                if dfs(child):
                    return True
            in_progress.difference_update(merged_nodes)
            explored.update(merged_nodes)
        else:
            if node in in_progress:
                return True
            if node in explored:
                return False
            in_progress.add(node)
            for child in self.node_to_children[node]:
                if dfs(child):
                    return True
            in_progress.remove(node)
            explored.add(node)
        return False

    # Shared DFS state: nodes fully processed, and nodes on the current DFS path
    explored: set[str] = set()
    in_progress: set[str] = set()
    # Check for cycles starting from each node
    for node in self.node_to_data:
        if dfs(node):
            # Cycle detected
            return True
    return False
305
+
306
def fill_node_parents(self, node: str) -> None:
    """
    If the given node doesn't have parents, add placeholder parents.
    If it does, do nothing.

    Each missing parent is handled independently. A placeholder gets the next numeric ID,
    wildcard ("*") haplogroups (no Y haplogroup for the mother) and an unknown date, and is
    also made the parent of every recorded sibling of the node.
    """
    assert node in self.node_to_data

    current_father = self.node_to_father[node]
    current_mother = self.node_to_mother[node]

    # (existing parent, sex of the placeholder, its Y haplogroup) - father is handled first
    parent_slots = (
        (current_father, "M", "*"),
        (current_mother, "F", ""),
    )
    for existing_parent, placeholder_sex, placeholder_y in parent_slots:
        if existing_parent:
            continue

        placeholder_id = str(self.num_placeholders)
        self.add_node(
            node_id=placeholder_id,
            sex=placeholder_sex,
            y_haplogroup=placeholder_y,
            mt_haplogroup="*",
            can_have_children=True,
            can_be_inbred=True,
            years_before_present=math.nan,
        )

        self.add_parent_relation(placeholder_id, node)
        for sibling in self.node_to_siblings[node]:
            self.add_parent_relation(placeholder_id, sibling)
        self.num_placeholders += 1
349
+
350
def update_haplogroups(self) -> None:
    """
    Update haplogroups of placeholder nodes.

    For every node, its Y haplogroup is pushed along the male line (father and sons) and its
    mitochondrial haplogroup along the maternal line (mother, plus children when the node is female).
    A relative is only rewritten when its haplogroup contains a "*" and its known part is a proper
    prefix of the source haplogroup; the rewritten value always ends in "*". Propagation continues
    only through relatives that were rewritten.
    """
    for source in self.node_to_data:
        # Y chromosome: father and sons
        source_y: str = self.node_to_data[source]["y_haplogroup"]
        pending: deque[str] = deque([self.node_to_father[source]])
        pending.extend(
            child for child in self.node_to_children[source] if self.node_to_data[child]["sex"] == "M"
        )

        while pending:
            relative = pending.popleft()
            if not relative:
                continue
            relative_y = self.node_to_data[relative]["y_haplogroup"]
            if "*" not in relative_y:
                continue
            known_part = relative_y.rstrip("*")
            if known_part == source_y.rstrip("*"):
                continue
            # Overwrite/extend Y haplogroup only if its known part is a prefix of the source haplogroup
            if not source_y.startswith(known_part):
                continue
            self.node_to_data[relative]["y_haplogroup"] = source_y if source_y[-1] == "*" else source_y + "*"
            pending.append(self.node_to_father[relative])
            # Only males have Y chromosome
            pending.extend(
                son for son in self.node_to_children[relative] if self.node_to_data[son]["sex"] == "M"
            )

        # Mitochondrial DNA: mother, and children of females
        source_mt: str = self.node_to_data[source]["mt_haplogroup"]
        pending = deque([self.node_to_mother[source]])
        # Only females pass on mitochondrial DNA to children
        if self.node_to_data[source]["sex"] == "F":
            pending.extend(self.node_to_children[source])

        while pending:
            relative = pending.popleft()
            if not relative:
                continue
            relative_mt = self.node_to_data[relative]["mt_haplogroup"]
            if "*" not in relative_mt:
                continue
            known_part = relative_mt.rstrip("*")
            if known_part == source_mt.rstrip("*"):
                continue
            # Overwrite/extend mitochondrial haplogroup only if its known part is a prefix
            # of the source haplogroup
            if not source_mt.startswith(known_part):
                continue
            self.node_to_data[relative]["mt_haplogroup"] = source_mt if source_mt[-1] == "*" else source_mt + "*"
            pending.append(self.node_to_mother[relative])
            if self.node_to_data[relative]["sex"] == "F":
                pending.extend(self.node_to_children[relative])
403
+
404
def validate_members(self, members: set) -> bool:
    """
    Validates this tree based on the member nodes it should contain.

    Returns True only when the pedigree's non-placeholder nodes are exactly `members`;
    a mismatch means nodes went missing (because of invalid merging).
    """
    return self.get_non_placeholder_nodes() == members
411
+
412
def validate_haplogroups(self) -> bool:
    """
    Validates that all haplogroups are consistent.

    Mitochondrial haplogroups are compared for every mother-child pair and Y haplogroups for
    every father-son pair. A haplogroup containing "*" only fixes a prefix; one without "*" is exact.
    """

    def compatible(first: str, second: str) -> bool:
        first_is_open = "*" in first
        second_is_open = "*" in second
        if first_is_open and second_is_open:
            # Two open haplogroups agree when either known part is a prefix of the other haplogroup
            return first.startswith(second.rstrip("*")) or second.startswith(first.rstrip("*"))
        if first_is_open:
            return second.startswith(first.rstrip("*"))
        if second_is_open:
            return first.startswith(second.rstrip("*"))
        # Both fully specified
        return first == second

    for parent, child in self.get_parent_child_pairs():
        parent_sex = self.node_to_data[parent]["sex"]
        if parent_sex == "F":
            inherited = "mt_haplogroup"
        elif parent_sex == "M" and self.node_to_data[child]["sex"] == "M":
            inherited = "y_haplogroup"
        else:
            # Father-daughter pairs share neither haplogroup
            continue
        if not compatible(self.node_to_data[parent][inherited], self.node_to_data[child][inherited]):
            return False
    return True
441
+
442
def validate_can_have_children(self) -> bool:
    """
    Validates that nodes that cannot have children do not have children.

    Only non-placeholder nodes are checked.
    """
    return not any(
        self.node_to_children[node] and not self.node_to_data[node]["can_have_children"]
        for node in self.get_non_placeholder_nodes()
    )
450
+
451
def validate_inbreeding(self) -> bool:
    """
    Validates that nodes that are known to be not inbred are not inbred.

    A non-placeholder node fails the check when it is flagged as not inbred and its
    father and mother appear (in either order) among the pedigree's related pairs.
    """
    related_pairs = self.get_related_pairs()
    for node in self.get_non_placeholder_nodes():
        if self.node_to_data[node]["can_be_inbred"]:
            continue
        father = self.node_to_father[node]
        mother = self.node_to_mother[node]
        if (father, mother) in related_pairs or (mother, father) in related_pairs:
            return False
    return True
463
+
464
def validate_years_before_present(self) -> bool:
    """
    Validates that nodes do not postdate their descendants.

    Walks up from every childless node, carrying the date of the closest dated descendant.
    Nodes whose date is NaN are skipped for the comparison but still traversed.
    """
    childless: list[str] = [node for node in self.node_to_data if not self.node_to_children[node]]

    def lineage_is_ordered(node: str, descendant_date: float) -> bool:
        node_date = self.node_to_data[node]["years_before_present"]
        if not math.isnan(node_date):
            # Fewer years before present than a descendant means the node postdates it
            if node_date < descendant_date:
                return False
            descendant_date = node_date

        father = self.node_to_father[node]
        if not father:
            return True
        # A node with a father is expected to have a mother as well
        assert self.node_to_mother[node]
        return lineage_is_ordered(father, descendant_date) and lineage_is_ordered(
            self.node_to_mother[node], descendant_date
        )

    return all(lineage_is_ordered(node, float("-inf")) for node in childless)
492
+
493
def validate_forced_constraints(
    self, pair_to_relations_so_far: defaultdict[tuple[str, str], list[tuple[str, str, bool]]]
) -> bool:
    """
    Validates that forced constraints so far are present in the pedigree.
    Note: Additional relations between two nodes are allowed as long as the forced constraints are present.

    Each entry is a (degree, ";"-separated constraints, forced flag) tuple; only entries whose
    flag is set are checked, and any one of the listed relations being present is enough.
    """
    forced_checks = (
        (node1, node2, constraints.split(";"))
        for (node1, node2), relations in pair_to_relations_so_far.items()
        for _, constraints, is_forced in relations
        if is_forced
    )
    return all(
        self.is_relation_in_pedigree(node1, node2, allowed_relations)
        for node1, node2, allowed_relations in forced_checks
    )
505
+
506
def count_inconsistencies(
    self,
    pair_to_constraints: defaultdict[tuple[str, str], list[tuple[str, ...]]],
    pair_to_relations_so_far: defaultdict[tuple[str, str], list[tuple[str, str, bool]]],
    check_half_siblings: bool,
) -> tuple[int, list[tuple[str, str, str, str]]]:
    """
    Validates this tree based on the input relation data.
    If check_half_siblings is False, don't check for extraneous half-sibling relations
    because the 2 non-shared parents might be merged later.
    Returns count of inconsistencies with the input data as well as a log of the inconsistencies.
    Note: pair_to_constraints values must be sorted by increasing length
    so that specific constraints are checked first.

    Two kinds of inconsistency are logged:
      "+<degree>": a 1st/2nd-degree relation exists in the pedigree but not in the input data.
      "-<degree>": an input relation from pair_to_relations_so_far is missing from the pedigree.
    Each log entry is (node1, node2, "+"/"-" followed by the degree, constraints); the constraints
    field is "" for "+" entries.
    """
    for node1, node2 in pair_to_constraints:
        # Ensure no duplicate/symmetric entries
        assert (node2, node1) not in pair_to_constraints
    # Marks which entries in pair_to_constraints have been seen/used
    pair_to_constraints_seen_entries: defaultdict[tuple[str, str], set[int]] = defaultdict(set)

    def is_relation_in_input_data(node1: str, node2: str, relation: str) -> bool:
        # True if some not-yet-used input entry for (node1, node2) allows this relation
        if (node1, node2) in pair_to_constraints:
            for idx, constraints in enumerate(pair_to_constraints[(node1, node2)]):
                if idx not in pair_to_constraints_seen_entries[(node1, node2)] and relation in constraints:
                    return True
        return False

    def remove_relation_from_input_data(node1: str, node2: str, relation: str) -> None:
        # Marks the first input entry for (node1, node2) that allows this relation as used.
        # NOTE(review): unlike is_relation_in_input_data, this does not skip entries that are
        # already marked, so the same index can be marked repeatedly - confirm this is intended
        if (node1, node2) in pair_to_constraints:
            for idx, constraints in enumerate(pair_to_constraints[(node1, node2)]):
                if relation in constraints:
                    pair_to_constraints_seen_entries[(node1, node2)].add(idx)
                    break

    def validate_relation(
        node1: str, node2: str, relation: str, strike_log: list[tuple[str, str, str, str]]
    ) -> None:
        # Logs a "+" strike if the pedigree relation is absent from the input data (in either
        # orientation), then marks the matching input entries as used
        relation_to_degree = {
            "parent-child": "1",
            "child-parent": "1",
            "siblings": "1",
            "maternal aunt/uncle-nephew/niece": "2",
            "maternal nephew/niece-aunt/uncle": "2",
            "paternal aunt/uncle-nephew/niece": "2",
            "paternal nephew/niece-aunt/uncle": "2",
            "maternal grandparent-grandchild": "2",
            "maternal grandchild-grandparent": "2",
            "paternal grandparent-grandchild": "2",
            "paternal grandchild-grandparent": "2",
            "maternal half-siblings": "2",
            "paternal half-siblings": "2",
        }
        # The same relation as seen from the other node, used when the pair is stored as (node2, node1)
        flipped_relations = {
            "parent-child": "child-parent",
            "child-parent": "parent-child",
            "maternal aunt/uncle-nephew/niece": "maternal nephew/niece-aunt/uncle",
            "paternal aunt/uncle-nephew/niece": "paternal nephew/niece-aunt/uncle",
            "maternal nephew/niece-aunt/uncle": "maternal aunt/uncle-nephew/niece",
            "paternal nephew/niece-aunt/uncle": "paternal aunt/uncle-nephew/niece",
            "maternal grandparent-grandchild": "maternal grandchild-grandparent",
            "paternal grandparent-grandchild": "paternal grandchild-grandparent",
            "maternal grandchild-grandparent": "maternal grandparent-grandchild",
            "paternal grandchild-grandparent": "paternal grandparent-grandchild",
            "siblings": "siblings",  # Symmetric
            "maternal half-siblings": "maternal half-siblings",  # Symmetric
            "paternal half-siblings": "paternal half-siblings",  # Symmetric
        }
        if not is_relation_in_input_data(node1, node2, relation) and not is_relation_in_input_data(
            node2, node1, flipped_relations[relation]
        ):
            strike_log.append((node1, node2, f"+{relation_to_degree[relation]}", ""))
        remove_relation_from_input_data(node1, node2, relation)
        remove_relation_from_input_data(node2, node1, flipped_relations[relation])

    strike_log: list[tuple[str, str, str, str]] = []  # (node1, node2, +/- relation degree, constraints)
    # Check that relations in the pedigree are present in the input data
    for parent, child in self.get_parent_child_pairs(include_placeholders=False):
        validate_relation(parent, child, "parent-child", strike_log)
    for sibling1, sibling2 in self.get_sibling_pairs(include_placeholders=False):
        validate_relation(sibling1, sibling2, "siblings", strike_log)

    # Maternal pairs are requested with shared_relative_sex="F", paternal pairs with "M"
    for aunt_uncle, nephew_niece in self.get_aunt_uncle_nephew_niece_pairs(
        include_placeholders=False, shared_relative_sex="F"
    ):
        validate_relation(aunt_uncle, nephew_niece, "maternal aunt/uncle-nephew/niece", strike_log)
    for aunt_uncle, nephew_niece in self.get_aunt_uncle_nephew_niece_pairs(
        include_placeholders=False, shared_relative_sex="M"
    ):
        validate_relation(aunt_uncle, nephew_niece, "paternal aunt/uncle-nephew/niece", strike_log)

    for grandparent, grandchild in self.get_grandparent_grandchild_pairs(
        include_placeholders=False, shared_relative_sex="F"
    ):
        validate_relation(grandparent, grandchild, "maternal grandparent-grandchild", strike_log)
    for grandparent, grandchild in self.get_grandparent_grandchild_pairs(
        include_placeholders=False, shared_relative_sex="M"
    ):
        validate_relation(grandparent, grandchild, "paternal grandparent-grandchild", strike_log)

    if check_half_siblings:
        for half_sibling1, half_sibling2 in self.get_half_sibling_pairs(
            include_placeholders=False, shared_relative_sex="F"
        ):
            validate_relation(half_sibling1, half_sibling2, "maternal half-siblings", strike_log)
        for half_sibling1, half_sibling2 in self.get_half_sibling_pairs(
            include_placeholders=False, shared_relative_sex="M"
        ):
            validate_relation(half_sibling1, half_sibling2, "paternal half-siblings", strike_log)

    # Check for "dropped" input relations
    # Note: We use constrained relations instead of all relations because we want to catch half-siblings
    # that explicitly should be some other relation even when check_half_siblings is False
    # The purpose of check_half_siblings is to avoid marking *incidental* half-siblings,
    # not half-siblings that should be something else
    for (node1, node2), degrees_constraints in pair_to_relations_so_far.items():
        # If only one input relation between these two nodes, simple check is much faster
        if len(degrees_constraints) == 1:
            degree, constraints, _ = degrees_constraints[0]
            if not self.is_relation_in_pedigree(node1, node2, constraints.split(";")):
                strike_log.append((node1, node2, f"-{degree}", constraints))
        else:
            # Relation name -> count, consumed below so each pedigree relation satisfies one input relation
            pedigree_shared_relations: defaultdict[str, int] = self.get_relations_between_nodes(
                node1, node2, include_maternal_paternal=True
            )
            for degree, constraints, _ in degrees_constraints:
                present_flag = False
                for constraint in constraints.split(";"):
                    # Membership test (not indexing) so the defaultdict does not gain empty entries
                    if constraint in pedigree_shared_relations:
                        present_flag = True
                        pedigree_shared_relations[constraint] -= 1
                        if pedigree_shared_relations[constraint] == 0:
                            del pedigree_shared_relations[constraint]
                        break
                if not present_flag:
                    strike_log.append((node1, node2, f"-{degree}", constraints))

    # Count # of strikes (will not equal len(strike_log) because we don't want to double-count *changed* relations)
    strike_count: int = 0
    # Running (+ strikes minus - strikes) per node pair, kept under both orientations of the pair
    node_pair_strike_balances: defaultdict[tuple[str, str], int] = defaultdict(int)
    for node1, node2, strike, _ in strike_log:
        if strike[0] == "+":
            # A "+" that offsets an earlier "-" for the same pair is a changed relation, not a new strike
            if node_pair_strike_balances[(node1, node2)] >= 0:
                strike_count += 1
            node_pair_strike_balances[(node1, node2)] += 1
            node_pair_strike_balances[(node2, node1)] += 1

        elif strike[0] == "-":
            # A "-" that offsets an earlier "+" for the same pair is a changed relation, not a new strike
            if node_pair_strike_balances[(node1, node2)] <= 0:
                strike_count += 1
            node_pair_strike_balances[(node1, node2)] -= 1
            node_pair_strike_balances[(node2, node1)] -= 1
    return strike_count, strike_log
656
+
657
def count_third_degree_inconcistencies(
    self, pair_to_constraints: defaultdict[tuple[str, str], list[tuple[str, ...]]]
) -> int:
    """
    Counts only one-sided inconsistencies in third-degree relations.
    Used as a "tie-breaker" for 1st- and 2nd-degree inconsistences.

    A strike is counted for every third-degree relation found in the pedigree (placeholders
    excluded) that no unused entry of the input data allows, in either orientation of the pair.
    """
    for node1, node2 in pair_to_constraints:
        # Ensure no duplicate/symmetric entries
        assert (node2, node1) not in pair_to_constraints

    # The same relation as seen from the other node of the pair
    reverse_of = {
        "half aunt/uncle-half nephew/niece": "half nephew/niece-half aunt/uncle",
        "half nephew/niece-half aunt/uncle": "half aunt/uncle-half nephew/niece",
        "greatgrandparent-greatgrandchild": "greatgrandchild-greatgrandparent",
        "greatgrandchild-greatgrandparent": "greatgrandparent-greatgrandchild",
        "grandaunt/granduncle-grandnephew/grandniece": "grandnephew/grandniece-grandaunt/granduncle",
        "grandnephew/grandniece-grandaunt/granduncle": "grandaunt/granduncle-grandnephew/grandniece",
        "first cousins": "first cousins",  # Symmetric
    }
    # Indices of pair_to_constraints entries that have already been matched, per pair
    used_entries: defaultdict[tuple[str, str], set[int]] = defaultdict(set)

    def has_unused_entry(pair: tuple[str, str], relation: str) -> bool:
        if pair not in pair_to_constraints:
            return False
        already_used = used_entries[pair]
        return any(
            idx not in already_used and relation in constraints
            for idx, constraints in enumerate(pair_to_constraints[pair])
        )

    def mark_entry_used(pair: tuple[str, str], relation: str) -> None:
        if pair not in pair_to_constraints:
            return
        for idx, constraints in enumerate(pair_to_constraints[pair]):
            if relation in constraints:
                used_entries[pair].add(idx)
                return

    def is_unexplained(first: str, second: str, relation: str) -> bool:
        forward, backward = (first, second), (second, first)
        flipped = reverse_of[relation]
        unexplained = not (has_unused_entry(forward, relation) or has_unused_entry(backward, flipped))
        # Entries are marked whether or not the relation was found
        mark_entry_used(forward, relation)
        mark_entry_used(backward, flipped)
        return unexplained

    relation_sources = (
        ("half aunt/uncle-half nephew/niece", self.get_half_aunt_uncle_nephew_niece_pairs),
        ("greatgrandparent-greatgrandchild", self.get_greatgrandparent_greatgrandchild_pairs),
        (
            "grandaunt/granduncle-grandnephew/grandniece",
            self.get_grandaunt_granduncle_grandnephew_grandniece_pairs,
        ),
        ("first cousins", self.get_first_cousin_pairs),
    )
    strikes: int = 0
    for relation, get_pairs in relation_sources:
        for first, second in get_pairs(include_placeholders=False):
            strikes += is_unexplained(first, second, relation)
    return strikes
721
+
722
+ def is_relation_in_pedigree(self, node1: str, node2: str, relations_list: list[str]) -> bool:
723
+ """
724
+ Returns True if *any* of the relations in relations_list are present between node1 and node2 in the pedigree.
725
+ """
726
+ assert node1 in self.node_to_data and node2 in self.node_to_data
727
+
728
+ for relation in relations_list:
729
+ if relation == "parent-child":
730
+ if node2 in self.node_to_children[node1]:
731
+ return True
732
+ if relation == "child-parent":
733
+ if node1 in self.node_to_children[node2]:
734
+ return True
735
+ if relation == "siblings":
736
+ if node2 in self.node_to_siblings[node1]:
737
+ assert node1 in self.node_to_siblings[node2]
738
+ return True
739
+
740
+ if relation == "aunt/uncle-nephew/niece":
741
+ for sibling in self.node_to_siblings[node1]:
742
+ if node2 in self.node_to_children[sibling]:
743
+ return True
744
+ if relation == "nephew/niece-aunt/uncle":
745
+ for sibling in self.node_to_siblings[node2]:
746
+ if node1 in self.node_to_children[sibling]:
747
+ return True
748
+ if relation == "grandparent-grandchild":
749
+ for child in self.node_to_children[node1]:
750
+ if node2 in self.node_to_children[child]:
751
+ return True
752
+ if relation == "grandchild-grandparent":
753
+ for child in self.node_to_children[node2]:
754
+ if node1 in self.node_to_children[child]:
755
+ return True
756
+ if relation == "half-siblings":
757
+ if self.node_to_father[node2]:
758
+ if (
759
+ node1 in self.node_to_children[self.node_to_father[node2]]
760
+ and self.node_to_mother[node1] != self.node_to_mother[node2]
761
+ ):
762
+ return True
763
+ if self.node_to_mother[node2]:
764
+ if (
765
+ node1 in self.node_to_children[self.node_to_mother[node2]]
766
+ and self.node_to_father[node1] != self.node_to_father[node2]
767
+ ):
768
+ return True
769
+
770
+ if relation == "maternal aunt/uncle-nephew/niece":
771
+ for sibling in self.node_to_siblings[node1]:
772
+ if self.node_to_data[sibling]["sex"] == "F" and node2 in self.node_to_children[sibling]:
773
+ return True
774
+ if relation == "paternal aunt/uncle-nephew/niece":
775
+ for sibling in self.node_to_siblings[node1]:
776
+ if self.node_to_data[sibling]["sex"] == "M" and node2 in self.node_to_children[sibling]:
777
+ return True
778
+ if relation == "maternal nephew/niece-aunt/uncle":
779
+ for sibling in self.node_to_siblings[node2]:
780
+ if self.node_to_data[sibling]["sex"] == "F" and node1 in self.node_to_children[sibling]:
781
+ return True
782
+ if relation == "paternal nephew/niece-aunt/uncle":
783
+ for sibling in self.node_to_siblings[node2]:
784
+ if self.node_to_data[sibling]["sex"] == "M" and node1 in self.node_to_children[sibling]:
785
+ return True
786
+
787
+ if relation == "maternal grandparent-grandchild":
788
+ for child in self.node_to_children[node1]:
789
+ if self.node_to_data[child]["sex"] == "F" and node2 in self.node_to_children[child]:
790
+ return True
791
+ if relation == "paternal grandparent-grandchild":
792
+ for child in self.node_to_children[node1]:
793
+ if self.node_to_data[child]["sex"] == "M" and node2 in self.node_to_children[child]:
794
+ return True
795
+ if relation == "maternal grandchild-grandparent":
796
+ for child in self.node_to_children[node2]:
797
+ if self.node_to_data[child]["sex"] == "F" and node1 in self.node_to_children[child]:
798
+ return True
799
+ if relation == "paternal grandchild-grandparent":
800
+ for child in self.node_to_children[node2]:
801
+ if self.node_to_data[child]["sex"] == "M" and node1 in self.node_to_children[child]:
802
+ return True
803
+
804
+ if relation == "paternal half-siblings":
805
+ if self.node_to_father[node2]:
806
+ if (
807
+ node1 in self.node_to_children[self.node_to_father[node2]]
808
+ and self.node_to_mother[node1] != self.node_to_mother[node2]
809
+ ):
810
+ return True
811
+ if relation == "maternal half-siblings":
812
+ if self.node_to_mother[node2]:
813
+ if (
814
+ node1 in self.node_to_children[self.node_to_mother[node2]]
815
+ and self.node_to_father[node1] != self.node_to_father[node2]
816
+ ):
817
+ return True
818
+ return False
819
+
820
def get_relations_between_nodes(
    self, node1: str, node2: str, include_maternal_paternal: bool = False
) -> defaultdict[str, int]:
    """
    Returns a dictionary of the *1st- and 2nd-degree* relations between node1 and node2.

    Keys are relation names and values are how many times the relation holds. Only
    relations that are present appear in the result. When include_maternal_paternal
    is True, the 2nd-degree relations are reported separately under their
    "maternal ..." / "paternal ..." names; otherwise the two are summed under the
    unprefixed relation name.
    """
    first_degree = ("parent-child", "child-parent", "siblings")
    second_degree = (
        "aunt/uncle-nephew/niece",
        "nephew/niece-aunt/uncle",
        "grandparent-grandchild",
        "grandchild-grandparent",
        "half-siblings",
    )

    def occurrences(relation: str) -> int:
        return 1 if self.is_relation_in_pedigree(node1, node2, [relation]) else 0

    found: defaultdict[str, int] = defaultdict(int)
    for relation in first_degree:
        if occurrences(relation):
            found[relation] = 1

    for base_relation in second_degree:
        maternal_name = f"maternal {base_relation}"
        paternal_name = f"paternal {base_relation}"
        maternal_count = occurrences(maternal_name)
        paternal_count = occurrences(paternal_name)

        if include_maternal_paternal:
            if maternal_count:
                found[maternal_name] = maternal_count
            if paternal_count:
                found[paternal_name] = paternal_count
        else:
            combined = maternal_count + paternal_count
            if combined:
                found[base_relation] = combined
    return found
892
+
893
def get_parent_child_pairs(self, include_placeholders: bool = True) -> list[tuple[str, str]]:
    """
    Gets all (parent, child) pairs in the tree.

    Placeholder nodes are the ones with purely numeric names. When
    include_placeholders is False, any pair containing one is left out.
    """
    return [
        (parent, child)
        for parent, children in self.node_to_children.items()
        for child in children
        if include_placeholders or not (parent.isnumeric() or child.isnumeric())
    ]
903
+
904
def get_sibling_pairs(self, include_placeholders: bool = True) -> list[tuple[str, str]]:
    """
    Gets all (sibling, sibling) pairs in the tree.
    Note: Only gets *full* siblings. See self.get_half_sibling_pairs().

    Each unordered pair is reported once, in the orientation it is first
    encountered. When include_placeholders is False, pairs containing a
    placeholder (numeric-named) node are left out.
    """
    sibling_pairs: list[tuple[str, str]] = []
    # Set mirror of sibling_pairs so the symmetric-duplicate check is O(1) instead of a list scan.
    emitted: set[tuple[str, str]] = set()
    for sibling1, siblings in self.node_to_siblings.items():
        for sibling2 in siblings:
            if not include_placeholders and (sibling1.isnumeric() or sibling2.isnumeric()):
                continue
            # Don't add symmetric duplicates
            if (sibling2, sibling1) not in emitted:
                sibling_pairs.append((sibling1, sibling2))
                emitted.add((sibling1, sibling2))
    return sibling_pairs
917
+
918
+ def get_aunt_uncle_nephew_niece_pairs(
919
+ self, include_placeholders: bool = True, shared_relative_sex: str | None = None
920
+ ) -> list[tuple[str, str]]:
921
+ """
922
+ Gets all (aunt/uncle, nephew/niece) pairs in the tree.
923
+ Includes duplicates if, for example, shared_relative_sex=None and an aunt is
924
+ both a maternal and paternal aunt to a nephew (i.e., full-sib mating).
925
+ """
926
+ aunt_uncle_nephew_niece_pairs: list[tuple[str, str]] = []
927
+ for parent, child in self.get_parent_child_pairs():
928
+ for parent_sibling in self.node_to_siblings[parent]:
929
+ if not shared_relative_sex or self.node_to_data[parent]["sex"] == shared_relative_sex:
930
+ if include_placeholders or (not parent_sibling.isnumeric() and not child.isnumeric()):
931
+ aunt_uncle_nephew_niece_pairs.append((parent_sibling, child))
932
+ return aunt_uncle_nephew_niece_pairs
933
+
934
+ def get_grandparent_grandchild_pairs(
935
+ self, include_placeholders: bool = True, shared_relative_sex: str | None = None
936
+ ) -> list[tuple[str, str]]:
937
+ """
938
+ Gets all (grandparent, grandchild) pairs in the tree.
939
+ Includes duplicates if, for example, a grandparent is both a maternal and paternal grandparent to a grandchild.
940
+ """
941
+ grandparent_grandchild_pairs: list[tuple[str, str]] = []
942
+ for parent, child in self.get_parent_child_pairs():
943
+ for child_child in self.node_to_children[child]:
944
+ if not shared_relative_sex or self.node_to_data[child]["sex"] == shared_relative_sex:
945
+ if include_placeholders or (not parent.isnumeric() and not child_child.isnumeric()):
946
+ grandparent_grandchild_pairs.append((parent, child_child))
947
+ return grandparent_grandchild_pairs
948
+
949
+ def get_half_sibling_pairs(
950
+ self, include_placeholders: bool = True, shared_relative_sex: str | None = None
951
+ ) -> list[tuple[str, str]]:
952
+ """
953
+ Gets all (half-sibling, half-sibling) pairs in the tree.
954
+ """
955
+ half_sibling_pairs: list[tuple[str, str]] = []
956
+ for parent, child in self.get_parent_child_pairs():
957
+ for other_child in self.node_to_children[parent]:
958
+ if child != other_child and other_child not in self.node_to_siblings[child]:
959
+ if not shared_relative_sex or self.node_to_data[parent]["sex"] == shared_relative_sex:
960
+ if include_placeholders or (not child.isnumeric() and not other_child.isnumeric()):
961
+ # Don't add symmetric duplicates
962
+ if (other_child, child) not in half_sibling_pairs:
963
+ half_sibling_pairs.append((child, other_child))
964
+ return half_sibling_pairs
965
+
966
def get_half_aunt_uncle_nephew_niece_pairs(self, include_placeholders: bool = True) -> list[tuple[str, str]]:
    """
    Gets all (half-aunt/half-uncle, half-nephew/half-niece) pairs in the tree.

    For every half-sibling pair, each half-sibling is paired with the children
    of the other one.
    """
    pairs: list[tuple[str, str]] = []
    for half_sibling1, half_sibling2 in self.get_half_sibling_pairs():
        # Children of half_sibling1 first (their half-aunt/uncle is half_sibling2), then the mirror case.
        for parent, half_aunt_uncle in ((half_sibling1, half_sibling2), (half_sibling2, half_sibling1)):
            for half_nephew_niece in self.node_to_children[parent]:
                if half_nephew_niece == half_aunt_uncle:
                    continue
                if include_placeholders or not (half_aunt_uncle.isnumeric() or half_nephew_niece.isnumeric()):
                    pairs.append((half_aunt_uncle, half_nephew_niece))
    return pairs
982
+
983
def get_greatgrandparent_greatgrandchild_pairs(self, include_placeholders: bool = True) -> list[tuple[str, str]]:
    """
    Gets all (greatgrandparent, greatgrandchild) pairs in the tree.

    Built by extending every (grandparent, grandchild) pair one generation down.
    """
    return [
        (greatgrandparent, greatgrandchild)
        for greatgrandparent, grandchild in self.get_grandparent_grandchild_pairs()
        for greatgrandchild in self.node_to_children[grandchild]
        if include_placeholders or not (greatgrandparent.isnumeric() or greatgrandchild.isnumeric())
    ]
993
+
994
def get_grandaunt_granduncle_grandnephew_grandniece_pairs(
    self, include_placeholders: bool = True
) -> list[tuple[str, str]]:
    """
    Gets all (grandaunt/uncle, grandnephew/niece) pairs in the tree.

    Built by extending every (aunt/uncle, nephew/niece) pair one generation down.
    """
    return [
        (grandaunt_granduncle, grandnephew_grandniece)
        for grandaunt_granduncle, nephew_niece in self.get_aunt_uncle_nephew_niece_pairs()
        for grandnephew_grandniece in self.node_to_children[nephew_niece]
        if include_placeholders or not (grandaunt_granduncle.isnumeric() or grandnephew_grandniece.isnumeric())
    ]
1006
+
1007
def get_first_cousin_pairs(self, include_placeholders: bool = True) -> list[tuple[str, str]]:
    """
    Gets all (first cousin, first cousin) pairs in the tree.

    For every (aunt/uncle, nephew/niece) pair, the nephew/niece is paired with
    each child of the aunt/uncle. A pair is skipped when its mirror image has
    already been emitted; repeats in the same orientation are kept.
    """
    cousin_pairs: list[tuple[str, str]] = []
    # Set mirror of cousin_pairs so the symmetric-duplicate check is O(1) instead of a list scan.
    emitted: set[tuple[str, str]] = set()
    for aunt_uncle, child in self.get_aunt_uncle_nephew_niece_pairs():
        for aunt_uncle_child in self.node_to_children[aunt_uncle]:
            if not include_placeholders and (child.isnumeric() or aunt_uncle_child.isnumeric()):
                continue
            # Don't add symmetric duplicates
            if aunt_uncle_child != child and (aunt_uncle_child, child) not in emitted:
                cousin_pairs.append((child, aunt_uncle_child))
                emitted.add((child, aunt_uncle_child))
    return cousin_pairs
1019
+
1020
def get_related_pairs(self, include_placeholders: bool = True) -> set[tuple[str, str]]:
    """
    Gets all related pairs (up to and including 3rd-degree relations) in the pedigree.

    The result is the union of every per-relation pair list, so duplicate pairs collapse.
    """
    pair_getters = (
        self.get_parent_child_pairs,
        self.get_sibling_pairs,
        self.get_aunt_uncle_nephew_niece_pairs,
        self.get_grandparent_grandchild_pairs,
        self.get_half_sibling_pairs,
        self.get_half_aunt_uncle_nephew_niece_pairs,
        self.get_greatgrandparent_greatgrandchild_pairs,
        self.get_grandaunt_granduncle_grandnephew_grandniece_pairs,
        self.get_first_cousin_pairs,
    )
    related_pairs: set[tuple[str, str]] = set()
    for get_pairs in pair_getters:
        related_pairs.update(get_pairs(include_placeholders=include_placeholders))
    return related_pairs
1037
+
1038
def get_non_placeholder_nodes(self) -> set[str]:
    """
    Gets all non-placeholder nodes in the tree.

    Placeholder nodes are the ones whose names are purely numeric.
    """
    return {node for node in self.node_to_data if not node.isnumeric()}
1043
+
1044
def clean_up_relations(self) -> None:
    """
    Remove any empty entries in the relation dictionaries.
    Also remove unnecessary placeholder nodes to standardize topological sort output.

    A pair of placeholder (numeric-named) parents is unnecessary when both are
    parents of the same node, that node is each one's only child, and neither
    placeholder has a recorded mother or father.

    Empty entries are stripped *last* and all lookups use .get(): the relation
    dictionaries are defaultdicts, so indexing them during the placeholder pass
    would re-insert the very empty entries this method is meant to remove.
    """
    fathers = self.node_to_father
    mothers = self.node_to_mother
    children = self.node_to_children
    siblings = self.node_to_siblings

    def has_recorded_parent(node: str) -> bool:
        return bool(mothers.get(node, "") or fathers.get(node, ""))

    placeholder_nodes_to_remove: set[str] = set()
    for node in self.node_to_data:
        mother = mothers.get(node, "")
        father = fathers.get(node, "")
        if not (mother.isnumeric() and father.isnumeric()):
            continue
        if len(children.get(mother, ())) != 1 or len(children.get(father, ())) != 1:
            continue
        if has_recorded_parent(mother) or has_recorded_parent(father):
            continue
        placeholder_nodes_to_remove.add(mother)
        placeholder_nodes_to_remove.add(father)

    for node in placeholder_nodes_to_remove:
        for data_dict in (self.node_to_data, fathers, mothers, children, siblings):
            data_dict.pop(node, None)

    for node in self.node_to_data:
        assert node not in siblings.get(node, ()) and node not in children.get(node, ())
        # Detach remaining nodes from the placeholder parents that were just removed.
        if fathers.get(node, "") in placeholder_nodes_to_remove:
            del fathers[node]
        if mothers.get(node, "") in placeholder_nodes_to_remove:
            del mothers[node]

    for relation_dict in (fathers, mothers, children, siblings):
        empty_keys = [key for key, value in relation_dict.items() if not value]
        for key in empty_keys:
            del relation_dict[key]
1089
+
1090
def plot(self, path: str) -> None:
    """
    Plot the pedigree to the given path.

    Males are drawn as squares and females as circles. Named nodes are colored
    by their MT haplogroup; placeholder (numeric-named) nodes are drawn in gray.
    Each label shows the node name (named nodes only) followed by the first
    three characters of the MT haplogroup and, if present, the Y haplogroup.

    Raises ImportError if PyGraphviz is not installed.
    """
    # A bare `import importlib` does not guarantee that the `importlib.util`
    # submodule is loaded, so import it explicitly before calling find_spec().
    import importlib.util

    if not importlib.util.find_spec("pygraphviz"):
        raise ImportError("Plotting pedigree requires PyGraphviz (https://pygraphviz.github.io/).")

    tree = nx.from_dict_of_lists(self.node_to_children, create_using=nx.DiGraph)
    # Add childless nodes
    for node in self.node_to_data:
        if node not in tree.nodes:
            tree.add_node(node)

    def nodes_with(sex: str, placeholder: bool) -> list[str]:
        # Nodes of the given sex that are (or are not) placeholders.
        return [
            node
            for node, data in self.node_to_data.items()
            if data["sex"] == sex and node.isnumeric() == placeholder
        ]

    male_named_nodes = nodes_with("M", placeholder=False)
    male_placeholder_nodes = nodes_with("M", placeholder=True)
    female_named_nodes = nodes_with("F", placeholder=False)
    female_placeholder_nodes = nodes_with("F", placeholder=True)

    # Labels for every node; placeholders show haplogroups only, without their numeric name.
    node_labels = dict()
    for node in tree.nodes:
        mt_haplogroup = self.node_to_data[node]["mt_haplogroup"][:3]
        y_haplogroup = self.node_to_data[node]["y_haplogroup"][:3]
        label_parts = [] if node.isnumeric() else [node]
        label_parts.append(mt_haplogroup)
        if y_haplogroup:
            label_parts.append(y_haplogroup)
        node_labels[node] = "\n".join(label_parts)

    # Create colormap for MT haplogroups
    cmap = plt.get_cmap("tab20")
    mt_haplogroups = sorted(
        {self.node_to_data[node]["mt_haplogroup"] for node in self.node_to_data if not node.isnumeric()}
    )
    mt_haplogroup_to_color = {
        haplogroup: cmap(i / len(mt_haplogroups)) for i, haplogroup in enumerate(mt_haplogroups)
    }
    male_named_node_colors = [
        mt_haplogroup_to_color[self.node_to_data[node]["mt_haplogroup"]] for node in male_named_nodes
    ]
    female_named_node_colors = [
        mt_haplogroup_to_color[self.node_to_data[node]["mt_haplogroup"]] for node in female_named_nodes
    ]

    plt.figure(figsize=(12, 4.8), dpi=1200)
    # Scale sizes based on pedigree node count
    node_size = min(1000, 10000 / len(tree.nodes))
    # Matplotlib doesn't allow font size less than 1
    font_size = max(min(4.5, 150 / len(tree.nodes)), 1)

    pos = nx.nx_agraph.graphviz_layout(tree, prog="dot")
    placeholder_color = "#e5e5e5"
    node_groups = [
        (male_named_nodes, "s", male_named_node_colors),
        (female_named_nodes, "o", female_named_node_colors),
        (male_placeholder_nodes, "s", placeholder_color),
        (female_placeholder_nodes, "o", placeholder_color),
    ]
    for nodelist, node_shape, node_color in node_groups:
        nx.draw_networkx_nodes(
            tree,
            pos=pos,
            nodelist=nodelist,
            node_shape=node_shape,
            node_size=node_size,
            node_color=node_color,
            edgecolors="black",
            linewidths=0.2,
        )
    nx.draw_networkx_labels(tree, pos=pos, labels=node_labels, font_size=font_size)
    nx.draw_networkx_edges(tree, pos=pos, node_shape="s", node_size=node_size, width=0.2, arrowsize=font_size * 1.2)

    plt.axis("off")
    plt.savefig(path, bbox_inches="tight")
    plt.close()
1198
+
1199
def write_exact_relations(self, path: str) -> None:
    """
    Write the exact relations in the pedigree to a file.

    The output is CSV with header "node1,node2,relation". Every unordered pair
    of non-placeholder nodes is visited once, in sorted name order, and one row
    is written per occurrence of each maternal/paternal-specific relation.
    """
    named_nodes = sorted(self.get_non_placeholder_nodes())
    with open(path, "w") as file:
        file.write("node1,node2,relation\n")
        for position, node1 in enumerate(named_nodes):
            for node2 in named_nodes[position + 1 :]:
                pair_relations = self.get_relations_between_nodes(node1, node2, include_maternal_paternal=True)
                for relation, count in pair_relations.items():
                    row = f"{node1},{node2},{relation}\n"
                    file.writelines([row] * count)