repare 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- repare/__init__.py +4 -0
- repare/main.py +89 -0
- repare/pedigree.py +1484 -0
- repare/pedigree_reconstructor.py +1186 -0
- repare-0.1.4.dist-info/METADATA +143 -0
- repare-0.1.4.dist-info/RECORD +10 -0
- repare-0.1.4.dist-info/WHEEL +5 -0
- repare-0.1.4.dist-info/entry_points.txt +2 -0
- repare-0.1.4.dist-info/licenses/LICENSE +7 -0
- repare-0.1.4.dist-info/top_level.txt +1 -0
repare/pedigree.py
ADDED
|
@@ -0,0 +1,1484 @@
|
|
|
1
|
+
import importlib
|
|
2
|
+
import math
|
|
3
|
+
from collections import defaultdict, deque
|
|
4
|
+
|
|
5
|
+
import matplotlib.pyplot as plt
|
|
6
|
+
import networkx as nx
|
|
7
|
+
from matplotlib.colors import to_rgba
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Pedigree:
|
|
11
|
+
"""
|
|
12
|
+
Describes a pedigree configuration for a set of nodes.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
def __init__(self) -> None:
|
|
16
|
+
self.num_placeholders: int = 0
|
|
17
|
+
self.node_to_data: dict[str, dict[str, str | bool | float]] = dict()
|
|
18
|
+
self.node_to_father: dict[str, str] = dict()
|
|
19
|
+
self.node_to_mother: dict[str, str] = dict()
|
|
20
|
+
self.node_to_children: dict[str, set[str]] = dict()
|
|
21
|
+
self.node_to_siblings: dict[str, set[str]] = dict()
|
|
22
|
+
|
|
23
|
+
def __deepcopy__(self, memo: dict) -> "Pedigree":
|
|
24
|
+
"""
|
|
25
|
+
Custom (faster) deepcopy implementation.
|
|
26
|
+
"""
|
|
27
|
+
cls = self.__class__
|
|
28
|
+
new_pedigree = cls.__new__(cls)
|
|
29
|
+
memo[id(self)] = new_pedigree
|
|
30
|
+
|
|
31
|
+
new_pedigree.num_placeholders = self.num_placeholders
|
|
32
|
+
new_pedigree.node_to_data = dict()
|
|
33
|
+
for k, v in self.node_to_data.items():
|
|
34
|
+
new_pedigree.node_to_data[k] = v.copy()
|
|
35
|
+
|
|
36
|
+
new_pedigree.node_to_father = self.node_to_father.copy()
|
|
37
|
+
new_pedigree.node_to_mother = self.node_to_mother.copy()
|
|
38
|
+
new_pedigree.node_to_children = dict()
|
|
39
|
+
for k, v in self.node_to_children.items():
|
|
40
|
+
new_pedigree.node_to_children[k] = v.copy()
|
|
41
|
+
new_pedigree.node_to_siblings = dict()
|
|
42
|
+
for k, v in self.node_to_siblings.items():
|
|
43
|
+
new_pedigree.node_to_siblings[k] = v.copy()
|
|
44
|
+
return new_pedigree
|
|
45
|
+
|
|
46
|
+
def get_topo_sort(self) -> tuple[str, ...]:
|
|
47
|
+
"""
|
|
48
|
+
Gets pedigree topological sort of the Pedigree. See https://doi.org/10.1089/cmb.2011.0254.
|
|
49
|
+
"""
|
|
50
|
+
leaf_nodes: list[str] = sorted([node for node in self.node_to_data if not self.get_children(node)])
|
|
51
|
+
result: list[str] = []
|
|
52
|
+
|
|
53
|
+
def dfs(node: str) -> None:
|
|
54
|
+
# No father == no mother
|
|
55
|
+
if not self.get_father(node):
|
|
56
|
+
assert not self.get_mother(node)
|
|
57
|
+
result.append(node)
|
|
58
|
+
else:
|
|
59
|
+
# Visit father first
|
|
60
|
+
dfs(self.get_father(node))
|
|
61
|
+
dfs(self.get_mother(node))
|
|
62
|
+
result.append(node)
|
|
63
|
+
|
|
64
|
+
for node in leaf_nodes:
|
|
65
|
+
dfs(node)
|
|
66
|
+
# Break between each node's path
|
|
67
|
+
result.append("")
|
|
68
|
+
|
|
69
|
+
# Re-label placeholder nodes
|
|
70
|
+
placeholder_to_idx: dict[str, int] = {}
|
|
71
|
+
for i, node in enumerate(result):
|
|
72
|
+
if node.isnumeric():
|
|
73
|
+
if node not in placeholder_to_idx:
|
|
74
|
+
placeholder_to_idx[node] = len(placeholder_to_idx)
|
|
75
|
+
|
|
76
|
+
node_data = self.get_data(node)
|
|
77
|
+
mt_haplogroup = node_data["mt_haplogroup"]
|
|
78
|
+
if node_data["sex"] == "M":
|
|
79
|
+
y_haplogroup = node_data["y_haplogroup"]
|
|
80
|
+
# Unique identifier for placeholder
|
|
81
|
+
result[i] = f"M {placeholder_to_idx[node]} {mt_haplogroup} {y_haplogroup}"
|
|
82
|
+
else:
|
|
83
|
+
result[i] = f"F {placeholder_to_idx[node]} {mt_haplogroup}"
|
|
84
|
+
return tuple(result)
|
|
85
|
+
|
|
86
|
+
def get_data(self, node: str) -> dict[str, str | bool | float]:
|
|
87
|
+
"""
|
|
88
|
+
Returns the data for the given node.
|
|
89
|
+
"""
|
|
90
|
+
assert node in self.node_to_data
|
|
91
|
+
return self.node_to_data[node]
|
|
92
|
+
|
|
93
|
+
def get_father(self, node: str) -> str:
|
|
94
|
+
"""
|
|
95
|
+
Returns the father of the given node.
|
|
96
|
+
If the node has no father, returns "".
|
|
97
|
+
"""
|
|
98
|
+
assert node in self.node_to_data
|
|
99
|
+
return self.node_to_father.get(node, "")
|
|
100
|
+
|
|
101
|
+
def get_mother(self, node: str) -> str:
|
|
102
|
+
"""
|
|
103
|
+
Returns the mother of the given node.
|
|
104
|
+
If the node has no mother, returns "".
|
|
105
|
+
"""
|
|
106
|
+
assert node in self.node_to_data
|
|
107
|
+
return self.node_to_mother.get(node, "")
|
|
108
|
+
|
|
109
|
+
def get_children(self, node: str) -> set[str]:
|
|
110
|
+
"""
|
|
111
|
+
Returns the children of the given node.
|
|
112
|
+
If the node has no children, returns an empty set.
|
|
113
|
+
"""
|
|
114
|
+
assert node in self.node_to_data
|
|
115
|
+
return self.node_to_children.get(node, set())
|
|
116
|
+
|
|
117
|
+
def get_siblings(self, node: str) -> set[str]:
|
|
118
|
+
"""
|
|
119
|
+
Returns the siblings of the given node.
|
|
120
|
+
If the node has no siblings, returns an empty set.
|
|
121
|
+
"""
|
|
122
|
+
assert node in self.node_to_data
|
|
123
|
+
return self.node_to_siblings.get(node, set())
|
|
124
|
+
|
|
125
|
+
def add_node(
|
|
126
|
+
self,
|
|
127
|
+
node_id: str,
|
|
128
|
+
sex: str,
|
|
129
|
+
y_haplogroup: str,
|
|
130
|
+
mt_haplogroup: str,
|
|
131
|
+
can_have_children: bool,
|
|
132
|
+
can_be_inbred: bool,
|
|
133
|
+
years_before_present: float,
|
|
134
|
+
) -> None:
|
|
135
|
+
"""
|
|
136
|
+
Add a node to the pedigree. If haplogroup is unknown, set argument to empty string ("").
|
|
137
|
+
"""
|
|
138
|
+
self.node_to_data[node_id] = dict()
|
|
139
|
+
self.node_to_data[node_id]["sex"] = sex
|
|
140
|
+
if y_haplogroup and sex == "F":
|
|
141
|
+
raise ValueError("Only males can have y_haplogroup values.")
|
|
142
|
+
self.node_to_data[node_id]["y_haplogroup"] = y_haplogroup
|
|
143
|
+
self.node_to_data[node_id]["mt_haplogroup"] = mt_haplogroup
|
|
144
|
+
self.node_to_data[node_id]["can_have_children"] = can_have_children
|
|
145
|
+
self.node_to_data[node_id]["can_be_inbred"] = can_be_inbred
|
|
146
|
+
self.node_to_data[node_id]["years_before_present"] = years_before_present
|
|
147
|
+
|
|
148
|
+
def add_parent_relation(self, node1: str, node2: str) -> None:
|
|
149
|
+
"""
|
|
150
|
+
Adds a parent-child relationship to the tree.
|
|
151
|
+
Node 1 is the parent and Node 2 is the child.
|
|
152
|
+
Note: Overwrites existing parent, does not merge.
|
|
153
|
+
"""
|
|
154
|
+
assert node1 != node2
|
|
155
|
+
assert node1 in self.node_to_data and node2 in self.node_to_data
|
|
156
|
+
|
|
157
|
+
def clear_siblings(node: str) -> None:
|
|
158
|
+
for sibling in self.get_siblings(node):
|
|
159
|
+
self.node_to_siblings[sibling].remove(node)
|
|
160
|
+
if node in self.node_to_siblings:
|
|
161
|
+
del self.node_to_siblings[node]
|
|
162
|
+
|
|
163
|
+
# Remove child from original parent
|
|
164
|
+
if self.get_data(node1)["sex"] == "M":
|
|
165
|
+
if self.get_father(node2):
|
|
166
|
+
self.node_to_children[self.get_father(node2)].remove(node2)
|
|
167
|
+
clear_siblings(node2)
|
|
168
|
+
self.node_to_father[node2] = node1
|
|
169
|
+
else:
|
|
170
|
+
if self.get_mother(node2):
|
|
171
|
+
self.node_to_children[self.get_mother(node2)].remove(node2)
|
|
172
|
+
clear_siblings(node2)
|
|
173
|
+
self.node_to_mother[node2] = node1
|
|
174
|
+
|
|
175
|
+
if node1 not in self.node_to_children:
|
|
176
|
+
self.node_to_children[node1] = set()
|
|
177
|
+
self.node_to_children[node1].add(node2)
|
|
178
|
+
|
|
179
|
+
# Add any sibling relations that are created
|
|
180
|
+
node2_parents = [self.get_father(node2), self.get_mother(node2)]
|
|
181
|
+
# Make sure Node 2 has 2 known parents
|
|
182
|
+
if node2_parents[0] and node2_parents[1]:
|
|
183
|
+
for node1_child in self.get_children(node1):
|
|
184
|
+
if (
|
|
185
|
+
node1_child != node2
|
|
186
|
+
and [self.get_father(node1_child), self.get_mother(node1_child)] == node2_parents
|
|
187
|
+
):
|
|
188
|
+
self.add_sibling_relation(node1_child, node2)
|
|
189
|
+
|
|
190
|
+
def add_sibling_relation(self, node1: str, node2: str) -> None:
|
|
191
|
+
"""
|
|
192
|
+
Adds a sibling relationship to the tree.
|
|
193
|
+
Note: Does not merge parents.
|
|
194
|
+
"""
|
|
195
|
+
assert node1 != node2
|
|
196
|
+
assert node1 in self.node_to_data and node2 in self.node_to_data
|
|
197
|
+
|
|
198
|
+
if node1 not in self.node_to_siblings:
|
|
199
|
+
self.node_to_siblings[node1] = set()
|
|
200
|
+
if node2 not in self.node_to_siblings:
|
|
201
|
+
self.node_to_siblings[node2] = set()
|
|
202
|
+
self.node_to_siblings[node1].add(node2)
|
|
203
|
+
self.node_to_siblings[node2].add(node1)
|
|
204
|
+
|
|
205
|
+
# Update siblings of siblings
|
|
206
|
+
for node1_sibling in self.get_siblings(node1):
|
|
207
|
+
if node1_sibling != node2:
|
|
208
|
+
self.node_to_siblings[node1_sibling].add(node2)
|
|
209
|
+
self.node_to_siblings[node2].add(node1_sibling)
|
|
210
|
+
for node2_sibling in self.get_siblings(node2):
|
|
211
|
+
if node1 != node2_sibling:
|
|
212
|
+
self.node_to_siblings[node1].add(node2_sibling)
|
|
213
|
+
self.node_to_siblings[node2_sibling].add(node1)
|
|
214
|
+
|
|
215
|
+
def merge_nodes(self, node1: str, node2: str) -> bool:
    """
    Merge the two nodes as if they were one person. Note this involves merging the nodes' ancestors.
    Returns True if the merge was successful, False if it was invalid.

    Pairs are processed breadth-first from a queue: merging a pair may enqueue
    the pair's fathers and the pair's mothers. In each pair the non-numeric
    (named) node survives and the other one is deleted from every map.

    Note: False can be returned after earlier pairs in the queue have already
    been merged, so the pedigree may be left partially modified in that case.
    """
    assert node1 in self.node_to_data and node2 in self.node_to_data

    # Pairs of nodes to merge
    pair_queue: deque[tuple[str, str]] = deque([(node1, node2)])
    while pair_queue:
        node1, node2 = pair_queue.popleft()
        # Nothing to do if both names already refer to the same node
        if node1 == node2:
            continue

        # Cannot merge two named nodes
        if not node1.isnumeric() and not node2.isnumeric():
            return False

        # Cannot merge nodes if it will create a node that is the parent and sibling of another
        # For example, don't merge a parent and child if the child has siblings,
        # and don't merge a node's sibling and child
        combined_parents = set(
            [
                self.get_father(node1),
                self.get_mother(node1),
                self.get_father(node2),
                self.get_mother(node2),
            ]
        )
        combined_children = self.get_children(node1) | self.get_children(node2)
        combined_siblings = self.get_siblings(node1) | self.get_siblings(node2)
        if (combined_parents & combined_siblings) or (combined_children & combined_siblings):
            return False

        # Keep node1 when it is named; otherwise keep node2 (named or placeholder)
        name_to_keep = node1 if not node1.isnumeric() else node2
        name_to_discard = node2 if name_to_keep == node1 else node1

        # Update relations for relatives of the discarded node
        name_to_discard_father = self.get_father(name_to_discard)
        name_to_discard_mother = self.get_mother(name_to_discard)
        if name_to_discard_father:
            assert self.get_children(name_to_discard_father)
            self.node_to_children[name_to_discard_father].remove(name_to_discard)
        if name_to_discard_mother:
            assert self.get_children(name_to_discard_mother)
            self.node_to_children[name_to_discard_mother].remove(name_to_discard)

        # Collected separately because add_parent_relation below removes each
        # child from the discarded node's child set, which is being iterated here
        name_to_discard_children = set()
        for child in self.get_children(name_to_discard):
            # Merging a parent and child - we will see this when there is inbreeding
            # Note: can't merge a parent and a child if the child has siblings,
            # because then the child becomes the parent of its siblings (this is
            # handled by check_invalid_parent_child_merge)
            if name_to_keep == child:
                # The kept node would become its own parent: drop that parent link instead
                if self.get_data(name_to_keep)["sex"] == "M":
                    del self.node_to_father[name_to_keep]
                else:
                    del self.node_to_mother[name_to_keep]
            else:
                name_to_discard_children.add(child)

        for child in name_to_discard_children:
            # This step also handles having the correct name of the merged parents from last loop iteration
            self.add_parent_relation(name_to_keep, child)

        # Remove all occurrences of name_to_discard in its sibling's sibling sets first
        # so that add_sibling_relation does not add it back in.
        for sibling in self.get_siblings(name_to_discard):
            self.node_to_siblings[sibling].remove(name_to_discard)
        for sibling in self.get_siblings(name_to_discard):
            if sibling != name_to_keep:
                self.add_sibling_relation(sibling, name_to_keep)

        # Recursively merge parent relations of Node 1 and Node 2
        father1 = self.get_father(name_to_keep)
        father2 = self.get_father(name_to_discard)
        mother1 = self.get_mother(name_to_keep)
        mother2 = self.get_mother(name_to_discard)
        if father1 and father2:
            # Both have a father: the two fathers must become one node as well
            pair_queue.append((father1, father2))
        elif father2 and father2 != name_to_keep:
            # Set name_to_keep's father to name_to_discard's father
            self.add_parent_relation(father2, name_to_keep)

        if mother1 and mother2:
            # Both have a mother: the two mothers must become one node as well
            pair_queue.append((mother1, mother2))
        elif mother2 and mother2 != name_to_keep:
            # Set name_to_keep's mother to name_to_discard's mother
            self.add_parent_relation(mother2, name_to_keep)

        # Update any nodes in the queue whose names might have been changed
        # (rebinding node1/node2 here is harmless: they are reassigned by popleft)
        for idx, (node1, node2) in enumerate(pair_queue):
            if node1 == name_to_discard and node2 == name_to_discard:
                pair_queue[idx] = (name_to_keep, name_to_keep)
            elif node1 == name_to_discard:
                pair_queue[idx] = (name_to_keep, node2)
            elif node2 == name_to_discard:
                pair_queue[idx] = (node1, name_to_keep)

        # Finally erase the discarded node from every map in which it is a key
        for data_dict in [
            self.node_to_data,
            self.node_to_father,
            self.node_to_mother,
            self.node_to_children,
            self.node_to_siblings,
        ]:
            if name_to_discard in data_dict:
                del data_dict[name_to_discard]
    return True
|
|
324
|
+
|
|
325
|
+
def check_valid_merge(self, node1: str, node2: str) -> bool:
    """
    Returns True if merging Node 1 and Node 2 (and their ancestors) is a valid operation.

    Collects the groups of nodes that would be fused by the merge, walking up
    through fathers and mothers, and rejects the merge if check_cycle_merge
    reports a cycle for those groups. The pedigree itself is not modified.
    """
    assert node1 in self.node_to_data and node2 in self.node_to_data
    # Get sets of nodes that would be merged if Node 1 and Node 2 were merged (i.e. ancestors of Node 1 and Node 2)
    # Note that we get sets and not just pairs because of potential inbreeding,
    # for example if one node is both a parent and grandparent of another node
    merge_sets: list[set[str]] = []
    merge_queue: deque[set[str]] = deque([set([node1, node2])])
    # Every node that has been placed in some merge set so far
    included_nodes: set[str] = set()

    while merge_queue:
        curr_nodes = merge_queue.popleft()
        # If all nodes are the same, skip
        if len(set(curr_nodes)) == 1:
            continue

        # Update merge sets
        updated = False
        for merge_set in merge_sets:
            # Exactly this group is already recorded
            if merge_set == curr_nodes:
                updated = True
                break

            # Overlap: grow the existing group. Only the first overlapping
            # group is extended; the loop stops at the first match.
            if any(node in merge_set for node in curr_nodes):
                merge_set.update(curr_nodes)
                # Include all merged nodes in the current set of nodes
                curr_nodes = merge_set
                updated = True
                break
        if not updated:
            merge_sets.append(set(curr_nodes))
        included_nodes.update(curr_nodes)

        # Add parents to the queue
        curr_fathers = set([self.get_father(node) for node in curr_nodes if self.get_father(node)])
        curr_mothers = set([self.get_mother(node) for node in curr_nodes if self.get_mother(node)])
        # Only enqueue when there is more than one distinct parent and at least
        # one of them has not been placed in a merge set yet
        if len(curr_fathers) > 1 and not curr_fathers.issubset(included_nodes):
            merge_queue.append(curr_fathers)
        if len(curr_mothers) > 1 and not curr_mothers.issubset(included_nodes):
            merge_queue.append(curr_mothers)

    if self.check_cycle_merge(merge_sets):
        return False
    return True
|
|
371
|
+
|
|
372
|
+
def check_cycle_merge(self, merge_sets: list[set[str]]) -> bool:
    """
    Returns True if merging Node 1 and Node 2 (and their ancestors) would result in a cycle.
    merge_sets is a list of node sets that would be merged if Node 1 and Node 2 were merged.

    The search follows parent -> child edges and treats all members of one
    merge set as a single node. The pedigree itself is not modified.
    """

    # DFS cycle detection
    def dfs(node: str) -> bool:
        # Find the merge set (if any) that this node belongs to; first match wins
        nodes_to_merge: set[str] | None = None
        for merge_set in merge_sets:
            if node in merge_set:
                nodes_to_merge = merge_set
                break

        if nodes_to_merge:
            # Reaching a node on the current path again means a cycle
            if node in in_progress:
                return True
            if node in explored:
                return False
            # The whole merge set enters the current path at once
            in_progress.update(nodes_to_merge)
            # Children of every member count as children of the merged node
            for child in [child for node in nodes_to_merge for child in self.get_children(node)]:
                if dfs(child):
                    return True
            in_progress.difference_update(nodes_to_merge)
            explored.update(nodes_to_merge)
        else:
            # Ordinary node: standard in-progress / explored bookkeeping
            if node in in_progress:
                return True
            if node in explored:
                return False
            in_progress.add(node)
            for child in self.get_children(node):
                if dfs(child):
                    return True
            in_progress.remove(node)
            explored.add(node)
        return False

    # State shared by all dfs calls through the closure
    explored: set[str] = set()
    in_progress: set[str] = set()
    # Check for cycles starting from each node
    for node in self.node_to_data:
        if dfs(node):
            # Cycle detected
            return True
    return False
|
|
418
|
+
|
|
419
|
+
def fill_node_parents(self, node: str) -> None:
|
|
420
|
+
"""
|
|
421
|
+
If the given node doesn't have parents, add placeholder parents.
|
|
422
|
+
If it does, do nothing.
|
|
423
|
+
"""
|
|
424
|
+
assert node in self.node_to_data
|
|
425
|
+
|
|
426
|
+
father = self.get_father(node)
|
|
427
|
+
mother = self.get_mother(node)
|
|
428
|
+
|
|
429
|
+
if not father:
|
|
430
|
+
father_id = str(self.num_placeholders)
|
|
431
|
+
self.add_node(
|
|
432
|
+
node_id=father_id,
|
|
433
|
+
sex="M",
|
|
434
|
+
y_haplogroup="*",
|
|
435
|
+
mt_haplogroup="*",
|
|
436
|
+
can_have_children=True,
|
|
437
|
+
can_be_inbred=True,
|
|
438
|
+
years_before_present=math.nan,
|
|
439
|
+
)
|
|
440
|
+
|
|
441
|
+
self.add_parent_relation(father_id, node)
|
|
442
|
+
for sibling in self.get_siblings(node):
|
|
443
|
+
self.add_parent_relation(father_id, sibling)
|
|
444
|
+
self.num_placeholders += 1
|
|
445
|
+
|
|
446
|
+
if not mother:
|
|
447
|
+
mother_id = str(self.num_placeholders)
|
|
448
|
+
self.add_node(
|
|
449
|
+
node_id=mother_id,
|
|
450
|
+
sex="F",
|
|
451
|
+
y_haplogroup="",
|
|
452
|
+
mt_haplogroup="*",
|
|
453
|
+
can_have_children=True,
|
|
454
|
+
can_be_inbred=True,
|
|
455
|
+
years_before_present=math.nan,
|
|
456
|
+
)
|
|
457
|
+
|
|
458
|
+
self.add_parent_relation(mother_id, node)
|
|
459
|
+
for sibling in self.get_siblings(node):
|
|
460
|
+
self.add_parent_relation(mother_id, sibling)
|
|
461
|
+
self.num_placeholders += 1
|
|
462
|
+
|
|
463
|
+
def update_haplogroups(self) -> None:
    """
    Update haplogroups of placeholder nodes.

    For every node, its Y haplogroup is pushed along the male line (father and
    male children) and its mitochondrial haplogroup along the maternal line
    (mother, plus children when the current node is female). Only haplogroups
    containing "*" are ever overwritten; the written value always ends in "*".
    """
    for node in self.node_to_data:
        y_haplogroup: str = self.get_data(node)["y_haplogroup"]
        # Start from the node's father ("" if none) and its male children
        y_lineage: deque[str] = deque(
            [self.get_father(node)]
            + [child for child in self.get_children(node) if self.get_data(child)["sex"] == "M"]
        )

        while y_lineage:
            curr_node = y_lineage.popleft()
            # Skip missing parents (""), haplogroups without "*", and relatives
            # whose haplogroup already matches this node's (ignoring trailing "*")
            if (
                not curr_node
                or "*" not in self.get_data(curr_node)["y_haplogroup"]
                or self.get_data(curr_node)["y_haplogroup"].rstrip("*") == y_haplogroup.rstrip("*")
            ):
                continue
            # Overwrite/extend Y haplogroup if it contains a "*" and is a strict subset of the "leaf" haplogroup
            if y_haplogroup.startswith(self.get_data(curr_node)["y_haplogroup"].rstrip("*")):
                self.node_to_data[curr_node]["y_haplogroup"] = (
                    y_haplogroup if y_haplogroup[-1] == "*" else y_haplogroup + "*"
                )
                # Keep walking only from nodes that were just updated; an updated
                # node is skipped by the equality check above if reached again
                y_lineage.append(self.get_father(curr_node))
                for curr_node_child in self.get_children(curr_node):
                    # Only males have Y chromosome
                    if self.get_data(curr_node_child)["sex"] == "M":
                        y_lineage.append(curr_node_child)

        mt_haplogroup: str = self.get_data(node)["mt_haplogroup"]
        # Start from the node's mother ("" if none)
        mt_lineage: deque[str] = deque([self.get_mother(node)])
        # Only females pass on mitochondrial DNA to children
        if self.get_data(node)["sex"] == "F":
            mt_lineage.extend(self.get_children(node))

        while mt_lineage:
            curr_node = mt_lineage.popleft()
            # Same skip rules as for the Y lineage above
            if (
                not curr_node
                or "*" not in self.get_data(curr_node)["mt_haplogroup"]
                or self.get_data(curr_node)["mt_haplogroup"].rstrip("*") == mt_haplogroup.rstrip("*")
            ):
                continue
            # Overwrite/extend mitochondrial haplogroup if it contains a "*"
            # and is a strict subset of the "leaf" haplogroup
            if mt_haplogroup.startswith(self.get_data(curr_node)["mt_haplogroup"].rstrip("*")):
                self.node_to_data[curr_node]["mt_haplogroup"] = (
                    mt_haplogroup if mt_haplogroup[-1] == "*" else mt_haplogroup + "*"
                )
                # Continue upwards through the mother, and downwards only through females
                mt_lineage.append(self.get_mother(curr_node))
                if self.get_data(curr_node)["sex"] == "F":
                    mt_lineage.extend(self.get_children(curr_node))
|
|
516
|
+
|
|
517
|
+
def validate_structure(self) -> bool:
|
|
518
|
+
"""
|
|
519
|
+
Validates pedigree structure and consistency of internal data.
|
|
520
|
+
"""
|
|
521
|
+
for child, father in self.node_to_father.items():
|
|
522
|
+
if child not in self.node_to_children[father]:
|
|
523
|
+
return False
|
|
524
|
+
if child == father:
|
|
525
|
+
return False
|
|
526
|
+
|
|
527
|
+
for child, mother in self.node_to_mother.items():
|
|
528
|
+
if child not in self.node_to_children[mother]:
|
|
529
|
+
return False
|
|
530
|
+
if child == mother:
|
|
531
|
+
return False
|
|
532
|
+
|
|
533
|
+
for parent, children in self.node_to_children.items():
|
|
534
|
+
for child in children:
|
|
535
|
+
if self.get_data(parent)["sex"] == "M":
|
|
536
|
+
if parent != self.node_to_father[child]:
|
|
537
|
+
return False
|
|
538
|
+
else:
|
|
539
|
+
if parent != self.node_to_mother[child]:
|
|
540
|
+
return False
|
|
541
|
+
if parent == child:
|
|
542
|
+
return False
|
|
543
|
+
|
|
544
|
+
for node, siblings in self.node_to_siblings.items():
|
|
545
|
+
for sibling in siblings:
|
|
546
|
+
if node not in self.node_to_siblings[sibling]:
|
|
547
|
+
return False
|
|
548
|
+
if (
|
|
549
|
+
self.node_to_father[node] != self.node_to_father[sibling]
|
|
550
|
+
or self.node_to_mother[node] != self.node_to_mother[sibling]
|
|
551
|
+
):
|
|
552
|
+
return False
|
|
553
|
+
if node == sibling:
|
|
554
|
+
return False
|
|
555
|
+
return True
|
|
556
|
+
|
|
557
|
+
def validate_members(self, members: set[str]) -> bool:
    """
    Checks that the named (non-placeholder) nodes of this tree are exactly
    the given members.
    """
    # A mismatch means the pedigree doesn't have all the nodes it's supposed to
    # (because of invalid merging)
    present_members = self.get_non_placeholder_nodes()
    if present_members == members:
        return True
    return False
|
|
564
|
+
|
|
565
|
+
def validate_haplogroups(self) -> bool:
    """
    Checks haplogroup consistency across every parent-child pair.

    A mother's mitochondrial haplogroup must agree with her child's, and a
    father's Y haplogroup must agree with his son's. Father-daughter pairs are
    not checked.
    """

    def haplogroups_agree(haplogroup1: str, haplogroup2: str) -> bool:
        # A haplogroup containing "*" is treated as a prefix: it agrees with
        # anything that starts with it once trailing "*" are removed.
        first_is_partial = "*" in haplogroup1
        second_is_partial = "*" in haplogroup2
        if not first_is_partial and not second_is_partial:
            return haplogroup1 == haplogroup2
        if second_is_partial and haplogroup1.startswith(haplogroup2.rstrip("*")):
            return True
        if first_is_partial and haplogroup2.startswith(haplogroup1.rstrip("*")):
            return True
        return False

    for parent, child in self.get_parent_child_pairs():
        parent_data = self.get_data(parent)
        parent_sex = parent_data["sex"]
        if parent_sex == "F":
            # Mitochondrial DNA comes from the mother
            haplogroup_key = "mt_haplogroup"
        elif parent_sex == "M" and self.get_data(child)["sex"] == "M":
            # The Y chromosome passes from father to son
            haplogroup_key = "y_haplogroup"
        else:
            continue
        if not haplogroups_agree(parent_data[haplogroup_key], self.get_data(child)[haplogroup_key]):
            return False
    return True
|
|
590
|
+
|
|
591
|
+
def validate_can_have_children(self) -> bool:
    """
    Checks that no named node flagged as unable to have children has any.
    Placeholder nodes are not inspected.
    """
    return not any(
        len(self.get_children(name)) > 0 and not self.get_data(name)["can_have_children"]
        for name in self.get_non_placeholder_nodes()
    )
|
|
599
|
+
|
|
600
|
+
def validate_inbreeding(self) -> bool:
    """
    Checks that no named node flagged as not inbred has parents that are
    related to each other. Placeholder nodes are not inspected.
    """
    related_pairs = self.get_related_pairs()
    for name in self.get_non_placeholder_nodes():
        if self.get_data(name)["can_be_inbred"]:
            continue
        father = self.get_father(name)
        mother = self.get_mother(name)
        # The pair may be stored in either order
        parents_related = (father, mother) in related_pairs or (mother, father) in related_pairs
        if parents_related:
            return False
    return True
|
|
612
|
+
|
|
613
|
+
def validate_years_before_present(self) -> bool:
|
|
614
|
+
"""
|
|
615
|
+
Validates that nodes do not postdate their descendants.
|
|
616
|
+
"""
|
|
617
|
+
leaf_nodes: list[str] = [node for node in self.node_to_data if not self.get_children(node)]
|
|
618
|
+
|
|
619
|
+
def dfs(node: str, curr_years_before_present: float) -> bool:
|
|
620
|
+
years_before_present = self.get_data(node)["years_before_present"]
|
|
621
|
+
if not math.isnan(years_before_present):
|
|
622
|
+
# Node postdates its descendants
|
|
623
|
+
if years_before_present < curr_years_before_present:
|
|
624
|
+
return False
|
|
625
|
+
else:
|
|
626
|
+
curr_years_before_present = years_before_present
|
|
627
|
+
|
|
628
|
+
if self.get_father(node):
|
|
629
|
+
assert self.get_mother(node)
|
|
630
|
+
if not dfs(self.get_father(node), curr_years_before_present):
|
|
631
|
+
return False
|
|
632
|
+
if not dfs(self.get_mother(node), curr_years_before_present):
|
|
633
|
+
return False
|
|
634
|
+
return True
|
|
635
|
+
|
|
636
|
+
for node in leaf_nodes:
|
|
637
|
+
if not dfs(node, float("-inf")):
|
|
638
|
+
return False
|
|
639
|
+
return True
|
|
640
|
+
|
|
641
|
+
def validate_forced_constraints(
|
|
642
|
+
self, pair_to_relations_so_far: defaultdict[tuple[str, str], list[tuple[str, str, bool]]]
|
|
643
|
+
) -> bool:
|
|
644
|
+
"""
|
|
645
|
+
Validates that forced constraints so far are present in the pedigree.
|
|
646
|
+
Note: Additional relations between two nodes are allowed as long as the forced constraints are present.
|
|
647
|
+
"""
|
|
648
|
+
for (node1, node2), degree_constraints in pair_to_relations_so_far.items():
|
|
649
|
+
for _, constraints, force_constraints in degree_constraints:
|
|
650
|
+
if force_constraints and not self.is_relation_in_pedigree(node1, node2, constraints.split(";")):
|
|
651
|
+
return False
|
|
652
|
+
return True
|
|
653
|
+
|
|
654
|
+
def count_inconsistencies(
    self,
    pair_to_constraints: defaultdict[tuple[str, str], list[tuple[str, ...]]],
    pair_to_relations_so_far: defaultdict[tuple[str, str], list[tuple[str, str, bool]]],
    check_half_siblings: bool,
) -> tuple[int, list[tuple[str, str, str, str]]]:
    """
    Validates this tree based on the input relation data.

    Two kinds of inconsistencies ("strikes") are logged:
    - "+<degree>": a 1st-/2nd-degree relation between two non-placeholder nodes exists in the
      pedigree but no unused entry of pair_to_constraints allows it (extraneous relation).
    - "-<degree>": an entry of pair_to_relations_so_far is not present in the pedigree
      (dropped relation).

    If check_half_siblings is False, don't check for extraneous half-sibling relations
    because the 2 non-shared parents might be merged later.

    pair_to_constraints maps a node pair to a list of allowed-relation tuples; each pair may appear
    in one orientation only. Each value of pair_to_relations_so_far is a list of
    (degree, ";"-separated constraints, flag) tuples; the flag is not used here.

    Returns count of inconsistencies with the input data as well as a log of the inconsistencies.
    Log entries are (node1, node2, "+"/"-" followed by the degree, constraints).
    Note: pair_to_constraints values must be sorted by increasing length
    so that specific constraints are checked first.
    """
    for node1, node2 in pair_to_constraints:
        # Ensure no duplicate/symmetric entries
        assert (node2, node1) not in pair_to_constraints

    # Lookup tables are built once per call rather than once per validated relation
    relation_to_degree = {
        "parent-child": "1",
        "child-parent": "1",
        "siblings": "1",
        "maternal aunt/uncle-nephew/niece": "2",
        "maternal nephew/niece-aunt/uncle": "2",
        "paternal aunt/uncle-nephew/niece": "2",
        "paternal nephew/niece-aunt/uncle": "2",
        "maternal grandparent-grandchild": "2",
        "maternal grandchild-grandparent": "2",
        "paternal grandparent-grandchild": "2",
        "paternal grandchild-grandparent": "2",
        "maternal half-siblings": "2",
        "paternal half-siblings": "2",
        "double cousins": "2",
    }
    flipped_relations = {
        "parent-child": "child-parent",
        "child-parent": "parent-child",
        "maternal aunt/uncle-nephew/niece": "maternal nephew/niece-aunt/uncle",
        "paternal aunt/uncle-nephew/niece": "paternal nephew/niece-aunt/uncle",
        "maternal nephew/niece-aunt/uncle": "maternal aunt/uncle-nephew/niece",
        "paternal nephew/niece-aunt/uncle": "paternal aunt/uncle-nephew/niece",
        "maternal grandparent-grandchild": "maternal grandchild-grandparent",
        "paternal grandparent-grandchild": "paternal grandchild-grandparent",
        "maternal grandchild-grandparent": "maternal grandparent-grandchild",
        "paternal grandchild-grandparent": "paternal grandparent-grandchild",
        "siblings": "siblings",  # Symmetric
        "maternal half-siblings": "maternal half-siblings",  # Symmetric
        "paternal half-siblings": "paternal half-siblings",  # Symmetric
        "double cousins": "double cousins",  # Symmetric
    }

    # Marks which entries in pair_to_constraints have been seen/used
    pair_to_constraints_seen_entries: defaultdict[tuple[str, str], set[int]] = defaultdict(set)

    def is_relation_in_input_data(node1: str, node2: str, relation: str) -> bool:
        # True if some not-yet-used constraint entry for (node1, node2) allows this relation
        if (node1, node2) in pair_to_constraints:
            seen = pair_to_constraints_seen_entries[(node1, node2)]
            for idx, constraints in enumerate(pair_to_constraints[(node1, node2)]):
                if idx not in seen and relation in constraints:
                    return True
        return False

    def remove_relation_from_input_data(node1: str, node2: str, relation: str) -> None:
        # Consume the first *unused* entry allowing this relation. Skipping used entries matters:
        # otherwise a used entry would be re-marked and a later matching entry could be matched
        # an unlimited number of times.
        if (node1, node2) in pair_to_constraints:
            seen = pair_to_constraints_seen_entries[(node1, node2)]
            for idx, constraints in enumerate(pair_to_constraints[(node1, node2)]):
                if idx not in seen and relation in constraints:
                    seen.add(idx)
                    break

    def validate_relation(
        node1: str, node2: str, relation: str, strike_log: list[tuple[str, str, str, str]]
    ) -> None:
        # The pair may be stored in either orientation, so also look up the flipped relation
        flipped = flipped_relations[relation]
        if not is_relation_in_input_data(node1, node2, relation) and not is_relation_in_input_data(
            node2, node1, flipped
        ):
            # Log extraneous relations with the nodes in sorted order
            if node1 < node2:
                strike_log.append((node1, node2, f"+{relation_to_degree[relation]}", ""))
            else:
                strike_log.append((node2, node1, f"+{relation_to_degree[relation]}", ""))
        remove_relation_from_input_data(node1, node2, relation)
        remove_relation_from_input_data(node2, node1, flipped)

    strike_log: list[tuple[str, str, str, str]] = []  # (node1, node2, +/- relation degree, constraints)
    # Check that relations in the pedigree are present in the input data
    for parent, child in self.get_parent_child_pairs(include_placeholders=False):
        validate_relation(parent, child, "parent-child", strike_log)
    for sibling1, sibling2 in self.get_sibling_pairs(include_placeholders=False):
        validate_relation(sibling1, sibling2, "siblings", strike_log)

    for aunt_uncle, nephew_niece in self.get_aunt_uncle_nephew_niece_pairs(
        include_placeholders=False, shared_relative_sex="F"
    ):
        validate_relation(aunt_uncle, nephew_niece, "maternal aunt/uncle-nephew/niece", strike_log)
    for aunt_uncle, nephew_niece in self.get_aunt_uncle_nephew_niece_pairs(
        include_placeholders=False, shared_relative_sex="M"
    ):
        validate_relation(aunt_uncle, nephew_niece, "paternal aunt/uncle-nephew/niece", strike_log)

    for grandparent, grandchild in self.get_grandparent_grandchild_pairs(
        include_placeholders=False, shared_relative_sex="F"
    ):
        validate_relation(grandparent, grandchild, "maternal grandparent-grandchild", strike_log)
    for grandparent, grandchild in self.get_grandparent_grandchild_pairs(
        include_placeholders=False, shared_relative_sex="M"
    ):
        validate_relation(grandparent, grandchild, "paternal grandparent-grandchild", strike_log)
    for double_cousin1, double_cousin2 in self.get_double_cousin_pairs(include_placeholders=False):
        validate_relation(double_cousin1, double_cousin2, "double cousins", strike_log)

    if check_half_siblings:
        for half_sibling1, half_sibling2 in self.get_half_sibling_pairs(
            include_placeholders=False, shared_relative_sex="F"
        ):
            validate_relation(half_sibling1, half_sibling2, "maternal half-siblings", strike_log)
        for half_sibling1, half_sibling2 in self.get_half_sibling_pairs(
            include_placeholders=False, shared_relative_sex="M"
        ):
            validate_relation(half_sibling1, half_sibling2, "paternal half-siblings", strike_log)

    # Check for "dropped" input relations
    # Note: We use constrained relations instead of all relations because we want to catch half-siblings
    # that explicitly should be some other relation even when check_half_siblings is False
    # The purpose of check_half_siblings is to avoid marking *incidental* half-siblings,
    # not half-siblings that should be something else
    for (node1, node2), degrees_constraints in pair_to_relations_so_far.items():
        # If only one input relation between these two nodes, simple check is much faster
        if len(degrees_constraints) == 1:
            degree, constraints, _ = degrees_constraints[0]
            if not self.is_relation_in_pedigree(node1, node2, constraints.split(";")):
                strike_log.append((node1, node2, f"-{degree}", constraints))
        else:
            pedigree_shared_relations: defaultdict[str, int] = self.get_relations_between_nodes(
                node1, node2, include_maternal_paternal=True
            )
            for degree, constraints, _ in degrees_constraints:
                present_flag = False
                for constraint in constraints.split(";"):
                    if constraint in pedigree_shared_relations:
                        # Each pedigree relation can satisfy only one input relation
                        present_flag = True
                        pedigree_shared_relations[constraint] -= 1
                        if pedigree_shared_relations[constraint] == 0:
                            del pedigree_shared_relations[constraint]
                        break
                if not present_flag:
                    strike_log.append((node1, node2, f"-{degree}", constraints))

    # Count # of strikes (will not equal len(strike_log) because we don't want to double-count *changed* relations)
    # A "+" strike that offsets an earlier "-" strike for the same pair (or vice versa) is not counted again
    strike_count: int = 0
    node_pair_strike_balances: defaultdict[tuple[str, str], int] = defaultdict(int)
    for node1, node2, strike, _ in strike_log:
        if strike[0] == "+":
            if node_pair_strike_balances[(node1, node2)] >= 0:
                strike_count += 1
            node_pair_strike_balances[(node1, node2)] += 1
            node_pair_strike_balances[(node2, node1)] += 1

        elif strike[0] == "-":
            if node_pair_strike_balances[(node1, node2)] <= 0:
                strike_count += 1
            node_pair_strike_balances[(node1, node2)] -= 1
            node_pair_strike_balances[(node2, node1)] -= 1
    return strike_count, strike_log
|
|
813
|
+
|
|
814
|
+
def count_third_degree_inconsistencies(
    self, pair_to_constraints: defaultdict[tuple[str, str], list[tuple[str, ...]]]
) -> int:
    """
    Counts only one-sided inconsistencies in third-degree relations.
    Used as a "tie-breaker" for 1st- and 2nd-degree inconsistencies.

    "One-sided" means only third-degree relations that exist between non-placeholder nodes in the
    pedigree but are not allowed by an unused entry of pair_to_constraints are counted;
    input relations missing from the pedigree are not.

    pair_to_constraints maps a node pair to a list of allowed-relation tuples; each pair may appear
    in one orientation only.
    """
    for node1, node2 in pair_to_constraints:
        # Ensure no duplicate/symmetric entries
        assert (node2, node1) not in pair_to_constraints

    # Built once per call rather than once per validated relation
    flipped_relations = {
        "half aunt/uncle-half nephew/niece": "half nephew/niece-half aunt/uncle",
        "half nephew/niece-half aunt/uncle": "half aunt/uncle-half nephew/niece",
        "greatgrandparent-greatgrandchild": "greatgrandchild-greatgrandparent",
        "greatgrandchild-greatgrandparent": "greatgrandparent-greatgrandchild",
        "grandaunt/granduncle-grandnephew/grandniece": "grandnephew/grandniece-grandaunt/granduncle",
        "grandnephew/grandniece-grandaunt/granduncle": "grandaunt/granduncle-grandnephew/grandniece",
        "first cousins": "first cousins",  # Symmetric
    }

    # Marks which entries in pair_to_constraints have been seen/used
    pair_to_constraints_seen_entries: defaultdict[tuple[str, str], set[int]] = defaultdict(set)

    def is_relation_in_input_data(node1: str, node2: str, relation: str) -> bool:
        # True if some not-yet-used constraint entry for (node1, node2) allows this relation
        if (node1, node2) in pair_to_constraints:
            seen = pair_to_constraints_seen_entries[(node1, node2)]
            for idx, constraints in enumerate(pair_to_constraints[(node1, node2)]):
                if idx not in seen and relation in constraints:
                    return True
        return False

    def remove_relation_from_input_data(node1: str, node2: str, relation: str) -> None:
        # Consume the first *unused* entry allowing this relation. The same pair can be validated
        # several times below, so re-marking an already used entry would let a later matching
        # entry absorb any number of pedigree relations.
        if (node1, node2) in pair_to_constraints:
            seen = pair_to_constraints_seen_entries[(node1, node2)]
            for idx, constraints in enumerate(pair_to_constraints[(node1, node2)]):
                if idx not in seen and relation in constraints:
                    seen.add(idx)
                    break

    def validate_relation(node1: str, node2: str, relation: str) -> bool:
        # Returns True (one strike) if the pedigree relation is not allowed by the input data
        flipped = flipped_relations[relation]
        is_extraneous = not is_relation_in_input_data(node1, node2, relation) and not is_relation_in_input_data(
            node2, node1, flipped
        )
        remove_relation_from_input_data(node1, node2, relation)
        remove_relation_from_input_data(node2, node1, flipped)
        return is_extraneous

    # Double cousins are also twice-first cousins, so don't count the first cousin relations separately
    accounted_cousin_pairs: defaultdict[tuple[str, str], int] = defaultdict(int)
    for double_cousin1, double_cousin2 in self.get_double_cousin_pairs(include_placeholders=False):
        accounted_cousin_pairs[(double_cousin1, double_cousin2)] += 2
        accounted_cousin_pairs[(double_cousin2, double_cousin1)] += 2

    strike_count: int = 0
    for half_aunt_uncle, half_nephew_niece in self.get_half_aunt_uncle_nephew_niece_pairs(
        include_placeholders=False
    ):
        strike_count += validate_relation(half_aunt_uncle, half_nephew_niece, "half aunt/uncle-half nephew/niece")
    for greatgrandparent, greatgrandchild in self.get_greatgrandparent_greatgrandchild_pairs(
        include_placeholders=False
    ):
        strike_count += validate_relation(greatgrandparent, greatgrandchild, "greatgrandparent-greatgrandchild")
    for grandaunt_granduncle, grandnephew_grandniece in self.get_grandaunt_granduncle_grandnephew_grandniece_pairs(
        include_placeholders=False
    ):
        strike_count += validate_relation(
            grandaunt_granduncle, grandnephew_grandniece, "grandaunt/granduncle-grandnephew/grandniece"
        )
    for first_cousin1, first_cousin2 in self.get_first_cousin_pairs(include_placeholders=False):
        # Only count cousin relation if not already accounted for by a double cousin relation
        if accounted_cousin_pairs[(first_cousin1, first_cousin2)] > 0:
            accounted_cousin_pairs[(first_cousin1, first_cousin2)] -= 1
            accounted_cousin_pairs[(first_cousin2, first_cousin1)] -= 1
        else:
            strike_count += validate_relation(first_cousin1, first_cousin2, "first cousins")
    return strike_count
|
|
889
|
+
|
|
890
|
+
def is_relation_in_pedigree(self, node1: str, node2: str, relations_list: list[str]) -> bool:
|
|
891
|
+
"""
|
|
892
|
+
Returns True if *any* of the relations in relations_list are present between node1 and node2 in the pedigree.
|
|
893
|
+
"""
|
|
894
|
+
assert node1 in self.node_to_data and node2 in self.node_to_data
|
|
895
|
+
|
|
896
|
+
for relation in relations_list:
|
|
897
|
+
if relation == "parent-child":
|
|
898
|
+
if node2 in self.get_children(node1):
|
|
899
|
+
return True
|
|
900
|
+
if relation == "child-parent":
|
|
901
|
+
if node1 in self.get_children(node2):
|
|
902
|
+
return True
|
|
903
|
+
if relation == "siblings":
|
|
904
|
+
if node2 in self.get_siblings(node1):
|
|
905
|
+
assert node1 in self.get_siblings(node2)
|
|
906
|
+
return True
|
|
907
|
+
|
|
908
|
+
if relation == "aunt/uncle-nephew/niece":
|
|
909
|
+
for sibling in self.get_siblings(node1):
|
|
910
|
+
if node2 in self.get_children(sibling):
|
|
911
|
+
return True
|
|
912
|
+
if relation == "nephew/niece-aunt/uncle":
|
|
913
|
+
for sibling in self.get_siblings(node2):
|
|
914
|
+
if node1 in self.get_children(sibling):
|
|
915
|
+
return True
|
|
916
|
+
if relation == "grandparent-grandchild":
|
|
917
|
+
for child in self.get_children(node1):
|
|
918
|
+
if node2 in self.get_children(child):
|
|
919
|
+
return True
|
|
920
|
+
if relation == "grandchild-grandparent":
|
|
921
|
+
for child in self.get_children(node2):
|
|
922
|
+
if node1 in self.get_children(child):
|
|
923
|
+
return True
|
|
924
|
+
if relation == "half-siblings":
|
|
925
|
+
if self.get_father(node2):
|
|
926
|
+
if node1 in self.get_children(self.get_father(node2)) and self.get_mother(node1) != self.get_mother(
|
|
927
|
+
node2
|
|
928
|
+
):
|
|
929
|
+
return True
|
|
930
|
+
if self.get_mother(node2):
|
|
931
|
+
if node1 in self.get_children(self.get_mother(node2)) and self.get_father(node1) != self.get_father(
|
|
932
|
+
node2
|
|
933
|
+
):
|
|
934
|
+
return True
|
|
935
|
+
if relation == "double cousins":
|
|
936
|
+
father1 = self.get_father(node1)
|
|
937
|
+
mother1 = self.get_mother(node1)
|
|
938
|
+
father2 = self.get_father(node2)
|
|
939
|
+
mother2 = self.get_mother(node2)
|
|
940
|
+
if not father1 or not mother1 or not father2 or not mother2:
|
|
941
|
+
continue
|
|
942
|
+
fathers_are_siblings = father2 in self.get_siblings(father1)
|
|
943
|
+
mothers_are_siblings = mother1 in self.get_siblings(mother2)
|
|
944
|
+
cross_parents_are_siblings = father2 in self.get_siblings(mother1) and father1 in self.get_siblings(
|
|
945
|
+
mother2
|
|
946
|
+
)
|
|
947
|
+
if (fathers_are_siblings and mothers_are_siblings) or cross_parents_are_siblings:
|
|
948
|
+
return True
|
|
949
|
+
|
|
950
|
+
if relation == "maternal aunt/uncle-nephew/niece":
|
|
951
|
+
for sibling in self.get_siblings(node1):
|
|
952
|
+
if self.get_data(sibling)["sex"] == "F" and node2 in self.get_children(sibling):
|
|
953
|
+
return True
|
|
954
|
+
if relation == "paternal aunt/uncle-nephew/niece":
|
|
955
|
+
for sibling in self.get_siblings(node1):
|
|
956
|
+
if self.get_data(sibling)["sex"] == "M" and node2 in self.get_children(sibling):
|
|
957
|
+
return True
|
|
958
|
+
if relation == "maternal nephew/niece-aunt/uncle":
|
|
959
|
+
for sibling in self.get_siblings(node2):
|
|
960
|
+
if self.get_data(sibling)["sex"] == "F" and node1 in self.get_children(sibling):
|
|
961
|
+
return True
|
|
962
|
+
if relation == "paternal nephew/niece-aunt/uncle":
|
|
963
|
+
for sibling in self.get_siblings(node2):
|
|
964
|
+
if self.get_data(sibling)["sex"] == "M" and node1 in self.get_children(sibling):
|
|
965
|
+
return True
|
|
966
|
+
|
|
967
|
+
if relation == "maternal grandparent-grandchild":
|
|
968
|
+
for child in self.get_children(node1):
|
|
969
|
+
if self.get_data(child)["sex"] == "F" and node2 in self.get_children(child):
|
|
970
|
+
return True
|
|
971
|
+
if relation == "paternal grandparent-grandchild":
|
|
972
|
+
for child in self.get_children(node1):
|
|
973
|
+
if self.get_data(child)["sex"] == "M" and node2 in self.get_children(child):
|
|
974
|
+
return True
|
|
975
|
+
if relation == "maternal grandchild-grandparent":
|
|
976
|
+
for child in self.get_children(node2):
|
|
977
|
+
if self.get_data(child)["sex"] == "F" and node1 in self.get_children(child):
|
|
978
|
+
return True
|
|
979
|
+
if relation == "paternal grandchild-grandparent":
|
|
980
|
+
for child in self.get_children(node2):
|
|
981
|
+
if self.get_data(child)["sex"] == "M" and node1 in self.get_children(child):
|
|
982
|
+
return True
|
|
983
|
+
|
|
984
|
+
if relation == "paternal half-siblings":
|
|
985
|
+
if self.get_father(node2):
|
|
986
|
+
if node1 in self.get_children(self.get_father(node2)) and self.get_mother(node1) != self.get_mother(
|
|
987
|
+
node2
|
|
988
|
+
):
|
|
989
|
+
return True
|
|
990
|
+
if relation == "maternal half-siblings":
|
|
991
|
+
if self.get_mother(node2):
|
|
992
|
+
if node1 in self.get_children(self.get_mother(node2)) and self.get_father(node1) != self.get_father(
|
|
993
|
+
node2
|
|
994
|
+
):
|
|
995
|
+
return True
|
|
996
|
+
return False
|
|
997
|
+
|
|
998
|
+
def get_relations_between_nodes(
    self, node1: str, node2: str, include_maternal_paternal: bool = False
) -> defaultdict[str, int]:
    """
    Returns a dictionary of the *1st- and 2nd-degree* relations between node1 and node2.

    Keys are relation names (node1's role first) and values are how many times the relation holds;
    relations that do not hold are absent. With include_maternal_paternal=True the maternal and
    paternal variants are reported separately; otherwise they are summed under the
    unqualified relation name.
    """
    # Relations that come in a maternal and a paternal flavour
    sided_relations = (
        "aunt/uncle-nephew/niece",
        "nephew/niece-aunt/uncle",
        "grandparent-grandchild",
        "grandchild-grandparent",
        "half-siblings",
    )
    relations_to_probe: list[str] = ["parent-child", "child-parent", "siblings"]
    for sided_relation in sided_relations:
        relations_to_probe.append(f"maternal {sided_relation}")
        relations_to_probe.append(f"paternal {sided_relation}")
    relations_to_probe.append("double cousins")

    relations: defaultdict[str, int] = defaultdict(int)
    for relation in relations_to_probe:
        if self.is_relation_in_pedigree(node1, node2, [relation]):
            relations[relation] += 1

    if not include_maternal_paternal:
        # Fold each maternal/paternal pair into a single unqualified count
        for sided_relation in sided_relations:
            maternal_count = relations.pop(f"maternal {sided_relation}", 0)
            paternal_count = relations.pop(f"paternal {sided_relation}", 0)
            relations[sided_relation] = maternal_count + paternal_count

    # Drop relations that are not present at all
    for absent_relation in [relation for relation, count in relations.items() if count == 0]:
        del relations[absent_relation]
    return relations
|
|
1073
|
+
|
|
1074
|
+
def get_parent_child_pairs(self, include_placeholders: bool = True) -> list[tuple[str, str]]:
    """
    Gets all (parent, child) pairs in the tree.

    If include_placeholders is False, pairs involving a placeholder node
    (a node whose name is numeric) are left out.
    """
    return [
        (parent, child)
        for parent in self.node_to_children
        for child in self.get_children(parent)
        if include_placeholders or not (parent.isnumeric() or child.isnumeric())
    ]
|
|
1084
|
+
|
|
1085
|
+
def get_sibling_pairs(self, include_placeholders: bool = True) -> list[tuple[str, str]]:
    """
    Gets all (sibling, sibling) pairs in the tree.
    Note: Only gets *full* siblings. See self.get_half_sibling_pairs().

    Each unordered pair is returned once. If include_placeholders is False, pairs involving
    a placeholder node (a node whose name is numeric) are left out.
    """
    sibling_pairs: list[tuple[str, str]] = []
    # Mirror of sibling_pairs for constant-time duplicate checks (a list scan is quadratic overall)
    added_pairs: set[tuple[str, str]] = set()
    for sibling1 in self.node_to_siblings:
        for sibling2 in self.get_siblings(sibling1):
            if not include_placeholders and (sibling1.isnumeric() or sibling2.isnumeric()):
                continue
            # Don't add symmetric duplicates
            if (sibling2, sibling1) not in added_pairs:
                sibling_pairs.append((sibling1, sibling2))
                added_pairs.add((sibling1, sibling2))
    return sibling_pairs
|
|
1098
|
+
|
|
1099
|
+
def get_aunt_uncle_nephew_niece_pairs(
|
|
1100
|
+
self, include_placeholders: bool = True, shared_relative_sex: str | None = None
|
|
1101
|
+
) -> list[tuple[str, str]]:
|
|
1102
|
+
"""
|
|
1103
|
+
Gets all (aunt/uncle, nephew/niece) pairs in the tree.
|
|
1104
|
+
Includes duplicates if, for example, shared_relative_sex=None and an aunt is
|
|
1105
|
+
both a maternal and paternal aunt to a nephew (i.e., full-sib mating).
|
|
1106
|
+
"""
|
|
1107
|
+
aunt_uncle_nephew_niece_pairs: list[tuple[str, str]] = []
|
|
1108
|
+
for parent, child in self.get_parent_child_pairs():
|
|
1109
|
+
for parent_sibling in self.get_siblings(parent):
|
|
1110
|
+
if not shared_relative_sex or self.get_data(parent)["sex"] == shared_relative_sex:
|
|
1111
|
+
if include_placeholders or (not parent_sibling.isnumeric() and not child.isnumeric()):
|
|
1112
|
+
aunt_uncle_nephew_niece_pairs.append((parent_sibling, child))
|
|
1113
|
+
return aunt_uncle_nephew_niece_pairs
|
|
1114
|
+
|
|
1115
|
+
def get_grandparent_grandchild_pairs(
|
|
1116
|
+
self, include_placeholders: bool = True, shared_relative_sex: str | None = None
|
|
1117
|
+
) -> list[tuple[str, str]]:
|
|
1118
|
+
"""
|
|
1119
|
+
Gets all (grandparent, grandchild) pairs in the tree.
|
|
1120
|
+
Includes duplicates if, for example, a grandparent is both a maternal and paternal grandparent to a grandchild.
|
|
1121
|
+
"""
|
|
1122
|
+
grandparent_grandchild_pairs: list[tuple[str, str]] = []
|
|
1123
|
+
for parent, child in self.get_parent_child_pairs():
|
|
1124
|
+
for child_child in self.get_children(child):
|
|
1125
|
+
if not shared_relative_sex or self.get_data(child)["sex"] == shared_relative_sex:
|
|
1126
|
+
if include_placeholders or (not parent.isnumeric() and not child_child.isnumeric()):
|
|
1127
|
+
grandparent_grandchild_pairs.append((parent, child_child))
|
|
1128
|
+
return grandparent_grandchild_pairs
|
|
1129
|
+
|
|
1130
|
+
def get_half_sibling_pairs(
|
|
1131
|
+
self, include_placeholders: bool = True, shared_relative_sex: str | None = None
|
|
1132
|
+
) -> list[tuple[str, str]]:
|
|
1133
|
+
"""
|
|
1134
|
+
Gets all (half-sibling, half-sibling) pairs in the tree.
|
|
1135
|
+
"""
|
|
1136
|
+
half_sibling_pairs: list[tuple[str, str]] = []
|
|
1137
|
+
for parent, child in self.get_parent_child_pairs():
|
|
1138
|
+
for other_child in self.get_children(parent):
|
|
1139
|
+
if child != other_child and other_child not in self.get_siblings(child):
|
|
1140
|
+
if not shared_relative_sex or self.get_data(parent)["sex"] == shared_relative_sex:
|
|
1141
|
+
if include_placeholders or (not child.isnumeric() and not other_child.isnumeric()):
|
|
1142
|
+
# Don't add symmetric duplicates
|
|
1143
|
+
if (other_child, child) not in half_sibling_pairs:
|
|
1144
|
+
half_sibling_pairs.append((child, other_child))
|
|
1145
|
+
return half_sibling_pairs
|
|
1146
|
+
|
|
1147
|
+
def get_double_cousin_pairs(self, include_placeholders: bool = True) -> list[tuple[str, str]]:
    """
    Gets all (double cousin, double cousin) pairs in the tree.

    Two first cousins are double cousins when both parents of each are known and either
    their fathers are siblings and their mothers are siblings, or each one's father is a sibling
    of the other's mother. Each unordered pair is returned exactly once.
    include_placeholders is forwarded to self.get_first_cousin_pairs().
    """
    double_cousin_pairs: list[tuple[str, str]] = []
    recorded_pairs: set[tuple[str, str]] = set()
    for cousin1, cousin2 in self.get_first_cousin_pairs(include_placeholders=include_placeholders):
        # get_first_cousin_pairs() can list the same pair more than once (double cousins are
        # first cousins through two parent pairs), so skip pairs recorded in either orientation
        if (cousin1, cousin2) in recorded_pairs or (cousin2, cousin1) in recorded_pairs:
            continue

        father1, mother1 = self.get_father(cousin1), self.get_mother(cousin1)
        father2, mother2 = self.get_father(cousin2), self.get_mother(cousin2)
        # Need both parents to be known to determine double cousins
        if not father1 or not mother1 or not father2 or not mother2:
            continue

        fathers_are_siblings = father2 in self.get_siblings(father1)
        mothers_are_siblings = mother2 in self.get_siblings(mother1)
        cross_parent_siblings = mother2 in self.get_siblings(father1) and father2 in self.get_siblings(mother1)

        if (fathers_are_siblings and mothers_are_siblings) or cross_parent_siblings:
            double_cousin_pairs.append((cousin1, cousin2))
            recorded_pairs.add((cousin1, cousin2))
    return double_cousin_pairs
|
|
1171
|
+
|
|
1172
|
+
def get_half_aunt_uncle_nephew_niece_pairs(self, include_placeholders: bool = True) -> list[tuple[str, str]]:
    """
    Gets all (half-aunt/half-uncle, half-nephew/half-niece) pairs in the tree.

    For every half-sibling pair, each half-sibling is paired with the other one's children.
    If include_placeholders is False, pairs whose half-aunt/uncle or half-nephew/niece is a
    placeholder (numeric name) are left out.
    """
    half_aunt_uncle_nephew_niece_pairs: list[tuple[str, str]] = []
    for half_sibling1, half_sibling2 in self.get_half_sibling_pairs():
        # Handle both directions: (parent of the nephew/niece, the half-aunt/uncle)
        for linking_parent, half_aunt_uncle in ((half_sibling1, half_sibling2), (half_sibling2, half_sibling1)):
            for half_nephew_niece in self.get_children(linking_parent):
                if half_nephew_niece == half_aunt_uncle:
                    continue
                if not include_placeholders and (half_aunt_uncle.isnumeric() or half_nephew_niece.isnumeric()):
                    continue
                half_aunt_uncle_nephew_niece_pairs.append((half_aunt_uncle, half_nephew_niece))
    return half_aunt_uncle_nephew_niece_pairs
|
|
1188
|
+
|
|
1189
|
+
def get_greatgrandparent_greatgrandchild_pairs(self, include_placeholders: bool = True) -> list[tuple[str, str]]:
    """
    Gets all (greatgrandparent, greatgrandchild) pairs in the tree.

    Built by extending every (grandparent, grandchild) pair by one more generation.
    If include_placeholders is False, pairs whose greatgrandparent or greatgrandchild is a
    placeholder (numeric name) are left out.
    """
    return [
        (greatgrandparent, greatgrandchild)
        for greatgrandparent, middle_descendant in self.get_grandparent_grandchild_pairs()
        for greatgrandchild in self.get_children(middle_descendant)
        if include_placeholders or not (greatgrandparent.isnumeric() or greatgrandchild.isnumeric())
    ]
|
|
1199
|
+
|
|
1200
|
+
def get_grandaunt_granduncle_grandnephew_grandniece_pairs(
    self, include_placeholders: bool = True
) -> list[tuple[str, str]]:
    """
    Gets all (grandaunt/uncle, grandnephew/niece) pairs in the tree.

    Built by pairing every aunt/uncle with the children of their nephews/nieces.
    If include_placeholders is False, pairs whose grandaunt/uncle or grandnephew/niece is a
    placeholder (numeric name) are left out.
    """
    return [
        (grandaunt_granduncle, grandnephew_grandniece)
        for grandaunt_granduncle, middle_descendant in self.get_aunt_uncle_nephew_niece_pairs()
        for grandnephew_grandniece in self.get_children(middle_descendant)
        if include_placeholders or not (grandaunt_granduncle.isnumeric() or grandnephew_grandniece.isnumeric())
    ]
|
|
1212
|
+
|
|
1213
|
+
def get_first_cousin_pairs(self, include_placeholders: bool = True) -> list[tuple[str, str]]:
    """
    Gets all (first cousin, first cousin) pairs in the tree.

    Built by pairing every nephew/niece with the children of their aunts/uncles. A pair is never
    returned in both orientations, but the same orientation can appear more than once when the
    cousins are linked through more than one aunt/uncle (e.g. double cousins appear twice).
    If include_placeholders is False, pairs involving a placeholder cousin (numeric name) are left out.
    """
    cousin_pairs: list[tuple[str, str]] = []
    # Mirror of cousin_pairs for constant-time duplicate checks (a list scan is quadratic overall)
    added_pairs: set[tuple[str, str]] = set()
    for aunt_uncle, child in self.get_aunt_uncle_nephew_niece_pairs():
        for aunt_uncle_child in self.get_children(aunt_uncle):
            if not include_placeholders and (child.isnumeric() or aunt_uncle_child.isnumeric()):
                continue
            # Don't add symmetric duplicates
            if aunt_uncle_child != child and (aunt_uncle_child, child) not in added_pairs:
                cousin_pairs.append((child, aunt_uncle_child))
                added_pairs.add((child, aunt_uncle_child))
    return cousin_pairs
|
|
1225
|
+
|
|
1226
|
+
def get_related_pairs(self, include_placeholders: bool = True) -> set[tuple[str, str]]:
    """
    Gets all related pairs (up to and including 3rd-degree relations) in the pedigree.

    The result is the union of the pairs from the individual pair getters. Double cousins are not
    queried separately. include_placeholders is forwarded to every getter.
    """
    pair_getters = (
        self.get_parent_child_pairs,
        self.get_sibling_pairs,
        self.get_aunt_uncle_nephew_niece_pairs,
        self.get_grandparent_grandchild_pairs,
        self.get_half_sibling_pairs,
        self.get_half_aunt_uncle_nephew_niece_pairs,
        self.get_greatgrandparent_greatgrandchild_pairs,
        self.get_grandaunt_granduncle_grandnephew_grandniece_pairs,
        self.get_first_cousin_pairs,
    )
    related_pairs: set[tuple[str, str]] = set()
    for get_pairs in pair_getters:
        related_pairs.update(get_pairs(include_placeholders=include_placeholders))
    return related_pairs
|
|
1243
|
+
|
|
1244
|
+
def get_non_placeholder_nodes(self) -> set[str]:
    """
    Gets all non-placeholder nodes in the tree.

    Placeholder nodes are the ones whose name is numeric.
    """
    return {node for node in self.node_to_data if not node.isnumeric()}
|
|
1249
|
+
|
|
1250
|
+
def clean_data(self) -> None:
    """
    Drop empty entries from the relation dictionaries.
    Also drop unnecessary placeholder nodes to standardize topological sort output:
    a pair of placeholder parents is dropped when each has exactly one child and neither has a recorded parent.
    """
    removable_placeholders: set[str] = set()
    for node in self.node_to_data:
        mother, father = self.get_mother(node), self.get_father(node)
        if not (mother.isnumeric() and father.isnumeric()):
            continue
        if len(self.get_children(mother)) != 1 or len(self.get_children(father)) != 1:
            continue
        # Checked in the order: mother's mother, mother's father, father's mother, father's father
        has_recorded_grandparent = any(
            get_parent(parent) for parent in (mother, father) for get_parent in (self.get_mother, self.get_father)
        )
        if not has_recorded_grandparent:
            removable_placeholders.update((mother, father))

    relation_dicts = (self.node_to_father, self.node_to_mother, self.node_to_children, self.node_to_siblings)

    # Forget the removable placeholders themselves
    for placeholder in removable_placeholders:
        for mapping in (self.node_to_data, *relation_dicts):
            mapping.pop(placeholder, None)

    # Detach the remaining nodes from any removed parent
    for node in self.node_to_data:
        assert node not in self.get_siblings(node) and node not in self.get_children(node)
        if self.get_father(node) in removable_placeholders:
            del self.node_to_father[node]
        if self.get_mother(node) in removable_placeholders:
            del self.node_to_mother[node]

    # Purge entries whose value is empty
    for mapping in relation_dicts:
        empty_keys = [key for key, value in mapping.items() if not value]
        for key in empty_keys:
            del mapping[key]
|
|
1295
|
+
|
|
1296
|
+
def plot(
    self,
    path: str,
    mt_haplogroup_to_color: dict[str, str] | dict[str, tuple[float, float, float, float]] | None = None,
    nodes_to_remove: list[str] | None = None,
    edges_to_remove: list[tuple[str, str]] | None = None,
    dotted_edges_to_add: list[tuple[str, str]] | None = None,
    plot_haplogroups: bool = True,
    font_size: float | None = None,
) -> None:
    """
    Plot the pedigree to the given path. Optionally takes a custom mapping of mt_haplogroups to colors.
    Also optionally takes arguments to plot uncertain relations.
    nodes_to_remove is a list of nodes to remove from the plot.
    edges_to_remove is a list of parent-child edges to remove from the plot.
    dotted_edges_to_add is a list of node pairs to plot as dotted lines.
    These arguments can be used in conjunction to replace uncertain relations with dotted lines.
    plot_haplogroups controls whether haplogroups are included in the node labels.
    font_size overrides the label font size, which is otherwise derived from the number of plotted nodes.
    Raises ImportError if PyGraphviz cannot be found.
    """
    # "import importlib" alone does not guarantee that the importlib.util submodule is loaded,
    # so import find_spec explicitly
    from importlib.util import find_spec

    if not find_spec("pygraphviz"):
        raise ImportError("Plotting pedigree requires PyGraphviz (https://pygraphviz.github.io/).")

    tree = nx.from_dict_of_lists(self.node_to_children, create_using=nx.DiGraph)
    # Add childless nodes (nodes already in the tree are left untouched)
    tree.add_nodes_from(self.node_to_data)

    # Replace relations with dotted edges
    if nodes_to_remove:
        tree.remove_nodes_from(nodes_to_remove)
    if edges_to_remove:
        tree.remove_edges_from(edges_to_remove)
    if dotted_edges_to_add:
        tree.add_edges_from(dotted_edges_to_add, style="dotted")

    # Edges without an explicit style are parent-child edges
    parent_child_edges = []
    dotted_edges = []
    for u, v, style in tree.edges.data("style", default="parent_child"):
        if style == "parent_child":
            parent_child_edges.append((u, v))
        elif style == "dotted":
            dotted_edges.append((u, v))

    # Placeholder nodes are labeled with haplogroups only; named nodes are labeled with their ID first
    node_labels = dict()
    for node in tree.nodes:
        label_lines = [] if node.isnumeric() else [node]
        if plot_haplogroups:
            mt_haplogroup = self.get_data(node)["mt_haplogroup"].replace("*", "")[:3]
            y_haplogroup = self.get_data(node)["y_haplogroup"].replace("*", "")[:3]
            label_lines.append(f"MT: {mt_haplogroup}")
            if y_haplogroup:
                label_lines.append(f"Y: {y_haplogroup}")
        node_labels[node] = "\n".join(label_lines)

    # Create colormap for MT haplogroups
    if not mt_haplogroup_to_color:
        cmap = plt.get_cmap("tab20")
        mt_haplogroups = sorted(
            {self.get_data(node)["mt_haplogroup"].replace("*", "") for node in self.node_to_data if not node.isnumeric()}
        )
        mt_haplogroup_to_color = {
            haplogroup: cmap(i / len(mt_haplogroups)) for i, haplogroup in enumerate(mt_haplogroups)
        }

    # Specify alpha here instead of in nx.draw_networkx_nodes so node borders stay opaque
    face_alpha = 0.5
    placeholder_color = to_rgba("#e5e5e5", face_alpha)
    # (nodes, marker shape, face colors), in drawing order:
    # named males, named females, placeholder males, placeholder females
    node_groups = []
    for is_placeholder in (False, True):
        for sex, shape in (("M", "s"), ("F", "o")):
            nodes = [
                node
                for node in tree.nodes
                if self.get_data(node)["sex"] == sex and node.isnumeric() == is_placeholder
            ]
            if is_placeholder:
                colors = [placeholder_color for _ in nodes]
            else:
                colors = [
                    to_rgba(mt_haplogroup_to_color[self.get_data(node)["mt_haplogroup"].replace("*", "")], face_alpha)
                    for node in nodes
                ]
            node_groups.append((nodes, shape, colors))

    # Scale sizes based on pedigree node count (guarding against an empty tree)
    node_size = min(1000, 9000 / max(len(tree.nodes), 1))
    # Matplotlib doesn't allow font size less than 1
    if font_size is None:
        font_size = max(math.sqrt(node_size) / (5 if plot_haplogroups else 4.25), 1)
    line_width = math.sqrt(node_size) / 100

    pos = nx.nx_agraph.graphviz_layout(tree, prog="dot")

    fig = plt.figure(figsize=(12, 4.8), dpi=1200)
    try:
        for nodes, shape, colors in node_groups:
            nx.draw_networkx_nodes(
                tree,
                pos=pos,
                nodelist=nodes,
                node_shape=shape,
                node_size=node_size,
                node_color=colors,
                edgecolors="black",
                linewidths=line_width,
            )
        nx.draw_networkx_labels(tree, pos=pos, labels=node_labels, font_size=font_size)
        nx.draw_networkx_edges(
            tree,
            edgelist=parent_child_edges,
            pos=pos,
            node_shape="s",
            node_size=node_size,
            width=line_width,
            arrowsize=line_width * 30,
            edge_color="black",
        )
        # Setting arrows=False causes edges to overlap their associated nodes for some reason
        nx.draw_networkx_edges(
            tree,
            edgelist=dotted_edges,
            pos=pos,
            node_shape="s",
            node_size=node_size,
            width=line_width * 1.5,
            arrowstyle="-",
            style=(0, (3, 3)),
            edge_color="blue",
        )

        plt.axis("off")
        plt.savefig(path, bbox_inches="tight")
    finally:
        # Always release the figure, even if drawing or saving fails
        plt.close(fig)
|
|
1468
|
+
|
|
1469
|
+
def write_exact_relations(self, path: str) -> None:
    """
    Write the exact relations between every pair of non-placeholder nodes to a CSV file at the given path.
    Pairs are visited in sorted order, and each relation is written as many times as its count.
    """
    named_nodes = sorted(self.get_non_placeholder_nodes())
    with open(path, "w") as file:
        file.write("id1,id2,relation\n")
        for idx, node1 in enumerate(named_nodes):
            # Only pair node1 with the nodes sorted after it, so each pair is visited once
            for node2 in named_nodes[idx + 1 :]:
                relation_counts = self.get_relations_between_nodes(node1, node2, include_maternal_paternal=True)
                for relation, count in relation_counts.items():
                    file.write(f"{node1},{node2},{relation}\n" * count)
|