repare-0.1.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- repare/__init__.py +4 -0
- repare/main.py +89 -0
- repare/pedigree.py +1484 -0
- repare/pedigree_reconstructor.py +1186 -0
- repare-0.1.4.dist-info/METADATA +143 -0
- repare-0.1.4.dist-info/RECORD +10 -0
- repare-0.1.4.dist-info/WHEEL +5 -0
- repare-0.1.4.dist-info/entry_points.txt +2 -0
- repare-0.1.4.dist-info/licenses/LICENSE +7 -0
- repare-0.1.4.dist-info/top_level.txt +1 -0

repare/pedigree_reconstructor.py

@@ -0,0 +1,1186 @@

import copy
import logging
import random
import time
from collections import defaultdict
from pathlib import Path
from typing import Any

import pandas as pd
from tqdm import tqdm

from repare.pedigree import Pedigree

logger = logging.getLogger(__name__)


class PedigreeReconstructor:
    """
    Manages and builds up a collection of potential Pedigrees.
    """

    ALLOWED_CONSTRAINTS: frozenset[str] = frozenset(
        {
            "parent-child",
            "child-parent",
            "siblings",
            "maternal aunt/uncle-nephew/niece",
            "maternal nephew/niece-aunt/uncle",
            "paternal aunt/uncle-nephew/niece",
            "paternal nephew/niece-aunt/uncle",
            "maternal grandparent-grandchild",
            "maternal grandchild-grandparent",
            "paternal grandparent-grandchild",
            "paternal grandchild-grandparent",
            "maternal half-siblings",
            "paternal half-siblings",
            "double cousins",
        }
    )

    @classmethod
    def get_allowed_constraints(cls) -> frozenset[str]:
        """
        Returns the set of allowed constraint strings.
        """
        return cls.ALLOWED_CONSTRAINTS

    def __init__(
        self,
        relations_path: Path | str,
        nodes_path: Path | str,
        outputs_dir: Path | str,
        max_candidate_pedigrees: int = 1000,
        epsilon: float = 0.2,
        plot: bool = True,
        plot_haplogroups: bool = True,
        write_alternate_pedigrees: bool = False,
        random_seed: Any = 42,
    ) -> None:
        relations_path = Path(relations_path)
        nodes_path = Path(nodes_path)
        outputs_dir = Path(outputs_dir)
        self._start_time = time.time()
        self._validate_node_data(nodes_path)
        self._process_node_data()
        self._validate_relation_data(relations_path)
        self._process_relation_data()

        self._outputs_dir = outputs_dir
        # Number of pedigrees to downsample to after each iteration of the algorithm
        self._max_candidate_pedigrees = max_candidate_pedigrees
        # Parameter for epsilon-greedy sampling when pruning pedigrees
        self._epsilon = epsilon
        # Whether to plot the reconstructed pedigree(s)
        self._plot = plot
        # Whether to plot haplogroups of the reconstructed pedigree(s)
        self._plot_haplogroups = plot_haplogroups
        # Whether to write corrected relations and plots of alternate final pedigrees
        self._write_alternate_pedigrees = write_alternate_pedigrees
        self._random_seed = random_seed
        self._rng = random.Random(self._random_seed)
        self._validate_arguments()

        # Maximum number of times to run the algorithm if no valid pedigree is found
        self._MAX_RUNS = 10
        self._candidate_pedigrees: list[Pedigree] = [self._get_initial_pedigree()]
        self._pair_to_constraints: defaultdict[tuple[str, str], list[tuple[str, ...]]] = self._get_pair_to_constraints()
        self._final_pedigrees: list[Pedigree] = []
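
    # Example usage (a hedged sketch, not from the package docs; the file paths
    # below are hypothetical, while the class and method names are defined in
    # this module):
    #
    #     from repare.pedigree_reconstructor import PedigreeReconstructor
    #
    #     reconstructor = PedigreeReconstructor(
    #         relations_path="relations.csv",
    #         nodes_path="nodes.csv",
    #         outputs_dir="outputs",
    #     )
    #     best_pedigree = reconstructor.find_best_pedigree()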

    def _validate_node_data(self, nodes_path: Path | str) -> None:
        """
        Validate node data input.
        """
        self._node_data = pd.read_csv(nodes_path, dtype=str, comment="#", keep_default_na=False)
        for mandatory_column in ["id", "sex", "y_haplogroup", "mt_haplogroup"]:
            if mandatory_column not in self._node_data.columns:
                raise ValueError(f'Column "{mandatory_column}" not found in input node data.')

        for optional_column in ["can_have_children", "can_be_inbred", "years_before_present"]:
            if optional_column not in self._node_data.columns:
                self._node_data[optional_column] = ""

        # Numeric IDs are used for placeholder nodes
        if self._node_data["id"].str.isnumeric().any():
            raise ValueError("Sample IDs cannot be completely numeric.")

        if self._node_data["id"].duplicated().any():
            raise ValueError("Sample IDs must be unique.")

        if self._node_data["id"].str.strip().eq("").any():
            raise ValueError("Sample IDs cannot be empty.")

        if not self._node_data["sex"].isin(["M", "F"]).all():
            raise ValueError('Node sex must be "M" or "F".')

        for haplogroup_column in ["y_haplogroup", "mt_haplogroup"]:
            for haplogroup in self._node_data[haplogroup_column]:
                if "*" in haplogroup[:-1]:
                    raise ValueError(
                        "Expandable haplogroups should contain one trailing asterisk. "
                        "No other asterisks are allowed in haplogroups."
                    )

        if not self._node_data["can_have_children"].isin(["True", "False", ""]).all():
            raise ValueError('can_have_children value must be "True", "False", or empty.')
        if not self._node_data["can_be_inbred"].isin(["True", "False", ""]).all():
            raise ValueError('can_be_inbred value must be "True", "False", or empty.')
        if not self._node_data["years_before_present"].apply(lambda x: x.isnumeric() or x == "").all():
            raise ValueError("years_before_present value must be integer or empty.")

    def _process_node_data(self) -> None:
        """
        Process node data input.
        """
        # Reorder node data columns and remove unnecessary columns
        self._node_data = self._node_data[
            ["id", "sex", "y_haplogroup", "mt_haplogroup", "can_have_children", "can_be_inbred", "years_before_present"]
        ]
        # Convert "can_have_children" and "can_be_inbred" columns to booleans
        self._node_data["can_have_children"] = self._node_data["can_have_children"].map(
            {"False": False, "True": True, "": True}
        )
        self._node_data["can_be_inbred"] = self._node_data["can_be_inbred"].map(
            {"False": False, "True": True, "": True}
        )
        # Convert "years_before_present" column to floats
        self._node_data["years_before_present"] = pd.to_numeric(
            self._node_data["years_before_present"], errors="coerce"
        )

    def _validate_relation_data(self, relations_path: Path | str) -> None:
        """
        Validate relation data input.
        """
        self._relation_data = pd.read_csv(relations_path, dtype=str, comment="#", keep_default_na=False)
        for column_name in ["id1", "id2", "degree", "constraints"]:
            if column_name not in self._relation_data.columns:
                raise ValueError(f'Column "{column_name}" not found in input relation data.')

        for optional_column in ["force_constraints"]:
            if optional_column not in self._relation_data.columns:
                self._relation_data[optional_column] = ""

        excess_relation_nodes = set(self._relation_data["id1"]).union(set(self._relation_data["id2"])) - set(
            self._node_data["id"]
        )
        if excess_relation_nodes:
            raise ValueError(f"All node IDs in relation data must be present in node data: {excess_relation_nodes}.")

        if not self._relation_data["degree"].isin(["1", "2", "3"]).all():
            raise ValueError("Degree must be 1, 2, or 3.")
        if not self._relation_data["force_constraints"].isin(["True", "False", ""]).all():
            raise ValueError('force_constraints value must be "True", "False", or empty.')

        self._relation_data["pair_degree"] = self._relation_data.apply(
            lambda row: tuple(sorted([row["id1"], row["id2"], row["degree"]])), axis=1
        )
        grouped_relations = self._relation_data.groupby("pair_degree")
        # Check for groups with multiple non-empty constraints, which can lead to issues when counting inconsistencies
        invalid_groups = grouped_relations.filter(lambda group: (group["constraints"] != "").sum() > 1)
        if not invalid_groups.empty:
            raise ValueError("Node pairs cannot have multiple non-empty constraints of the same degree.")
        self._relation_data.drop("pair_degree", axis=1, inplace=True)

        allowed_constraints = self.get_allowed_constraints()

        def split_and_validate_constraints(constraints: str) -> None:
            if constraints:
                constraints_list = constraints.split(";")
                if any(c not in allowed_constraints for c in constraints_list):
                    raise ValueError(
                        f"Invalid constraints found: {[c for c in constraints_list if c not in allowed_constraints]}."
                    )

        self._relation_data["constraints"].apply(split_and_validate_constraints)

    def _process_relation_data(self) -> None:
        """
        Process relation data input.
        """
        # Reorder relation data columns and remove unnecessary columns
        self._relation_data = self._relation_data[["id1", "id2", "degree", "constraints", "force_constraints"]]
        # Convert "force_constraints" column to booleans
        self._relation_data["force_constraints"] = self._relation_data["force_constraints"].map(
            {"False": False, "True": True, "": False}
        )

        def sort_nodes(row: pd.Series) -> pd.Series:
            """
            Ensure id1 and id2 are in a fixed (sorted) order and flip constraints as needed.
            """
            # Map constraints to their flipped value
            flipped_constraints = {
                "parent-child": "child-parent",
                "child-parent": "parent-child",
                "maternal aunt/uncle-nephew/niece": "maternal nephew/niece-aunt/uncle",
                "paternal aunt/uncle-nephew/niece": "paternal nephew/niece-aunt/uncle",
                "maternal nephew/niece-aunt/uncle": "maternal aunt/uncle-nephew/niece",
                "paternal nephew/niece-aunt/uncle": "paternal aunt/uncle-nephew/niece",
                "maternal grandparent-grandchild": "maternal grandchild-grandparent",
                "paternal grandparent-grandchild": "paternal grandchild-grandparent",
                "maternal grandchild-grandparent": "maternal grandparent-grandchild",
                "paternal grandchild-grandparent": "paternal grandparent-grandchild",
                "siblings": "siblings",  # Symmetric
                "maternal half-siblings": "maternal half-siblings",  # Symmetric
                "paternal half-siblings": "paternal half-siblings",  # Symmetric
                "double cousins": "double cousins",  # Symmetric
            }
            if row["id2"] < row["id1"]:
                constraints = row["constraints"]
                # Split constraints and map each to its flipped value
                if constraints:
                    constraints_list = [c.strip() for c in constraints.split(";")]
                    flipped = [flipped_constraints[c] for c in constraints_list]
                    relation_flipped_constraints = ";".join(flipped)
                else:
                    relation_flipped_constraints = ""
                # Swap id1 and id2, and flip constraints
                return pd.Series(
                    {
                        "id1": row["id2"],
                        "id2": row["id1"],
                        "degree": row["degree"],
                        "constraints": relation_flipped_constraints,
                        "force_constraints": row["force_constraints"],
                    }
                )
            else:
                return row

        self._relation_data = self._relation_data.apply(sort_nodes, axis=1)

        # Note: We don't use maternal/paternal 3rd-degree relations because those are not well-defined
        self._DEFAULT_CONSTRAINTS = {
            "1": "parent-child;child-parent;siblings",
            "2": (
                "maternal aunt/uncle-nephew/niece;"
                "maternal nephew/niece-aunt/uncle;"
                "paternal aunt/uncle-nephew/niece;"
                "paternal nephew/niece-aunt/uncle;"
                "maternal grandparent-grandchild;"
                "maternal grandchild-grandparent;"
                "paternal grandparent-grandchild;"
                "paternal grandchild-grandparent;"
                "maternal half-siblings;"
                "paternal half-siblings;"
                "double cousins"
            ),
            "3": (
                "half aunt/uncle-half nephew/niece;"
                "half nephew/niece-half aunt/uncle;"
                "greatgrandparent-greatgrandchild;"
                "greatgrandchild-greatgrandparent;"
                "grandaunt/granduncle-grandnephew/grandniece;"
                "grandnephew/grandniece-grandaunt/granduncle;"
                "first cousins"
            ),
        }

        def fill_constraints(row: pd.Series) -> pd.Series:
            if not row["constraints"]:
                constraints = self._DEFAULT_CONSTRAINTS[row["degree"]]
                return pd.Series(
                    {
                        "id1": row["id1"],
                        "id2": row["id2"],
                        "degree": row["degree"],
                        "constraints": constraints,
                        "force_constraints": row["force_constraints"],
                    }
                )
            return row

        self._relation_data = self._relation_data.apply(fill_constraints, axis=1)
        self._set_relation_processing_order()
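
    # Worked example of the canonicalization above (hypothetical IDs): an input
    # row ("B", "A", "1", "parent-child") is rewritten by sort_nodes as
    # ("A", "B", "1", "child-parent"), so each pair is stored in sorted ID order
    # with directional constraints flipped to match.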

    def _set_relation_processing_order(self) -> None:
        """
        Sort relations within each degree so nodes with more kinship relations are processed first.
        """
        node_relation_counts = pd.concat([self._relation_data["id1"], self._relation_data["id2"]]).value_counts()

        def prioritize(relations: pd.DataFrame) -> pd.DataFrame:
            node1_counts = relations["id1"].map(node_relation_counts)
            node2_counts = relations["id2"].map(node_relation_counts)
            prioritized = relations.assign(
                max_node_degree=pd.concat([node1_counts, node2_counts], axis=1).max(axis=1),
                total_node_degree=node1_counts + node2_counts,
            )
            prioritized = prioritized.sort_values(
                by=["max_node_degree", "total_node_degree"],
                ascending=[False, False],
            )
            return prioritized.drop(columns=["max_node_degree", "total_node_degree"]).reset_index(drop=True)

        self._first_degree_relations = prioritize(self._relation_data[self._relation_data["degree"] == "1"])
        self._second_degree_relations = prioritize(self._relation_data[self._relation_data["degree"] == "2"])
        self._third_degree_relations = prioritize(self._relation_data[self._relation_data["degree"] == "3"])
        self._first_and_second_degree_relations = pd.concat(
            [self._first_degree_relations, self._second_degree_relations]
        ).reset_index(drop=True)
        self._all_relations = pd.concat(
            [self._first_degree_relations, self._second_degree_relations, self._third_degree_relations]
        ).reset_index(drop=True)

    def _validate_arguments(self) -> None:
        """
        Validate constructor arguments.
        """
        if not isinstance(self._max_candidate_pedigrees, int) or self._max_candidate_pedigrees <= 0:
            raise ValueError("max_candidate_pedigrees must be a positive integer.")
        if not (0 <= self._epsilon <= 1):
            raise ValueError("epsilon must be between 0 and 1.")

    def _shuffle_relations(self) -> None:
        """
        Shuffle relation DataFrames (when we want to restart the algorithm).
        """
        self._first_degree_relations = self._first_degree_relations.sample(
            frac=1, random_state=self._rng.randint(0, 1_000_000)
        ).reset_index(drop=True)
        self._second_degree_relations = self._second_degree_relations.sample(
            frac=1, random_state=self._rng.randint(0, 1_000_000)
        ).reset_index(drop=True)
        self._third_degree_relations = self._third_degree_relations.sample(
            frac=1, random_state=self._rng.randint(0, 1_000_000)
        ).reset_index(drop=True)
        self._first_and_second_degree_relations = pd.concat(
            [self._first_degree_relations, self._second_degree_relations]
        ).reset_index(drop=True)
        self._all_relations = pd.concat(
            [self._first_degree_relations, self._second_degree_relations, self._third_degree_relations]
        ).reset_index(drop=True)

    def _get_initial_pedigree(self) -> Pedigree:
        """
        Create the initial pedigree and add all nodes.
        """
        initial_pedigree = Pedigree()
        for (
            node_id,
            sex,
            y_haplogroup,
            mt_haplogroup,
            can_have_children,
            can_be_inbred,
            years_before_present,
        ) in self._node_data.itertuples(index=False):
            initial_pedigree.add_node(
                node_id, sex, y_haplogroup, mt_haplogroup, can_have_children, can_be_inbred, years_before_present
            )
        return initial_pedigree

    def find_best_pedigree(self) -> Pedigree:
        """
        Finds the configuration of relations that yields the "best" pedigree.
        Writes to outputs_dir the set of relations with the fewest changes from the original input data.
        """
        for _ in range(self._MAX_RUNS):
            progress_bar = tqdm(
                self._first_and_second_degree_relations.iterrows(),
                total=self._first_and_second_degree_relations.shape[0],
                smoothing=0.5,
                bar_format="{l_bar}{bar} | {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
            )
            for idx, row in progress_bar:
                node1, node2, degree, constraints, force_constraints = row
                logger.info(f"Current relation: {node1}, {node2}, {degree}")
                progress_bar.set_description(f"Processing relation {{{node1}, {node2}, {degree}}}")
                self._add_relation(
                    node1, node2, degree=degree, constraints=constraints, force_constraints=force_constraints
                )
                self._clean_pedigree_data()
                self._validate_pedigree_structures()

                processed_relations = self._all_relations.iloc[: idx + 1]
                pair_to_relations_so_far = self._get_pair_to_relations_so_far(processed_relations)
                if degree == "1" and len(processed_relations) < len(self._first_and_second_degree_relations):
                    # Don't check for extraneous half-sibling relations because
                    # the 2 non-shared parents might be "merged" later
                    self._prune_pedigrees(pair_to_relations_so_far, check_half_siblings=False)
                else:
                    self._prune_pedigrees(pair_to_relations_so_far, check_half_siblings=True)
                logger.info(
                    f"Remaining pedigrees after pruning: {len(self._candidate_pedigrees)}"
                    "\t\tElapsed: "
                    f"{round(time.time() - self._start_time, 1)} s\n"
                )

            if not self._final_pedigrees:
                logger.warning("No valid pedigree found. Shuffling relations and restarting algorithm.\n")
                self._candidate_pedigrees = [self._get_initial_pedigree()]
                self._shuffle_relations()
            else:
                break

        if not self._final_pedigrees:
            logger.error(f"No valid pedigree found after {self._MAX_RUNS} runs. Exiting.")
            raise RuntimeError(f"No valid pedigree found after {self._MAX_RUNS} runs.")

        self._clean_pedigree_data()
        # Plot and write outputs of sample pedigree
        sample_idx = self._rng.randint(0, len(self._final_pedigrees) - 1)
        self._sample_pedigree = self._final_pedigrees[sample_idx]
        self._sample_strike_count = self._final_strike_counts[sample_idx]
        self._sample_strike_log = self._final_strike_logs[sample_idx]
        self._sample_third_degree_strike_count = self._sample_pedigree.count_third_degree_inconsistencies(
            self._pair_to_constraints
        )
        logger.info(
            "Final pedigree strike counts — 1st/2nd degree: %s, 3rd degree: %s",
            self._sample_strike_count,
            self._sample_third_degree_strike_count,
        )
        self._write_corrected_input_relations(
            self._sample_strike_count,
            self._sample_strike_log,
            self._outputs_dir / "corrected_input_relations.csv",
        )
        self._sample_pedigree.write_exact_relations(self._outputs_dir / "reconstructed_exact_relations.csv")
        if self._plot:
            try:
                self._sample_pedigree.plot(
                    self._outputs_dir / "reconstructed_pedigree.pdf", plot_haplogroups=self._plot_haplogroups
                )
                pygraphviz_found = True
            except ImportError:
                logger.warning(
                    "PyGraphviz (https://pygraphviz.github.io/) must be installed to plot pedigrees. "
                    "Skipping plotting of reconstructed pedigree(s)."
                )
                pygraphviz_found = False

        # Plot and write outputs of alternate pedigrees
        if self._write_alternate_pedigrees:
            alternate_dir = self._outputs_dir / "alternate_pedigrees"
            alternate_dir.mkdir(parents=True, exist_ok=True)
            for idx, (pedigree, strike_count, strike_log) in enumerate(
                zip(self._final_pedigrees, self._final_strike_counts, self._final_strike_logs, strict=True)
            ):
                # Skip sample pedigree since it is already written
                if idx == sample_idx:
                    continue

                self._write_corrected_input_relations(
                    strike_count,
                    strike_log,
                    alternate_dir / f"pedigree_{idx}_corrected_input_relations.csv",
                )
                pedigree.write_exact_relations(alternate_dir / f"pedigree_{idx}_exact_relations.csv")
                if self._plot and pygraphviz_found:
                    pedigree.plot(alternate_dir / f"pedigree_{idx}.png", plot_haplogroups=self._plot_haplogroups)
            self._write_constant_relations(alternate_dir / "constant_relations.csv")

        return self._sample_pedigree

    @staticmethod
    def _check_haplogroups(haplogroup1: str, haplogroup2: str) -> bool:
        """
        Checks if two haplogroups are compatible. Same semantics as pedigree.validate_haplogroups().
        "*" is a wildcard character.
        """
        if not haplogroup1 or not haplogroup2:  # empty OK
            return True
        haplogroup1_stripped, haplogroup2_stripped = haplogroup1.rstrip("*"), haplogroup2.rstrip("*")
        return haplogroup1_stripped.startswith(haplogroup2_stripped) or haplogroup2_stripped.startswith(
            haplogroup1_stripped
        )
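
    # Worked example (hypothetical haplogroup labels): "R1b" and "R1b1a" are
    # compatible because one is a prefix of the other, while "R1a" and "R1b" are
    # not. A trailing "*" is stripped before comparison, so "R1b*" is compatible
    # with anything beginning with "R1b". An empty haplogroup is treated as
    # unknown and is always compatible.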

    @staticmethod
    def _check_parent_child_haplogroups(pedigree: Pedigree, parent: str, child: str) -> bool:
        """
        Checks if the haplogroups of a parent and child are compatible.
        """
        if pedigree.get_data(parent)["sex"] == "M" and pedigree.get_data(child)["sex"] == "M":
            return PedigreeReconstructor._check_haplogroups(
                pedigree.get_data(parent)["y_haplogroup"], pedigree.get_data(child)["y_haplogroup"]
            )
        if pedigree.get_data(parent)["sex"] == "F":
            return PedigreeReconstructor._check_haplogroups(
                pedigree.get_data(parent)["mt_haplogroup"], pedigree.get_data(child)["mt_haplogroup"]
            )
        return True

    @staticmethod
    def _check_sibling_haplogroups(pedigree: Pedigree, sibling1: str, sibling2: str) -> bool:
        """
        Checks if the haplogroups of two full siblings are compatible.
        """
        if pedigree.get_data(sibling1)["sex"] == "M" and pedigree.get_data(sibling2)["sex"] == "M":
            # MT haplogroups still need to agree as well
            if not PedigreeReconstructor._check_haplogroups(
                pedigree.get_data(sibling1)["y_haplogroup"], pedigree.get_data(sibling2)["y_haplogroup"]
            ):
                return False
        # All full siblings should share MT haplogroups
        return PedigreeReconstructor._check_haplogroups(
            pedigree.get_data(sibling1)["mt_haplogroup"], pedigree.get_data(sibling2)["mt_haplogroup"]
        )

    @staticmethod
    def _check_aunt_uncle_nephew_niece_haplogroups(
        pedigree: Pedigree, aunt_uncle: str, nephew_niece: str, shared_relative_sex: str | None
    ) -> bool:
        """
        Checks if the haplogroups of an aunt/uncle and nephew/niece are compatible.
        """
        if not shared_relative_sex:
            return True

        if (
            shared_relative_sex == "M"
            and pedigree.get_data(aunt_uncle)["sex"] == "M"
            and pedigree.get_data(nephew_niece)["sex"] == "M"
        ):
            return PedigreeReconstructor._check_haplogroups(
                pedigree.get_data(aunt_uncle)["y_haplogroup"], pedigree.get_data(nephew_niece)["y_haplogroup"]
            )
        if shared_relative_sex == "F":
            return PedigreeReconstructor._check_haplogroups(
                pedigree.get_data(aunt_uncle)["mt_haplogroup"], pedigree.get_data(nephew_niece)["mt_haplogroup"]
            )
        return True

    @staticmethod
    def _check_grandparent_grandchild_haplogroups(
        pedigree: Pedigree, grandparent: str, grandchild: str, shared_relative_sex: str | None
    ) -> bool:
        """
        Checks if the haplogroups of a grandparent and grandchild are compatible.
        """
        if not shared_relative_sex:
            return True

        if (
            shared_relative_sex == "M"
            and pedigree.get_data(grandparent)["sex"] == "M"
            and pedigree.get_data(grandchild)["sex"] == "M"
        ):
            return PedigreeReconstructor._check_haplogroups(
                pedigree.get_data(grandparent)["y_haplogroup"], pedigree.get_data(grandchild)["y_haplogroup"]
            )
        if shared_relative_sex == "F" and pedigree.get_data(grandparent)["sex"] == "F":
            return PedigreeReconstructor._check_haplogroups(
                pedigree.get_data(grandparent)["mt_haplogroup"], pedigree.get_data(grandchild)["mt_haplogroup"]
            )
        return True

    @staticmethod
    def _check_half_sibling_haplogroups(
        pedigree: Pedigree, half_sibling1: str, half_sibling2: str, shared_relative_sex: str | None
    ) -> bool:
        """
        Checks if the haplogroups of two half-siblings are compatible.
        """
        if (
            shared_relative_sex == "M"
            and pedigree.get_data(half_sibling1)["sex"] == "M"
            and pedigree.get_data(half_sibling2)["sex"] == "M"
        ):
            return PedigreeReconstructor._check_haplogroups(
                pedigree.get_data(half_sibling1)["y_haplogroup"], pedigree.get_data(half_sibling2)["y_haplogroup"]
            )
        if shared_relative_sex == "F":
            return PedigreeReconstructor._check_haplogroups(
                pedigree.get_data(half_sibling1)["mt_haplogroup"], pedigree.get_data(half_sibling2)["mt_haplogroup"]
            )
        return True

    def _add_relation(self, node1: str, node2: str, degree: str, constraints: str, force_constraints: bool) -> None:
        """
        Connects two nodes in every pedigree.
        """
        assert degree in ["1", "2"]

        new_pedigrees: list[Pedigree] = []
        for pedigree in self._candidate_pedigrees:
            if degree == "1":
                if not force_constraints:
                    new_pedigrees.extend(
                        PedigreeReconstructor._connect_first_degree_relation(
                            pedigree, node1, node2, constraints=self._DEFAULT_CONSTRAINTS["1"]
                        )
                    )
                    new_pedigrees.extend(
                        PedigreeReconstructor._connect_second_degree_relation(
                            pedigree, node1, node2, constraints=self._DEFAULT_CONSTRAINTS["2"]
                        )
                    )
                else:
                    new_pedigrees.extend(
                        PedigreeReconstructor._connect_first_degree_relation(
                            pedigree, node1, node2, constraints=constraints
                        )
                    )

            elif degree == "2":
                if not force_constraints:
                    new_pedigrees.append(pedigree)  # No relation (i.e. false positive)
                    new_pedigrees.extend(
                        PedigreeReconstructor._connect_first_degree_relation(
                            pedigree, node1, node2, constraints=self._DEFAULT_CONSTRAINTS["1"]
                        )
                    )
                    new_pedigrees.extend(
                        PedigreeReconstructor._connect_second_degree_relation(
                            pedigree, node1, node2, constraints=self._DEFAULT_CONSTRAINTS["2"]
                        )
                    )
                else:
                    new_pedigrees.extend(
                        PedigreeReconstructor._connect_second_degree_relation(
                            pedigree, node1, node2, constraints=constraints
                        )
                    )
        self._candidate_pedigrees = new_pedigrees

    @staticmethod
    def _connect_first_degree_relation(pedigree: Pedigree, node1: str, node2: str, constraints: str) -> list[Pedigree]:
        """
        Update pedigree with a first-degree relation.
        """
        assert node1 in pedigree.node_to_data and node2 in pedigree.node_to_data

        new_pedigrees: list[Pedigree] = []
        possible_relations: list[str] = constraints.split(";")

        for relation in possible_relations:
            if relation == "parent-child":
                new_pedigrees.extend(PedigreeReconstructor._connect_parent_relation(pedigree, node1, node2))
            if relation == "child-parent":
                new_pedigrees.extend(PedigreeReconstructor._connect_parent_relation(pedigree, node2, node1))
            if relation == "siblings":
                new_pedigrees.extend(PedigreeReconstructor._connect_sibling_relation(pedigree, node1, node2))
        return new_pedigrees

    @staticmethod
    def _connect_second_degree_relation(pedigree: Pedigree, node1: str, node2: str, constraints: str) -> list[Pedigree]:
        """
        Update pedigree with a second-degree relation.
        """
        assert node1 in pedigree.node_to_data and node2 in pedigree.node_to_data

        new_pedigrees: list[Pedigree] = []
        possible_relations: list[str] = constraints.split(";")

        for relation in possible_relations:
            if relation == "maternal aunt/uncle-nephew/niece":
                new_pedigrees.extend(
                    PedigreeReconstructor._connect_aunt_uncle_relation(pedigree, node1, node2, shared_relative_sex="F")
                )
            if relation == "maternal nephew/niece-aunt/uncle":
                new_pedigrees.extend(
                    PedigreeReconstructor._connect_aunt_uncle_relation(pedigree, node2, node1, shared_relative_sex="F")
                )
            if relation == "paternal aunt/uncle-nephew/niece":
                new_pedigrees.extend(
                    PedigreeReconstructor._connect_aunt_uncle_relation(pedigree, node1, node2, shared_relative_sex="M")
                )
            if relation == "paternal nephew/niece-aunt/uncle":
                new_pedigrees.extend(
                    PedigreeReconstructor._connect_aunt_uncle_relation(pedigree, node2, node1, shared_relative_sex="M")
                )

            if relation == "maternal grandparent-grandchild":
                new_pedigrees.extend(
                    PedigreeReconstructor._connect_grandparent_relation(pedigree, node1, node2, shared_relative_sex="F")
                )
            if relation == "maternal grandchild-grandparent":
                new_pedigrees.extend(
                    PedigreeReconstructor._connect_grandparent_relation(pedigree, node2, node1, shared_relative_sex="F")
                )
            if relation == "paternal grandparent-grandchild":
                new_pedigrees.extend(
                    PedigreeReconstructor._connect_grandparent_relation(pedigree, node1, node2, shared_relative_sex="M")
                )
            if relation == "paternal grandchild-grandparent":
                new_pedigrees.extend(
                    PedigreeReconstructor._connect_grandparent_relation(pedigree, node2, node1, shared_relative_sex="M")
                )

            if relation == "maternal half-siblings":
                new_pedigrees.extend(
                    PedigreeReconstructor._connect_half_sibling_relation(
                        pedigree, node1, node2, shared_relative_sex="F"
                    )
                )
            if relation == "paternal half-siblings":
                new_pedigrees.extend(
                    PedigreeReconstructor._connect_half_sibling_relation(
                        pedigree, node1, node2, shared_relative_sex="M"
                    )
                )
            if relation == "double cousins":
                new_pedigrees.extend(PedigreeReconstructor._connect_double_cousin_relation(pedigree, node1, node2))
        return new_pedigrees

    @staticmethod
    def _connect_parent_relation(pedigree: Pedigree, node1: str, node2: str) -> list[Pedigree]:
        """
        Adds a parent-child relation and merges nodes appropriately.
        Returns a list containing the resulting Pedigree, if successful.
        """
        assert node1 in pedigree.node_to_data and node2 in pedigree.node_to_data

        # Pre-check invalid relations to avoid unnecessary deep-copying
        if not PedigreeReconstructor._check_parent_child_haplogroups(pedigree, node1, node2):
            return []

        ret: list[Pedigree] = []
        new_pedigree = copy.deepcopy(pedigree)
        new_pedigree.fill_node_parents(node2)
        original_parent: str
        if new_pedigree.get_data(node1)["sex"] == "M":
            original_parent = new_pedigree.get_father(node2)
        else:
            original_parent = new_pedigree.get_mother(node2)

        if new_pedigree.check_valid_merge(node1, original_parent):
            if new_pedigree.merge_nodes(node1, original_parent):
                ret.append(new_pedigree)
        return ret

    @staticmethod
    def _connect_sibling_relation(pedigree: Pedigree, node1: str, node2: str) -> list[Pedigree]:
        """
        Adds a sibling relation and merges nodes appropriately.
        Returns a list containing the resulting Pedigree, if successful.
        """
        assert node1 in pedigree.node_to_data and node2 in pedigree.node_to_data

        # Pre-check invalid relations to avoid unnecessary deep-copying
        if not PedigreeReconstructor._check_sibling_haplogroups(pedigree, node1, node2):
            return []

        ret: list[Pedigree] = []
        new_pedigree = copy.deepcopy(pedigree)
        new_pedigree.fill_node_parents(node1)
        new_pedigree.fill_node_parents(node2)

        father1 = new_pedigree.get_father(node1)
        father2 = new_pedigree.get_father(node2)
        if new_pedigree.check_valid_merge(father1, father2):
            if new_pedigree.merge_nodes(father1, father2):
                mother1 = new_pedigree.get_mother(node1)
                mother2 = new_pedigree.get_mother(node2)
                if new_pedigree.check_valid_merge(mother1, mother2):
                    if new_pedigree.merge_nodes(mother1, mother2):
                        new_pedigree.add_sibling_relation(node1, node2)
                        ret.append(new_pedigree)
        return ret

    @staticmethod
    def _connect_aunt_uncle_relation(
        pedigree: Pedigree, node1: str, node2: str, shared_relative_sex: str | None = None
    ) -> list[Pedigree]:
        """
        Adds an aunt/uncle-nephew/niece relation and merges nodes appropriately.
        Returns a list containing the resulting Pedigree(s), if successful.
        """
        assert node1 in pedigree.node_to_data and node2 in pedigree.node_to_data
        assert shared_relative_sex in ["M", "F", None]

        # Pre-check invalid relations to avoid unnecessary deep-copying
        if not PedigreeReconstructor._check_aunt_uncle_nephew_niece_haplogroups(
            pedigree, node1, node2, shared_relative_sex
        ):
            return []

        ret: list[Pedigree] = []
        new_pedigree = copy.deepcopy(pedigree)
        new_pedigree.fill_node_parents(node2)

        node2_parents: list[str]
        if shared_relative_sex == "M":
            node2_parents = [new_pedigree.get_father(node2)]
        elif shared_relative_sex == "F":
            node2_parents = [new_pedigree.get_mother(node2)]
        else:
            node2_parents = [new_pedigree.get_father(node2), new_pedigree.get_mother(node2)]

        for node2_parent in node2_parents:
            if node1 != node2_parent:
                ret.extend(PedigreeReconstructor._connect_sibling_relation(new_pedigree, node1, node2_parent))
        return ret

    @staticmethod
    def _connect_grandparent_relation(
        pedigree: Pedigree, node1: str, node2: str, shared_relative_sex: str | None = None
    ) -> list[Pedigree]:
        """
        Adds a grandparent-grandchild relation and merges nodes appropriately.
        Returns a list containing the resulting Pedigree(s), if successful.
        """
        assert node1 in pedigree.node_to_data and node2 in pedigree.node_to_data
        assert shared_relative_sex in ["M", "F", None]

        # Pre-check invalid relations to avoid unnecessary deep-copying
        if not PedigreeReconstructor._check_grandparent_grandchild_haplogroups(
            pedigree, node1, node2, shared_relative_sex
        ):
            return []

        ret: list[Pedigree] = []
        new_pedigree = copy.deepcopy(pedigree)
        new_pedigree.fill_node_parents(node2)

        node2_parents: list[str]
        if shared_relative_sex == "M":
            node2_parents = [new_pedigree.get_father(node2)]
        elif shared_relative_sex == "F":
            node2_parents = [new_pedigree.get_mother(node2)]
        else:
            node2_parents = [new_pedigree.get_father(node2), new_pedigree.get_mother(node2)]

        for node2_parent in node2_parents:
            if node1 != node2_parent:
                ret.extend(PedigreeReconstructor._connect_parent_relation(new_pedigree, node1, node2_parent))
        return ret

    @staticmethod
    def _connect_half_sibling_relation(
        pedigree: Pedigree, node1: str, node2: str, shared_relative_sex: str | None = None
    ) -> list[Pedigree]:
        """
        Adds a half-sibling relation and merges nodes appropriately.
        Returns a list containing the resulting Pedigree(s), if successful.
        """
        assert node1 in pedigree.node_to_data and node2 in pedigree.node_to_data

        # Pre-check invalid relations to avoid unnecessary deep-copying
        if not PedigreeReconstructor._check_half_sibling_haplogroups(pedigree, node1, node2, shared_relative_sex):
            return []

        ret: list[Pedigree] = []
        new_pedigree = copy.deepcopy(pedigree)
        new_pedigree.fill_node_parents(node1)
        new_pedigree.fill_node_parents(node2)

        node1_parents: list[str]
        node2_parents: list[str]
        if shared_relative_sex == "M":
            node1_parents = [new_pedigree.get_father(node1)]
            node2_parents = [new_pedigree.get_father(node2)]
        elif shared_relative_sex == "F":
            node1_parents = [new_pedigree.get_mother(node1)]
            node2_parents = [new_pedigree.get_mother(node2)]
        else:
            node1_parents = [new_pedigree.get_father(node1), new_pedigree.get_mother(node1)]
            node2_parents = [new_pedigree.get_father(node2), new_pedigree.get_mother(node2)]

        # Node 1 and Node 2 are half-siblings via one of Node 1's parents
        for node1_parent in node1_parents:
            if node1_parent != node2:
                ret.extend(PedigreeReconstructor._connect_parent_relation(new_pedigree, node1_parent, node2))
        # Node 1 and Node 2 are half-siblings via one of Node 2's parents
        for node2_parent in node2_parents:
            if node2_parent != node1:
                ret.extend(PedigreeReconstructor._connect_parent_relation(new_pedigree, node2_parent, node1))
        return ret

    @staticmethod
    def _connect_double_cousin_relation(
        pedigree: Pedigree, node1: str, node2: str, same_sex_siblings: bool | None = None
    ) -> list[Pedigree]:
        """
        Adds a double (first) cousin relation and merges nodes appropriately.
        Returns a list containing the resulting Pedigree(s), if successful.
        """
        assert node1 in pedigree.node_to_data and node2 in pedigree.node_to_data

        ret: list[Pedigree] = []
        new_pedigree = copy.deepcopy(pedigree)
        new_pedigree.fill_node_parents(node1)
        new_pedigree.fill_node_parents(node2)

        if same_sex_siblings is None or same_sex_siblings:
            father1 = new_pedigree.get_father(node1)
            father2 = new_pedigree.get_father(node2)
            if father1 != father2:
                temp_same_sex_pedigrees = PedigreeReconstructor._connect_sibling_relation(
                    new_pedigree, father1, father2
                )
                for same_sex_pedigree in temp_same_sex_pedigrees:
                    # Get parents again in case they changed during previous merge
                    mother1 = same_sex_pedigree.get_mother(node1)
                    mother2 = same_sex_pedigree.get_mother(node2)
                    if mother1 != mother2:
                        ret.extend(PedigreeReconstructor._connect_sibling_relation(same_sex_pedigree, mother1, mother2))

        if same_sex_siblings is None or not same_sex_siblings:
            father1 = new_pedigree.get_father(node1)
            mother2 = new_pedigree.get_mother(node2)
            temp_opposite_sex_pedigrees = PedigreeReconstructor._connect_sibling_relation(
                new_pedigree, father1, mother2
            )
            for opposite_sex_pedigree in temp_opposite_sex_pedigrees:
                # Get parents again in case they changed during previous merge
                mother1 = opposite_sex_pedigree.get_mother(node1)
                father2 = opposite_sex_pedigree.get_father(node2)
                ret.extend(PedigreeReconstructor._connect_sibling_relation(opposite_sex_pedigree, mother1, father2))
        return ret
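
    # Illustration (an interpretive note, not from the package docs): in the
    # "same-sex" branch the two fathers are merged into siblings and then the two
    # mothers, whereas the "opposite-sex" branch pairs node1's father with
    # node2's mother and node1's mother with node2's father, covering both ways
    # two sibling pairs can produce double first cousins.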

    def _clean_pedigree_data(self) -> None:
        """
        Remove unnecessary entries in Pedigree dicts.
        """
        for pedigree in self._candidate_pedigrees:
            pedigree.clean_data()

        for pedigree in self._final_pedigrees:
            pedigree.clean_data()

    def _validate_pedigree_structures(self) -> None:
        """
        Validate that all candidate pedigrees are consistent.
        """
        for pedigree in self._candidate_pedigrees:
            assert pedigree.validate_structure()

    def _get_pair_to_constraints(self) -> defaultdict[tuple[str, str], list[tuple[str, ...]]]:
        """
        Turn DataFrame of relations/constraints into dict(s) of {node pairs: list of possible relations}.
        Dict values are lists of tuples (as opposed to just tuples)
        because a pair of nodes can share more than 1 relation.
        """
        pair_to_constraints: defaultdict[tuple[str, str], list[tuple[str, ...]]] = defaultdict(list)
        for node1, node2, _, constraints, _ in self._all_relations.itertuples(index=False):
            pair_to_constraints[(node1, node2)].append(tuple(constraints.split(";")))
        for node_pair in pair_to_constraints:
            # Sort by number of constraints so specific constraints are checked first when pruning
            pair_to_constraints[node_pair].sort(key=len)
        return pair_to_constraints
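
    # Shape example (hypothetical IDs): a "siblings"-constrained relation plus an
    # unconstrained 2nd-degree relation between SampleA and SampleB would yield
    #     {("SampleA", "SampleB"): [("siblings",), ("maternal aunt/uncle-nephew/niece", ...)]}
    # where the unconstrained entry holds the full default 2nd-degree tuple and
    # the shorter, more specific tuple sorts first.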

    def _get_pair_to_relations_so_far(
        self, processed_relations: pd.DataFrame
    ) -> defaultdict[tuple[str, str], list[tuple[str, str, bool]]]:
        """
        Turn DataFrame of relations/constraints processed so far
        into dict(s) of {node pairs: list of (degree, constraints, force_constraints) tuples}.
        """
        pair_to_relations_so_far: defaultdict[tuple[str, str], list[tuple[str, str, bool]]] = defaultdict(list)
        for node1, node2, degree, constraints, force_constraints in processed_relations.itertuples(index=False):
            pair_to_relations_so_far[(node1, node2)].append((degree, constraints, force_constraints))
        return pair_to_relations_so_far

    def _prune_pedigrees(
        self,
        pair_to_relations_so_far: defaultdict[tuple[str, str], list[tuple[str, str, bool]]],
        check_half_siblings: bool,
    ) -> None:
        """
        Remove pedigrees with inconsistencies.
        """
        seen_topologies = set()
        new_potential_pedigrees = []
        for pedigree in self._candidate_pedigrees:
            if (
                pedigree.validate_members(set(self._node_data["id"]))
                and pedigree.validate_can_have_children()
                and pedigree.validate_inbreeding()
                and pedigree.validate_years_before_present()
                and pedigree.validate_forced_constraints(pair_to_relations_so_far)
            ):
                pedigree.update_haplogroups()
                if pedigree.validate_haplogroups():
                    topology = pedigree.get_topo_sort()
                    if topology not in seen_topologies:
                        new_potential_pedigrees.append(pedigree)
                        seen_topologies.add(topology)
        # Shuffle to avoid ordering bias in epsilon-greedy sampling
        self._rng.shuffle(new_potential_pedigrees)

        strikes = []
        third_degree_strikes = []
        counts: defaultdict[int, int] = defaultdict(int)
        for pedigree in new_potential_pedigrees:
            num_strikes, _ = pedigree.count_inconsistencies(
                self._pair_to_constraints, pair_to_relations_so_far, check_half_siblings
            )
            num_third_degree_strikes = pedigree.count_third_degree_inconsistencies(self._pair_to_constraints)
            strikes.append(num_strikes)
            third_degree_strikes.append(num_third_degree_strikes)
            counts[num_strikes] += 1
        logger.info(f"Strike counts before pruning: {dict(sorted(counts.items()))}")

        def epsilon_greedy_sample(
            pedigrees: list[Pedigree],
            strikes: list[int],
            third_degree_strikes: list[int],
            epsilon: float,
            max_candidate_pedigrees: int,
        ) -> list[Pedigree]:
            assert len(pedigrees) == len(strikes)
            if len(pedigrees) <= max_candidate_pedigrees:
                return pedigrees

            sorted_pedigrees = [
                pedigree
                for pedigree, _, _ in sorted(
                    zip(pedigrees, strikes, third_degree_strikes, strict=True), key=lambda x: (x[1], x[2])
                )
            ]
            exploitation_max_candidate_pedigrees = int((1 - epsilon) * max_candidate_pedigrees)
            exploration_max_candidate_pedigrees = max_candidate_pedigrees - exploitation_max_candidate_pedigrees

            exploitation_pedigrees = sorted_pedigrees[:exploitation_max_candidate_pedigrees]
            exploration_pedigrees = self._rng.sample(
                sorted_pedigrees[exploitation_max_candidate_pedigrees:], exploration_max_candidate_pedigrees
            )
            return exploitation_pedigrees + exploration_pedigrees
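
        # Worked example: with the defaults epsilon=0.2 and
        # max_candidate_pedigrees=1000, the 800 lowest-strike pedigrees are kept
        # deterministically and 200 more are sampled uniformly from the rest, so
        # plausible-but-currently-worse topologies can survive early pruning.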

        num_processed_relations = sum(len(relations) for relations in pair_to_relations_so_far.values())
        if num_processed_relations < len(self._first_and_second_degree_relations):
            self._candidate_pedigrees = epsilon_greedy_sample(
                new_potential_pedigrees,
                strikes,
                third_degree_strikes,
                epsilon=self._epsilon,
                max_candidate_pedigrees=self._max_candidate_pedigrees,
            )
        else:
            # Final iteration
            best_pedigrees = [
                pedigree
                for pedigree, num_strikes in zip(new_potential_pedigrees, strikes, strict=True)
                if num_strikes == min(strikes)
            ]
            # Use 3rd-degree strikes as tiebreaker
            third_degree_strikes = [
                pedigree.count_third_degree_inconsistencies(self._pair_to_constraints) for pedigree in best_pedigrees
            ]

            self._final_pedigrees.extend(
                [
                    pedigree
                    for pedigree, num_strikes in zip(best_pedigrees, third_degree_strikes, strict=True)
                    if num_strikes == min(third_degree_strikes)
                ]
            )
            self._final_strike_counts = []
            self._final_strike_logs = []
            for pedigree in self._final_pedigrees:
                strike_count, strike_log = pedigree.count_inconsistencies(
                    self._pair_to_constraints, pair_to_relations_so_far, check_half_siblings=True
                )
                self._final_strike_counts.append(strike_count)
                self._final_strike_logs.append(strike_log)

    def _write_corrected_input_relations(
        self, strike_count: int, strike_log: list[tuple[str, str, str, str]], path: Path | str
    ) -> None:
        """
        Write corrected input relations to file. Includes information about added/removed/changed input relations.
        """
        path = Path(path)
        added_relations = []
        removed_relations = []
        for node1, node2, degree, constraints in strike_log:
            if degree[0] == "+":
                added_relations.append((node1, node2, degree[1], constraints))
            else:
                removed_relations.append((node1, node2, degree[1], constraints))
        removed_relations_set = set(removed_relations)

        # Separate out *changed* relations (added relation + removed relation pair, e.g., 1st-degree -> 2nd-degree)
        changed_node_pairs = set()
        for add_node1, add_node2, _, _ in added_relations:
            for remove_node1, remove_node2, _, _ in removed_relations:
                if (add_node1 == remove_node1 and add_node2 == remove_node2) or (
                    add_node2 == remove_node1 and add_node1 == remove_node2
                ):
                    # Removed pairs follow the input pair order and will be written to file in that order, so use those
                    # Then, we can sort changed_node_pairs on the tuple order that will actually be written to file
                    changed_node_pairs.add((remove_node1, remove_node2))

        with path.open("w") as file:
            file.write("id1,id2,degree,constraints\n")  # Header line
            file.write(f"# Final inconsistency count: {strike_count}\n")
            file.write(
                "# Note: 3rd-degree relations are not explicitly reconstructed and will not appear as modified here\n"
            )

            def write_relations_line(node1, node2, degree, constraints, commented=False):
                if constraints == self._DEFAULT_CONSTRAINTS[degree]:
                    # Don't write default constraints to file
                    constraints = ""
                if commented:
                    file.write("# ")
                file.write(f"{node1},{node2},{degree},{constraints}\n")

            file.write("# Added relations\n")
            # Sort for consistency
            for node1, node2, degree, constraints in sorted(added_relations):
                if (node1, node2) not in changed_node_pairs and (node2, node1) not in changed_node_pairs:
                    write_relations_line(node1, node2, degree, constraints)

            file.write("\n# Removed relations\n")
            for node1, node2, degree, constraints in sorted(removed_relations):
                if (node1, node2) not in changed_node_pairs and (node2, node1) not in changed_node_pairs:
                    write_relations_line(node1, node2, degree, constraints, commented=True)

            file.write("\n# Changed relations\n")
            # Pair up changed relations (add + remove)
            for node1, node2 in sorted(changed_node_pairs):
                # We want to write the two nodes in the correct (original) order
                node1_to_write = None
                node2_to_write = None
                for node1_remove, node2_remove, degree_remove, constraints_remove in removed_relations:
                    if (node1_remove, node2_remove) == (node1, node2) or (node2_remove, node1_remove) == (node1, node2):
                        write_relations_line(
                            node1_remove, node2_remove, degree_remove, constraints_remove, commented=True
                        )
                        # The removed nodes follow the original input order
                        node1_to_write = node1_remove
                        node2_to_write = node2_remove
                for node1_add, node2_add, degree_add, constraints_add in added_relations:
                    if (node1_add, node2_add) == (node1, node2) or (node2_add, node1_add) == (node1, node2):
                        assert node1_to_write and node2_to_write
                        write_relations_line(node1_to_write, node2_to_write, degree_add, constraints_add)

            file.write("\n# Unchanged relations\n")
            for node1, node2, degree, constraints, _ in self._all_relations.itertuples(index=False):
                if (node1, node2, degree, constraints) not in removed_relations_set:
                    assert (node2, node1, degree, constraints) not in removed_relations_set
                    write_relations_line(node1, node2, degree, constraints)

    def _write_constant_relations(self, path: Path | str) -> None:
        """
        Write relations that are identical across all final pedigrees.
        """
        path = Path(path)
        node_sets = [pedigree.get_non_placeholder_nodes() for pedigree in self._final_pedigrees]
        nodes = next(iter(node_sets))
        assert all(node_set == nodes for node_set in node_sets)

        nodes = sorted(nodes)
        with path.open("w") as file:
            file.write("# Constant kinship relations across all output pedigrees\n")
            file.write("node1,node2,relation\n")
            for i in range(len(nodes)):
                node1 = nodes[i]
                for j in range(i + 1, len(nodes)):
                    node2 = nodes[j]
                    shared_relations: dict[str, int] | None = None
                    for pedigree in self._final_pedigrees:
                        relations = pedigree.get_relations_between_nodes(node1, node2, include_maternal_paternal=True)
                        if shared_relations is None:
                            shared_relations = dict(relations)
                        else:
                            to_remove = [
                                relation
                                for relation, count in shared_relations.items()
                                if relations.get(relation, 0) != count
                            ]
                            for relation in to_remove:
                                del shared_relations[relation]
                            if not shared_relations:
                                break

                    if not shared_relations:
                        continue

                    for relation, count in shared_relations.items():
                        for _ in range(count):
                            file.write(f"{node1},{node2},{relation}\n")