repare 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1186 @@
1
+ import copy
2
+ import logging
3
+ import random
4
+ import time
5
+ from collections import defaultdict
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ import pandas as pd
10
+ from tqdm import tqdm
11
+
12
+ from repare.pedigree import Pedigree
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ class PedigreeReconstructor:
18
+ """
19
+ Manages and builds up a collection of potential Pedigrees.
20
+ """
21
+
22
# Constraint labels accepted in the "constraints" column of the relation data.
# Directional labels are listed in both orientations.
ALLOWED_CONSTRAINTS: frozenset[str] = frozenset(
    (
        # First-degree labels
        "parent-child",
        "child-parent",
        "siblings",
        # Second-degree labels: aunt/uncle and nephew/niece
        "maternal aunt/uncle-nephew/niece",
        "maternal nephew/niece-aunt/uncle",
        "paternal aunt/uncle-nephew/niece",
        "paternal nephew/niece-aunt/uncle",
        # Second-degree labels: grandparent and grandchild
        "maternal grandparent-grandchild",
        "maternal grandchild-grandparent",
        "paternal grandparent-grandchild",
        "paternal grandchild-grandparent",
        # Second-degree labels without a direction
        "maternal half-siblings",
        "paternal half-siblings",
        "double cousins",
    )
)
40
+
41
@classmethod
def get_allowed_constraints(cls) -> frozenset[str]:
    """
    Expose the class-level collection of constraint labels (ALLOWED_CONSTRAINTS).
    """
    allowed: frozenset[str] = cls.ALLOWED_CONSTRAINTS
    return allowed
47
+
48
def __init__(
    self,
    relations_path: Path | str,
    nodes_path: Path | str,
    outputs_dir: Path | str,
    max_candidate_pedigrees: int = 1000,
    epsilon: float = 0.2,
    plot: bool = True,
    plot_haplogroups: bool = True,
    write_alternate_pedigrees: bool = False,
    random_seed: Any = 42,
) -> None:
    """
    Load and validate the input data, store the run settings and seed the candidate list.

    Args:
        relations_path: CSV file with the pairwise relation data.
        nodes_path: CSV file with the node data.
        outputs_dir: Directory under which results are written.
        max_candidate_pedigrees: Number of pedigrees to downsample to after each
            iteration of the algorithm.
        epsilon: Parameter for epsilon-greedy sampling when pruning pedigrees.
        plot: Whether to plot the reconstructed pedigree(s).
        plot_haplogroups: Whether to plot haplogroups of the reconstructed pedigree(s).
        write_alternate_pedigrees: Whether to write corrected relations and plots of
            alternate final pedigrees.
        random_seed: Seed handed to ``random.Random``.

    Raises:
        ValueError: Propagated from the data and argument validators.
    """
    # Convert all three locations first so that an unusable path value fails before any file is read
    relations_file, nodes_file, outputs_root = Path(relations_path), Path(nodes_path), Path(outputs_dir)

    self._start_time = time.time()

    # Node data goes first: relation validation compares relation IDs against it
    self._validate_node_data(nodes_file)
    self._process_node_data()
    self._validate_relation_data(relations_file)
    self._process_relation_data()

    # Run settings
    self._outputs_dir = outputs_root
    self._max_candidate_pedigrees = max_candidate_pedigrees
    self._epsilon = epsilon
    self._plot = plot
    self._plot_haplogroups = plot_haplogroups
    self._write_alternate_pedigrees = write_alternate_pedigrees
    self._random_seed = random_seed
    self._rng = random.Random(self._random_seed)
    self._validate_arguments()

    # Upper bound on how often the algorithm is rerun when no valid pedigree is found
    self._MAX_RUNS = 10
    # Search state: start from a single pedigree that holds every input node
    self._candidate_pedigrees: list[Pedigree] = [self._get_initial_pedigree()]
    self._pair_to_constraints: defaultdict[tuple[str, str], list[tuple[str, ...]]] = self._get_pair_to_constraints()
    self._final_pedigrees: list[Pedigree] = []
89
+
90
+ def _validate_node_data(self, nodes_path: Path | str) -> None:
91
+ """
92
+ Validate node data input.
93
+ """
94
+ self._node_data = pd.read_csv(nodes_path, dtype=str, comment="#", keep_default_na=False)
95
+ for mandatory_column in ["id", "sex", "y_haplogroup", "mt_haplogroup"]:
96
+ if mandatory_column not in self._node_data.columns:
97
+ raise ValueError(f'Column "{mandatory_column}" not found in input node data.')
98
+
99
+ for optional_column in ["can_have_children", "can_be_inbred", "years_before_present"]:
100
+ if optional_column not in self._node_data.columns:
101
+ self._node_data[optional_column] = ""
102
+
103
+ # Numeric IDs are used for placeholder nodes
104
+ if self._node_data["id"].str.isnumeric().any():
105
+ raise ValueError("Sample IDs cannot be completely numeric.")
106
+
107
+ if self._node_data["id"].duplicated().any():
108
+ raise ValueError("Sample IDs must be unique.")
109
+
110
+ if self._node_data["id"].str.strip().eq("").any():
111
+ raise ValueError("Sample IDs cannot be empty.")
112
+
113
+ if not self._node_data["sex"].isin(["M", "F"]).all():
114
+ raise ValueError('Node sex must be "M" or "F".')
115
+
116
+ for haplogroup_column in ["y_haplogroup", "mt_haplogroup"]:
117
+ for haplogroup in self._node_data[haplogroup_column]:
118
+ if "*" in haplogroup[:-1]:
119
+ raise ValueError(
120
+ "Expandable haplogroups should contain one trailing asterisk. "
121
+ "No other asterisks are allowed in haplogroups."
122
+ )
123
+
124
+ if not self._node_data["can_have_children"].isin(["True", "False", ""]).all():
125
+ raise ValueError('can_have_children value must be "True", "False", or empty.')
126
+ if not self._node_data["can_be_inbred"].isin(["True", "False", ""]).all():
127
+ raise ValueError('can_be_inbred value must be "True", "False", or empty.')
128
+ if not self._node_data["years_before_present"].apply(lambda x: x.isnumeric() or x == "").all():
129
+ raise ValueError("years_before_present value must be integer or empty.")
130
+
131
def _process_node_data(self) -> None:
    """
    Normalise the validated node table: fixed column order, boolean flags, numeric ages.
    """
    column_order = [
        "id",
        "sex",
        "y_haplogroup",
        "mt_haplogroup",
        "can_have_children",
        "can_be_inbred",
        "years_before_present",
    ]
    # Keep only the known columns, in a fixed order
    self._node_data = self._node_data[column_order]

    # Both flag columns use the same text-to-bool table; an empty cell counts as True
    text_to_flag = {"False": False, "True": True, "": True}
    for flag_column in ("can_have_children", "can_be_inbred"):
        self._node_data[flag_column] = self._node_data[flag_column].map(text_to_flag)

    # Text that cannot be parsed as a number (including the empty string) becomes NaN
    years_as_text = self._node_data["years_before_present"]
    self._node_data["years_before_present"] = pd.to_numeric(years_as_text, errors="coerce")
150
+
151
+ def _validate_relation_data(self, relations_path: Path | str) -> None:
152
+ """
153
+ Validate relation data input.
154
+ """
155
+ self._relation_data = pd.read_csv(relations_path, dtype=str, comment="#", keep_default_na=False)
156
+ for column_name in ["id1", "id2", "degree", "constraints"]:
157
+ if column_name not in self._relation_data.columns:
158
+ raise ValueError(f'Column "{column_name}" not found in input relation data.')
159
+
160
+ for optional_column in ["force_constraints"]:
161
+ if optional_column not in self._relation_data.columns:
162
+ self._relation_data[optional_column] = ""
163
+
164
+ excess_relation_nodes = set(self._relation_data["id1"]).union(set(self._relation_data["id2"])) - set(
165
+ self._node_data["id"]
166
+ )
167
+ if excess_relation_nodes:
168
+ raise ValueError(f"All node IDs in relation data must be present in node data: {excess_relation_nodes}.")
169
+
170
+ if not self._relation_data["degree"].isin(["1", "2", "3"]).all():
171
+ raise ValueError("Degree must be 1, 2, or 3.")
172
+ if not self._relation_data["force_constraints"].isin(["True", "False", ""]).all():
173
+ raise ValueError('can_have_children value must be "True", "False", or empty.')
174
+
175
+ self._relation_data["pair_degree"] = self._relation_data.apply(
176
+ lambda row: tuple(sorted([row["id1"], row["id2"], row["degree"]])), axis=1
177
+ )
178
+ grouped_relations = self._relation_data.groupby("pair_degree")
179
+ # Check for groups with multiple non-empty constraints, which can lead to issues when counting inconsistencies
180
+ invalid_groups = grouped_relations.filter(lambda group: (group["constraints"] != "").sum() > 1)
181
+ if not invalid_groups.empty:
182
+ raise ValueError("Node pairs cannot have multiple non-empty constraints of the same degree.")
183
+ self._relation_data.drop("pair_degree", axis=1, inplace=True)
184
+
185
+ allowed_constraints = self.get_allowed_constraints()
186
+
187
+ def split_and_validate_constraints(constraints: str) -> None:
188
+ if constraints:
189
+ constraints_list = [c for c in constraints.split(";")]
190
+ if any(c not in allowed_constraints for c in constraints_list):
191
+ raise ValueError(
192
+ f"Invalid constraints found: {[c for c in constraints_list if c not in allowed_constraints]}."
193
+ )
194
+
195
+ self._relation_data["constraints"].apply(split_and_validate_constraints)
196
+
197
def _process_relation_data(self) -> None:
    """
    Normalise the validated relation table.

    Keeps the known columns, turns "force_constraints" into booleans, orders every pair
    so that id1 sorts before id2 (flipping directional constraints accordingly), fills
    empty constraints with the defaults for the row's degree, and finally fixes the
    processing order of the relations.
    """
    # Keep only the known columns, in a fixed order
    self._relation_data = self._relation_data[["id1", "id2", "degree", "constraints", "force_constraints"]]
    # An empty "force_constraints" cell counts as False
    self._relation_data["force_constraints"] = self._relation_data["force_constraints"].map(
        {"False": False, "True": True, "": False}
    )

    # Each constraint mapped to the label that applies once the two nodes trade places
    flipped_constraints = {
        "parent-child": "child-parent",
        "child-parent": "parent-child",
        "maternal aunt/uncle-nephew/niece": "maternal nephew/niece-aunt/uncle",
        "paternal aunt/uncle-nephew/niece": "paternal nephew/niece-aunt/uncle",
        "maternal nephew/niece-aunt/uncle": "maternal aunt/uncle-nephew/niece",
        "paternal nephew/niece-aunt/uncle": "paternal aunt/uncle-nephew/niece",
        "maternal grandparent-grandchild": "maternal grandchild-grandparent",
        "paternal grandparent-grandchild": "paternal grandchild-grandparent",
        "maternal grandchild-grandparent": "maternal grandparent-grandchild",
        "paternal grandchild-grandparent": "paternal grandparent-grandchild",
        "siblings": "siblings",  # Symmetric
        "maternal half-siblings": "maternal half-siblings",  # Symmetric
        "paternal half-siblings": "paternal half-siblings",  # Symmetric
        "double cousins": "double cousins",  # Symmetric
    }

    def sort_nodes(row: pd.Series) -> pd.Series:
        """
        Return the row with id1/id2 in sorted order, flipping its constraints if they were swapped.
        """
        if not row["id2"] < row["id1"]:
            # Already in order: nothing to change
            return row

        original_constraints = row["constraints"]
        relation_flipped_constraints = ""
        if original_constraints:
            labels = (label.strip() for label in original_constraints.split(";"))
            relation_flipped_constraints = ";".join(flipped_constraints[label] for label in labels)

        return pd.Series(
            {
                "id1": row["id2"],
                "id2": row["id1"],
                "degree": row["degree"],
                "constraints": relation_flipped_constraints,
                "force_constraints": row["force_constraints"],
            }
        )

    self._relation_data = self._relation_data.apply(sort_nodes, axis=1)

    # Note: We don't use maternal/paternal 3rd-degree relations because those are not well-defined
    self._DEFAULT_CONSTRAINTS = {
        "1": ("parent-child;child-parent;siblings"),
        "2": (
            "maternal aunt/uncle-nephew/niece;"
            "maternal nephew/niece-aunt/uncle;"
            "paternal aunt/uncle-nephew/niece;"
            "paternal nephew/niece-aunt/uncle;"
            "maternal grandparent-grandchild;"
            "maternal grandchild-grandparent;"
            "paternal grandparent-grandchild;"
            "paternal grandchild-grandparent;"
            "maternal half-siblings;"
            "paternal half-siblings;"
            "double cousins"
        ),
        "3": (
            "half aunt/uncle-half nephew/niece;"
            "half nephew/niece-half aunt/uncle;"
            "greatgrandparent-greatgrandchild;"
            "greatgrandchild-greatgrandparent;"
            "grandaunt/granduncle-grandnephew/grandniece;"
            "grandnephew/grandniece-grandaunt/granduncle;"
            "first cousins"
        ),
    }

    def fill_constraints(row: pd.Series) -> pd.Series:
        """
        Return the row with empty constraints replaced by the defaults for its degree.
        """
        if row["constraints"]:
            return row
        return pd.Series(
            {
                "id1": row["id1"],
                "id2": row["id2"],
                "degree": row["degree"],
                "constraints": self._DEFAULT_CONSTRAINTS[row["degree"]],
                "force_constraints": row["force_constraints"],
            }
        )

    self._relation_data = self._relation_data.apply(fill_constraints, axis=1)
    self._set_relation_processing_order()
296
+
297
def _set_relation_processing_order(self) -> None:
    """
    Split the relations by degree and, within each degree, put relations whose nodes take
    part in more relations first. Also rebuilds the combined relation tables.
    """
    all_rows = self._relation_data
    # Number of relation rows each node appears in, counting both ID columns
    appearances = pd.concat([all_rows["id1"], all_rows["id2"]]).value_counts()

    def prioritize(subset: pd.DataFrame) -> pd.DataFrame:
        first_counts = subset["id1"].map(appearances)
        second_counts = subset["id2"].map(appearances)
        busiest = pd.concat([first_counts, second_counts], axis=1).max(axis=1)
        ranked = subset.assign(max_node_degree=busiest, total_node_degree=first_counts + second_counts)
        # Primary key: the busier node of the pair; secondary key: both nodes combined
        ranked = ranked.sort_values(
            by=["max_node_degree", "total_node_degree"],
            ascending=[False, False],
        )
        ranked = ranked.drop(columns=["max_node_degree", "total_node_degree"])
        return ranked.reset_index(drop=True)

    def stacked(*tables: pd.DataFrame) -> pd.DataFrame:
        return pd.concat(list(tables)).reset_index(drop=True)

    self._first_degree_relations = prioritize(all_rows[all_rows["degree"] == "1"])
    self._second_degree_relations = prioritize(all_rows[all_rows["degree"] == "2"])
    self._third_degree_relations = prioritize(all_rows[all_rows["degree"] == "3"])
    self._first_and_second_degree_relations = stacked(
        self._first_degree_relations, self._second_degree_relations
    )
    self._all_relations = stacked(
        self._first_degree_relations, self._second_degree_relations, self._third_degree_relations
    )
325
+
326
def _validate_arguments(self) -> None:
    """
    Check the stored constructor settings and raise ValueError for unusable values.
    """
    limit = self._max_candidate_pedigrees
    limit_is_positive_int = isinstance(limit, int) and limit > 0
    if not limit_is_positive_int:
        raise ValueError("max_candidate_pedigrees must be a positive integer.")

    epsilon = self._epsilon
    # Written as a negated chained comparison so that NaN is rejected as well
    if not (0 <= epsilon <= 1):
        raise ValueError("epsilon must be between 0 and 1.")
334
+
335
def _shuffle_relations(self) -> None:
    """
    Put the rows of each per-degree relation table into a new random order and rebuild
    the combined tables (used when the algorithm is restarted).
    """

    def reshuffled(relations: pd.DataFrame) -> pd.DataFrame:
        # Each table gets its own seed, drawn from the instance RNG
        seed = self._rng.randint(0, 1_000_000)
        return relations.sample(frac=1, random_state=seed).reset_index(drop=True)

    # Seeds are drawn in the order first, second, third degree
    first = reshuffled(self._first_degree_relations)
    second = reshuffled(self._second_degree_relations)
    third = reshuffled(self._third_degree_relations)

    self._first_degree_relations = first
    self._second_degree_relations = second
    self._third_degree_relations = third
    self._first_and_second_degree_relations = pd.concat([first, second]).reset_index(drop=True)
    self._all_relations = pd.concat([first, second, third]).reset_index(drop=True)
354
+
355
def _get_initial_pedigree(self):
    """
    Build a fresh Pedigree and add one node per row of the node data.
    """
    pedigree = Pedigree()
    for node_fields in self._node_data.itertuples(index=False):
        # The node table is expected to hold exactly these seven columns, in this order
        (
            node_id,
            sex,
            y_haplogroup,
            mt_haplogroup,
            can_have_children,
            can_be_inbred,
            years_before_present,
        ) = node_fields
        pedigree.add_node(
            node_id, sex, y_haplogroup, mt_haplogroup, can_have_children, can_be_inbred, years_before_present
        )
    return pedigree
373
+
374
def find_best_pedigree(self) -> Pedigree:
    """
    Run the reconstruction and return one of the resulting final pedigrees.

    First- and second-degree relations are added one at a time to every candidate
    pedigree; after each relation the candidates are cleaned, validated and pruned.
    When a full pass leaves no final pedigree, the relations are shuffled and the pass
    is repeated, up to ``self._MAX_RUNS`` times. One final pedigree is then drawn with
    the instance RNG; its corrected input relations and exact relations are written
    under the outputs directory, it is plotted when plotting is enabled, and the
    remaining final pedigrees are written when alternate output is enabled.

    Raises:
        RuntimeError: If no valid pedigree is found after ``self._MAX_RUNS`` runs.
    """
    for _ in range(self._MAX_RUNS):
        progress_bar = tqdm(
            self._first_and_second_degree_relations.iterrows(),
            total=self._first_and_second_degree_relations.shape[0],
            smoothing=0.5,
            bar_format="{l_bar}{bar} | {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
        )
        for idx, row in progress_bar:
            node1, node2, degree, constraints, force_constraints = row
            logger.info(f"Current relation: {node1}, {node2}, {degree}")
            progress_bar.set_description(f"Processing relation {{{node1}, {node2}, {degree}}}")
            self._add_relation(
                node1, node2, degree=degree, constraints=constraints, force_constraints=force_constraints
            )
            self._clean_pedigree_data()
            self._validate_pedigree_structures()

            processed_relations = self._all_relations.iloc[: idx + 1]
            pair_to_relations_so_far = self._get_pair_to_relations_so_far(processed_relations)
            relations_pending = len(processed_relations) < len(self._first_and_second_degree_relations)
            # Don't check for extraneous half-sibling relations while first-degree relations
            # are still being added, because the 2 non-shared parents might be "merged" later
            defer_half_sibling_check = degree == "1" and relations_pending
            self._prune_pedigrees(pair_to_relations_so_far, check_half_siblings=not defer_half_sibling_check)

            remaining = len(self._candidate_pedigrees)
            elapsed = round(time.time() - self._start_time, 1)
            logger.info(
                f"Remaining pedigrees after pruning: {remaining}"
                "\t\tElapsed: "
                f"{elapsed} s\n"
            )

        if self._final_pedigrees:
            break
        logger.warning("No valid pedigree found. Shuffling relations and restarting algorithm.\n")
        self._candidate_pedigrees = [self._get_initial_pedigree()]
        self._shuffle_relations()

    if not self._final_pedigrees:
        logger.error(f"No valid pedigree found after {self._MAX_RUNS} runs. Exiting.")
        raise RuntimeError(f"No valid pedigree found after {self._MAX_RUNS} runs.")

    self._clean_pedigree_data()

    # Pick the sample pedigree together with its strike count and strike log
    sample_idx = self._rng.randint(0, len(self._final_pedigrees) - 1)
    self._sample_pedigree = self._final_pedigrees[sample_idx]
    self._sample_strike_count = self._final_strike_counts[sample_idx]
    self._sample_strike_log = self._final_strike_logs[sample_idx]
    self._sample_third_degree_strike_count = self._sample_pedigree.count_third_degree_inconsistencies(
        self._pair_to_constraints
    )
    logger.info(
        "Final pedigree strike counts — 1st/2nd degree: %s, 3rd degree: %s",
        self._sample_strike_count,
        self._sample_third_degree_strike_count,
    )

    # Write and plot the outputs of the sample pedigree
    outputs_dir = self._outputs_dir
    self._write_corrected_input_relations(
        self._sample_strike_count,
        self._sample_strike_log,
        outputs_dir / "corrected_input_relations.csv",
    )
    self._sample_pedigree.write_exact_relations(outputs_dir / "reconstructed_exact_relations.csv")
    if self._plot:
        try:
            self._sample_pedigree.plot(
                outputs_dir / "reconstructed_pedigree.pdf", plot_haplogroups=self._plot_haplogroups
            )
            pygraphviz_found = True
        except ImportError:
            logger.warning(
                "PyGraphviz (https://pygraphviz.github.io/) must be installed to plot pedigrees. "
                "Skipping plotting of reconstructed pedigree(s)."
            )
            pygraphviz_found = False

    # Write and plot the outputs of the alternate pedigrees
    if self._write_alternate_pedigrees:
        alternate_dir = outputs_dir / "alternate_pedigrees"
        alternate_dir.mkdir(parents=True, exist_ok=True)
        alternates = zip(self._final_pedigrees, self._final_strike_counts, self._final_strike_logs, strict=True)
        for idx, (pedigree, strike_count, strike_log) in enumerate(alternates):
            # The sample pedigree has already been written above
            if idx == sample_idx:
                continue

            self._write_corrected_input_relations(
                strike_count,
                strike_log,
                alternate_dir / f"pedigree_{idx}_corrected_input_relations.csv",
            )
            pedigree.write_exact_relations(alternate_dir / f"pedigree_{idx}_exact_relations.csv")
            # pygraphviz_found is only bound when plotting is enabled; the short-circuit guards it
            if self._plot and pygraphviz_found:
                pedigree.plot(alternate_dir / f"pedigree_{idx}.png", plot_haplogroups=self._plot_haplogroups)
        self._write_constant_relations(alternate_dir / "constant_relations.csv")

    return self._sample_pedigree
476
+
477
@staticmethod
def _check_haplogroups(haplogroup1: str, haplogroup2: str) -> bool:
    """
    Decide whether two haplogroup labels are compatible.

    An empty label is compatible with anything. Otherwise trailing "*" wildcard
    characters are ignored and the labels are compatible when one is a prefix of the other.
    (The original note says this shares semantics with pedigree.validate_haplogroups().)
    """
    if not (haplogroup1 and haplogroup2):  # empty OK
        return True
    base1 = haplogroup1.rstrip("*")
    base2 = haplogroup2.rstrip("*")
    # Only the shorter label can be a prefix of the longer one
    shorter, longer = sorted((base1, base2), key=len)
    return longer.startswith(shorter)
489
+
490
@staticmethod
def _check_parent_child_haplogroups(pedigree: Pedigree, parent: str, child: str) -> bool:
    """
    Decide whether a parent and a child have compatible haplogroups.

    A male parent and a male child are compared on Y haplogroup, a female parent and
    her child on MT haplogroup; every other combination passes.
    """
    parent_data = pedigree.get_data(parent)
    child_data = pedigree.get_data(child)

    if parent_data["sex"] == "M" and child_data["sex"] == "M":
        haplogroup_key = "y_haplogroup"
    elif parent_data["sex"] == "F":
        haplogroup_key = "mt_haplogroup"
    else:
        return True
    return PedigreeReconstructor._check_haplogroups(parent_data[haplogroup_key], child_data[haplogroup_key])
504
+
505
@staticmethod
def _check_sibling_haplogroups(pedigree: Pedigree, sibling1: str, sibling2: str) -> bool:
    """
    Decide whether two full siblings have compatible haplogroups.

    Two brothers must agree on Y haplogroup; all full siblings must agree on MT haplogroup.
    """
    data1 = pedigree.get_data(sibling1)
    data2 = pedigree.get_data(sibling2)
    compatible = PedigreeReconstructor._check_haplogroups

    both_male = data1["sex"] == "M" and data2["sex"] == "M"
    if both_male and not compatible(data1["y_haplogroup"], data2["y_haplogroup"]):
        return False
    # The MT comparison applies whatever the sexes are
    return compatible(data1["mt_haplogroup"], data2["mt_haplogroup"])
520
+
521
@staticmethod
def _check_aunt_uncle_nephew_niece_haplogroups(
    pedigree: Pedigree, aunt_uncle: str, nephew_niece: str, shared_relative_sex: str | None
) -> bool:
    """
    Decide whether an aunt/uncle and a nephew/niece have compatible haplogroups.

    Nothing is compared when shared_relative_sex is not given. With "M", an uncle and a
    nephew are compared on Y haplogroup; with "F", the pair is compared on MT haplogroup.
    """
    if not shared_relative_sex:
        return True

    elder = pedigree.get_data(aunt_uncle)
    younger = pedigree.get_data(nephew_niece)

    if shared_relative_sex == "M" and elder["sex"] == "M" and younger["sex"] == "M":
        haplogroup_key = "y_haplogroup"
    elif shared_relative_sex == "F":
        haplogroup_key = "mt_haplogroup"
    else:
        return True
    return PedigreeReconstructor._check_haplogroups(elder[haplogroup_key], younger[haplogroup_key])
544
+
545
@staticmethod
def _check_grandparent_grandchild_haplogroups(
    pedigree: Pedigree, grandparent: str, grandchild: str, shared_relative_sex: str | None
) -> bool:
    """
    Decide whether a grandparent and a grandchild have compatible haplogroups.

    Nothing is compared when shared_relative_sex is not given. With "M", a grandfather
    and a grandson are compared on Y haplogroup; with "F", a grandmother and her
    grandchild are compared on MT haplogroup.
    """
    if not shared_relative_sex:
        return True

    elder = pedigree.get_data(grandparent)
    younger = pedigree.get_data(grandchild)

    if shared_relative_sex == "M" and elder["sex"] == "M" and younger["sex"] == "M":
        haplogroup_key = "y_haplogroup"
    elif shared_relative_sex == "F" and elder["sex"] == "F":
        haplogroup_key = "mt_haplogroup"
    else:
        return True
    return PedigreeReconstructor._check_haplogroups(elder[haplogroup_key], younger[haplogroup_key])
568
+
569
@staticmethod
def _check_half_sibling_haplogroups(
    pedigree: Pedigree, half_sibling1: str, half_sibling2: str, shared_relative_sex: str | None
) -> bool:
    """
    Decide whether two half-siblings have compatible haplogroups.

    With shared_relative_sex "M", two half-brothers are compared on Y haplogroup; with
    "F", the pair is compared on MT haplogroup. Every other case passes.
    """
    data1 = pedigree.get_data(half_sibling1)
    data2 = pedigree.get_data(half_sibling2)

    if shared_relative_sex == "M" and data1["sex"] == "M" and data2["sex"] == "M":
        haplogroup_key = "y_haplogroup"
    elif shared_relative_sex == "F":
        haplogroup_key = "mt_haplogroup"
    else:
        return True
    return PedigreeReconstructor._check_haplogroups(data1[haplogroup_key], data2[haplogroup_key])
589
+
590
def _add_relation(self, node1: str, node2: str, degree: str, constraints: str, force_constraints: bool) -> None:
    """
    Replace every candidate pedigree with the pedigrees obtained by connecting the two nodes.

    With forced constraints, only the given constraints of the given degree are tried.
    Otherwise the default first- and second-degree constraints are both tried, and for a
    second-degree relation the unchanged pedigree is kept as well.
    """
    assert degree in ["1", "2"]

    connect_first = PedigreeReconstructor._connect_first_degree_relation
    connect_second = PedigreeReconstructor._connect_second_degree_relation
    connector_for_degree = {"1": connect_first, "2": connect_second}

    expanded: list[Pedigree] = []
    for pedigree in self._candidate_pedigrees:
        if degree not in connector_for_degree:
            continue

        if force_constraints:
            expanded.extend(connector_for_degree[degree](pedigree, node1, node2, constraints=constraints))
            continue

        if degree == "2":
            expanded.append(pedigree)  # No relation (i.e. false positive)
        expanded.extend(connect_first(pedigree, node1, node2, constraints=self._DEFAULT_CONSTRAINTS["1"]))
        expanded.extend(connect_second(pedigree, node1, node2, constraints=self._DEFAULT_CONSTRAINTS["2"]))
    self._candidate_pedigrees = expanded
637
+
638
@staticmethod
def _connect_first_degree_relation(pedigree: Pedigree, node1: str, node2: str, constraints: str) -> list[Pedigree]:
    """
    Try every first-degree relation named in ``constraints`` (separated by ";") and
    collect the resulting pedigrees. Labels that are not first-degree relations are skipped.
    """
    assert node1 in pedigree.node_to_data and node2 in pedigree.node_to_data

    # Label -> deferred call that connects the two nodes in the matching orientation
    connectors = {
        "parent-child": lambda: PedigreeReconstructor._connect_parent_relation(pedigree, node1, node2),
        "child-parent": lambda: PedigreeReconstructor._connect_parent_relation(pedigree, node2, node1),
        "siblings": lambda: PedigreeReconstructor._connect_sibling_relation(pedigree, node1, node2),
    }

    results: list[Pedigree] = []
    for relation in constraints.split(";"):
        connect = connectors.get(relation)
        if connect is not None:
            results.extend(connect())
    return results
656
+
657
@staticmethod
def _connect_second_degree_relation(pedigree: Pedigree, node1: str, node2: str, constraints: str) -> list[Pedigree]:
    """
    Try every second-degree relation named in ``constraints`` (separated by ";") and
    collect the resulting pedigrees. Labels that are not second-degree relations are skipped.
    """
    assert node1 in pedigree.node_to_data and node2 in pedigree.node_to_data

    # Label -> (connector method name, first node, second node, shared_relative_sex).
    # "maternal" maps to "F" and "paternal" to "M"; reversed labels swap the two nodes.
    plans = {
        "maternal aunt/uncle-nephew/niece": ("_connect_aunt_uncle_relation", node1, node2, "F"),
        "maternal nephew/niece-aunt/uncle": ("_connect_aunt_uncle_relation", node2, node1, "F"),
        "paternal aunt/uncle-nephew/niece": ("_connect_aunt_uncle_relation", node1, node2, "M"),
        "paternal nephew/niece-aunt/uncle": ("_connect_aunt_uncle_relation", node2, node1, "M"),
        "maternal grandparent-grandchild": ("_connect_grandparent_relation", node1, node2, "F"),
        "maternal grandchild-grandparent": ("_connect_grandparent_relation", node2, node1, "F"),
        "paternal grandparent-grandchild": ("_connect_grandparent_relation", node1, node2, "M"),
        "paternal grandchild-grandparent": ("_connect_grandparent_relation", node2, node1, "M"),
        "maternal half-siblings": ("_connect_half_sibling_relation", node1, node2, "F"),
        "paternal half-siblings": ("_connect_half_sibling_relation", node1, node2, "M"),
    }

    results: list[Pedigree] = []
    for relation in constraints.split(";"):
        if relation == "double cousins":
            # The only second-degree connector that takes no shared_relative_sex
            results.extend(PedigreeReconstructor._connect_double_cousin_relation(pedigree, node1, node2))
            continue

        plan = plans.get(relation)
        if plan is None:
            continue
        method_name, first_node, second_node, relative_sex = plan
        # Resolve the connector only when its label actually occurs
        connect = getattr(PedigreeReconstructor, method_name)
        results.extend(connect(pedigree, first_node, second_node, shared_relative_sex=relative_sex))
    return results
717
+
718
@staticmethod
def _connect_parent_relation(pedigree: Pedigree, node1: str, node2: str) -> list[Pedigree]:
    """
    Make node1 a parent of node2 in a copy of the pedigree.

    node1 is merged with node2's current father or mother, chosen by node1's sex.
    Returns a one-element list with the new Pedigree, or an empty list when the
    haplogroups are incompatible or the merge is rejected. The input pedigree is not modified.
    """
    assert node1 in pedigree.node_to_data and node2 in pedigree.node_to_data

    # Cheap rejection first, so that no deep copy is made for an impossible relation
    if not PedigreeReconstructor._check_parent_child_haplogroups(pedigree, node1, node2):
        return []

    candidate = copy.deepcopy(pedigree)
    candidate.fill_node_parents(node2)

    parent_is_male = candidate.get_data(node1)["sex"] == "M"
    current_parent: str = candidate.get_father(node2) if parent_is_male else candidate.get_mother(node2)

    if candidate.check_valid_merge(node1, current_parent) and candidate.merge_nodes(node1, current_parent):
        return [candidate]
    return []
743
+
744
@staticmethod
def _connect_sibling_relation(pedigree: Pedigree, node1: str, node2: str) -> list[Pedigree]:
    """
    Make node1 and node2 full siblings in a copy of the pedigree.

    The two fathers are merged first, then the two mothers. Returns a one-element list
    with the new Pedigree, or an empty list when the haplogroups are incompatible or
    either merge is rejected. The input pedigree is not modified.
    """
    assert node1 in pedigree.node_to_data and node2 in pedigree.node_to_data

    # Cheap rejection first, so that no deep copy is made for an impossible relation
    if not PedigreeReconstructor._check_sibling_haplogroups(pedigree, node1, node2):
        return []

    candidate = copy.deepcopy(pedigree)
    for child in (node1, node2):
        candidate.fill_node_parents(child)

    father1, father2 = candidate.get_father(node1), candidate.get_father(node2)
    if not candidate.check_valid_merge(father1, father2):
        return []
    if not candidate.merge_nodes(father1, father2):
        return []

    # The mothers are looked up only after the fathers have been merged
    mother1, mother2 = candidate.get_mother(node1), candidate.get_mother(node2)
    if not candidate.check_valid_merge(mother1, mother2):
        return []
    if not candidate.merge_nodes(mother1, mother2):
        return []

    candidate.add_sibling_relation(node1, node2)
    return [candidate]
772
+
773
@staticmethod
def _connect_aunt_uncle_relation(
    pedigree: Pedigree, node1: str, node2: str, shared_relative_sex: str | None = None
) -> list[Pedigree]:
    """
    Tries to make node1 an aunt/uncle of node2 by making node1 a sibling of one of node2's parents.
    shared_relative_sex selects which parent of node2 is tried: "M" (father), "F" (mother), or None (both).
    Returns a list with every resulting Pedigree (possibly empty).
    """
    assert node1 in pedigree.node_to_data
    assert node2 in pedigree.node_to_data
    assert shared_relative_sex in ["M", "F", None]

    # Screen on haplogroups before copying: rejected pairs never pay for the deep copy
    if not PedigreeReconstructor._check_aunt_uncle_nephew_niece_haplogroups(
        pedigree, node1, node2, shared_relative_sex
    ):
        return []

    candidate = copy.deepcopy(pedigree)
    candidate.fill_node_parents(node2)

    # Pick which of node2's parents may act as node1's sibling
    if shared_relative_sex == "M":
        parent_getters = (candidate.get_father,)
    elif shared_relative_sex == "F":
        parent_getters = (candidate.get_mother,)
    else:
        parent_getters = (candidate.get_father, candidate.get_mother)
    linking_parents: list[str] = [getter(node2) for getter in parent_getters]

    results: list[Pedigree] = []
    for linking_parent in linking_parents:
        # node1 cannot be its own sibling
        if node1 == linking_parent:
            continue
        results.extend(PedigreeReconstructor._connect_sibling_relation(candidate, node1, linking_parent))
    return results
806
+
807
@staticmethod
def _connect_grandparent_relation(
    pedigree: Pedigree, node1: str, node2: str, shared_relative_sex: str | None = None
) -> list[Pedigree]:
    """
    Tries to make node1 a grandparent of node2 by making node1 a parent of one of node2's parents.
    shared_relative_sex selects which parent of node2 is tried: "M" (father), "F" (mother), or None (both).
    Returns a list with every resulting Pedigree (possibly empty).
    """
    assert node1 in pedigree.node_to_data
    assert node2 in pedigree.node_to_data
    assert shared_relative_sex in ["M", "F", None]

    # Screen on haplogroups before copying: rejected pairs never pay for the deep copy
    if not PedigreeReconstructor._check_grandparent_grandchild_haplogroups(
        pedigree, node1, node2, shared_relative_sex
    ):
        return []

    candidate = copy.deepcopy(pedigree)
    candidate.fill_node_parents(node2)

    # Pick which of node2's parents may act as node1's child
    if shared_relative_sex == "M":
        parent_getters = (candidate.get_father,)
    elif shared_relative_sex == "F":
        parent_getters = (candidate.get_mother,)
    else:
        parent_getters = (candidate.get_father, candidate.get_mother)
    intermediate_parents: list[str] = [getter(node2) for getter in parent_getters]

    results: list[Pedigree] = []
    for intermediate in intermediate_parents:
        # node1 cannot be its own parent
        if node1 == intermediate:
            continue
        results.extend(PedigreeReconstructor._connect_parent_relation(candidate, node1, intermediate))
    return results
840
+
841
@staticmethod
def _connect_half_sibling_relation(
    pedigree: Pedigree, node1: str, node2: str, shared_relative_sex: str | None = None
) -> list[Pedigree]:
    """
    Tries to make node1 and node2 half-siblings by making a parent of one node a parent of the other.
    shared_relative_sex selects the shared parent: "M" (father), "F" (mother), anything else (both are tried).
    Returns a list with every resulting Pedigree (possibly empty).
    """
    assert node1 in pedigree.node_to_data
    assert node2 in pedigree.node_to_data

    # Screen on haplogroups before copying: rejected pairs never pay for the deep copy
    if not PedigreeReconstructor._check_half_sibling_haplogroups(pedigree, node1, node2, shared_relative_sex):
        return []

    candidate = copy.deepcopy(pedigree)
    for child in (node1, node2):
        candidate.fill_node_parents(child)

    # Pick which parent(s) may be the one the two nodes share
    if shared_relative_sex == "M":
        parent_getters = (candidate.get_father,)
    elif shared_relative_sex == "F":
        parent_getters = (candidate.get_mother,)
    else:
        parent_getters = (candidate.get_father, candidate.get_mother)
    node1_parents: list[str] = [getter(node1) for getter in parent_getters]
    node2_parents: list[str] = [getter(node2) for getter in parent_getters]

    # Attempts, in order: each of node1's parents adopted by node2, then each of node2's parents adopted by node1
    attempts = [(parent, node2) for parent in node1_parents] + [(parent, node1) for parent in node2_parents]

    results: list[Pedigree] = []
    for shared_parent, other_child in attempts:
        # A node cannot be its own parent
        if shared_parent != other_child:
            results.extend(PedigreeReconstructor._connect_parent_relation(candidate, shared_parent, other_child))
    return results
881
+
882
@staticmethod
def _connect_double_cousin_relation(
    pedigree: Pedigree, node1: str, node2: str, same_sex_siblings: bool | None = None
) -> list[Pedigree]:
    """
    Tries to make node1 and node2 double (first) cousins, i.e. both pairs of their parents are siblings.
    same_sex_siblings=True pairs father with father and mother with mother, False pairs father with mother
    and mother with father, and None tries both arrangements.
    Returns a list with every resulting Pedigree (possibly empty).
    """
    assert node1 in pedigree.node_to_data
    assert node2 in pedigree.node_to_data

    candidate = copy.deepcopy(pedigree)
    for child in (node1, node2):
        candidate.fill_node_parents(child)

    try_same_sex = same_sex_siblings is None or same_sex_siblings
    try_opposite_sex = same_sex_siblings is None or not same_sex_siblings

    results: list[Pedigree] = []

    if try_same_sex:
        father1 = candidate.get_father(node1)
        father2 = candidate.get_father(node2)
        if father1 != father2:
            for partial in PedigreeReconstructor._connect_sibling_relation(candidate, father1, father2):
                # Look the mothers up on the partial result: the first merge may have changed them
                mother1 = partial.get_mother(node1)
                mother2 = partial.get_mother(node2)
                if mother1 != mother2:
                    results.extend(PedigreeReconstructor._connect_sibling_relation(partial, mother1, mother2))

    if try_opposite_sex:
        father1 = candidate.get_father(node1)
        mother2 = candidate.get_mother(node2)
        for partial in PedigreeReconstructor._connect_sibling_relation(candidate, father1, mother2):
            # Look the remaining parents up on the partial result: the first merge may have changed them
            mother1 = partial.get_mother(node1)
            father2 = partial.get_father(node2)
            results.extend(PedigreeReconstructor._connect_sibling_relation(partial, mother1, father2))

    return results
923
+
924
def _clean_pedigree_data(self) -> None:
    """
    Calls clean_data() on every candidate Pedigree, then on every final Pedigree.
    """
    # Attributes are resolved lazily, one collection at a time, in candidate-then-final order
    for collection_name in ("_candidate_pedigrees", "_final_pedigrees"):
        for pedigree in getattr(self, collection_name):
            pedigree.clean_data()
933
+
934
def _validate_pedigree_structures(self) -> None:
    """
    Asserts that validate_structure() holds for every candidate Pedigree.
    Stops at the first failing Pedigree; like any assert, the check is skipped when Python runs with -O.
    """
    assert all(pedigree.validate_structure() for pedigree in self._candidate_pedigrees)
940
+
941
def _get_pair_to_constraints(self) -> defaultdict[tuple[str, str], list[tuple[str, ...]]]:
    """
    Group the rows of self._all_relations by (node1, node2) pair.
    Each row contributes one tuple of constraint names, obtained by splitting its constraints string on ";".
    A pair maps to a list of such tuples because the same pair can appear in more than one row.
    """
    grouped: defaultdict[tuple[str, str], list[tuple[str, ...]]] = defaultdict(list)
    for row in self._all_relations.itertuples(index=False):
        # Rows have exactly five fields; only the node IDs and the constraints string are used here
        first, second, _, raw_constraints, _ = row
        grouped[(first, second)].append(tuple(raw_constraints.split(";")))

    # Shortest (most specific) constraint tuples first, so they are checked first when pruning
    for alternatives in grouped.values():
        alternatives.sort(key=len)
    return grouped
954
+
955
def _get_pair_to_relations_so_far(
    self, processed_relations: pd.DataFrame
) -> defaultdict[tuple[str, str], list[tuple[str, str, bool]]]:
    """
    Group the rows of the relations processed so far by (node1, node2) pair.
    Each pair maps to a list of (degree, constraints, force_constraints) tuples, in row order.
    """
    history: defaultdict[tuple[str, str], list[tuple[str, str, bool]]] = defaultdict(list)
    for row in processed_relations.itertuples(index=False):
        # Rows have exactly five fields: the two node IDs followed by the relation details
        first, second, degree, constraints, forced = row
        history[(first, second)].append((degree, constraints, forced))
    return history
966
+
967
def _prune_pedigrees(
    self,
    pair_to_relations_so_far: defaultdict[tuple[str, str], list[tuple[str, str, bool]]],
    check_half_siblings: bool,
) -> None:
    """
    Remove pedigrees with inconsistencies.

    Candidates that fail any validation, or whose topological sort duplicates one already seen, are dropped.
    While fewer relations have been processed than there are 1st- and 2nd-degree relations, the survivors
    are capped by epsilon-greedy sampling and stored in self._candidate_pedigrees.
    Otherwise (final iteration) the pedigrees with the fewest strikes, ties broken by fewest 3rd-degree
    strikes, are appended to self._final_pedigrees, and self._final_strike_counts and
    self._final_strike_logs are rebuilt for all final pedigrees.
    """
    # Loop-invariant: build the expected member set once instead of once per candidate
    expected_members = set(self._node_data["id"])

    seen_topologies = set()
    new_potential_pedigrees = []
    for pedigree in self._candidate_pedigrees:
        if not (
            pedigree.validate_members(expected_members)
            and pedigree.validate_can_have_children()
            and pedigree.validate_inbreeding()
            and pedigree.validate_years_before_present()
            and pedigree.validate_forced_constraints(pair_to_relations_so_far)
        ):
            continue
        # Haplogroups are refreshed only for structurally valid pedigrees, then validated
        pedigree.update_haplogroups()
        if not pedigree.validate_haplogroups():
            continue
        # Keep only the first pedigree seen for each topology
        topology = pedigree.get_topo_sort()
        if topology in seen_topologies:
            continue
        seen_topologies.add(topology)
        new_potential_pedigrees.append(pedigree)
    # Shuffle to avoid ordering bias in epsilon-greedy sampling
    self._rng.shuffle(new_potential_pedigrees)

    strikes = []
    third_degree_strikes = []
    counts: defaultdict[int, int] = defaultdict(int)
    for pedigree in new_potential_pedigrees:
        num_strikes, _ = pedigree.count_inconsistencies(
            self._pair_to_constraints, pair_to_relations_so_far, check_half_siblings
        )
        num_third_degree_strikes = pedigree.count_third_degree_inconsistencies(self._pair_to_constraints)
        strikes.append(num_strikes)
        third_degree_strikes.append(num_third_degree_strikes)
        counts[num_strikes] += 1
    # Lazy %-formatting: the message is only rendered if INFO logging is enabled
    logger.info("Strike counts before pruning: %s", dict(sorted(counts.items())))

    def epsilon_greedy_sample(
        pedigrees: list[Pedigree],
        strikes: list[int],
        third_degree_strikes: list[int],
        epsilon: float,
        max_candidate_pedigrees: int,
    ) -> list[Pedigree]:
        # Keep the best (1 - epsilon) share by strike count and fill the rest with a random sample
        assert len(pedigrees) == len(strikes)
        if len(pedigrees) <= max_candidate_pedigrees:
            return pedigrees

        # Rank by strikes, then by 3rd-degree strikes; the sort is stable, so ties keep the shuffled order
        sorted_pedigrees = [
            pedigree
            for pedigree, _, _ in sorted(
                zip(pedigrees, strikes, third_degree_strikes, strict=True), key=lambda x: (x[1], x[2])
            )
        ]
        exploitation_max_candidate_pedigrees = int((1 - epsilon) * max_candidate_pedigrees)
        exploration_max_candidate_pedigrees = max_candidate_pedigrees - exploitation_max_candidate_pedigrees

        exploitation_pedigrees = sorted_pedigrees[:exploitation_max_candidate_pedigrees]
        exploration_pedigrees = self._rng.sample(
            sorted_pedigrees[exploitation_max_candidate_pedigrees:], exploration_max_candidate_pedigrees
        )
        return exploitation_pedigrees + exploration_pedigrees

    num_processed_relations = sum(len(relations) for relations in pair_to_relations_so_far.values())
    if num_processed_relations < len(self._first_and_second_degree_relations):
        self._candidate_pedigrees = epsilon_greedy_sample(
            new_potential_pedigrees,
            strikes,
            third_degree_strikes,
            epsilon=self._epsilon,
            max_candidate_pedigrees=self._max_candidate_pedigrees,
        )
    else:
        # Final iteration
        # Compute each minimum once (not once per element); default=None keeps the
        # no-surviving-pedigree case a harmless no-op instead of a ValueError
        fewest_strikes = min(strikes, default=None)
        finalists = [
            (pedigree, num_third_degree_strikes)
            for pedigree, num_strikes, num_third_degree_strikes in zip(
                new_potential_pedigrees, strikes, third_degree_strikes, strict=True
            )
            if num_strikes == fewest_strikes
        ]
        # Use 3rd-degree strikes as tiebreaker, reusing the counts computed above
        fewest_third_degree_strikes = min((num for _, num in finalists), default=None)
        self._final_pedigrees.extend(
            pedigree for pedigree, num in finalists if num == fewest_third_degree_strikes
        )

        # Final strike counts and logs are always computed with half-sibling checking enabled
        self._final_strike_counts = []
        self._final_strike_logs = []
        for pedigree in self._final_pedigrees:
            strike_count, strike_log = pedigree.count_inconsistencies(
                self._pair_to_constraints, pair_to_relations_so_far, check_half_siblings=True
            )
            self._final_strike_counts.append(strike_count)
            self._final_strike_logs.append(strike_log)
1069
+
1070
+ def _write_corrected_input_relations(
1071
+ self, strike_count: int, strike_log: list[tuple[str, str, str, str]], path: Path | str
1072
+ ) -> None:
1073
+ """
1074
+ Write corrected input relations to file. Includes information about added/removed/changed input relations.
1075
+ """
1076
+ path = Path(path)
1077
+ added_relations = []
1078
+ removed_relations = []
1079
+ for node1, node2, degree, constraints in strike_log:
1080
+ if degree[0] == "+":
1081
+ added_relations.append((node1, node2, degree[1], constraints))
1082
+ else:
1083
+ removed_relations.append((node1, node2, degree[1], constraints))
1084
+ removed_relations_set = set(removed_relations)
1085
+
1086
+ # Separate out *changed* relations (added relation + removed relation pair, e.g., 1st-degree -> 2nd-degree)
1087
+ changed_node_pairs = set()
1088
+ for add_node1, add_node2, _, _ in added_relations:
1089
+ for remove_node1, remove_node2, _, _ in removed_relations:
1090
+ if (add_node1 == remove_node1 and add_node2 == remove_node2) or (
1091
+ add_node2 == remove_node1 and add_node1 == remove_node2
1092
+ ):
1093
+ # Removed pairs follow the input pair order and will be written to file in that order, so use those
1094
+ # Then, we can sort changed_node_pairs on the tuple order that will actually be written to file
1095
+ changed_node_pairs.add((remove_node1, remove_node2))
1096
+
1097
+ with path.open("w") as file:
1098
+ file.write("id1,id2,degree,constraints\n") # Header line
1099
+ file.write(f"# Final inconsistency count: {strike_count}\n")
1100
+ file.write(
1101
+ "# Note: 3rd-degree relations are not explicitly reconstructed and will not appear as modified here\n"
1102
+ )
1103
+
1104
+ def write_relations_line(node1, node2, degree, constraints, commented=False):
1105
+ if constraints == self._DEFAULT_CONSTRAINTS[degree]:
1106
+ # Don't write default constraints to file
1107
+ constraints = ""
1108
+ if commented:
1109
+ file.write("# ")
1110
+ file.write(f"{node1},{node2},{degree},{constraints}\n")
1111
+
1112
+ file.write("# Added relations\n")
1113
+ # Sort for consistency
1114
+ for node1, node2, degree, constraints in sorted(added_relations):
1115
+ if (node1, node2) not in changed_node_pairs and (node2, node1) not in changed_node_pairs:
1116
+ write_relations_line(node1, node2, degree, constraints)
1117
+
1118
+ file.write("\n# Removed relations\n")
1119
+ for node1, node2, degree, constraints in sorted(removed_relations):
1120
+ if (node1, node2) not in changed_node_pairs and (node2, node1) not in changed_node_pairs:
1121
+ write_relations_line(node1, node2, degree, constraints, commented=True)
1122
+
1123
+ file.write("\n# Changed relations\n")
1124
+ # Pair up changed relations (add + remove)
1125
+ for node1, node2 in sorted(changed_node_pairs):
1126
+ # We want to write the two nodes in the correct (original) order
1127
+ node1_to_write = None
1128
+ node2_to_write = None
1129
+ for node1_remove, node2_remove, degree_remove, constraints_remove in removed_relations:
1130
+ if (node1_remove, node2_remove) == (node1, node2) or (node2_remove, node1_remove) == (node1, node2):
1131
+ write_relations_line(
1132
+ node1_remove, node2_remove, degree_remove, constraints_remove, commented=True
1133
+ )
1134
+ # The removed nodes follow the original input order
1135
+ node1_to_write = node1_remove
1136
+ node2_to_write = node2_remove
1137
+ for node1_add, node2_add, degree_add, constraints_add in added_relations:
1138
+ if (node1_add, node2_add) == (node1, node2) or (node2_add, node1_add) == (node1, node2):
1139
+ assert node1_to_write and node2_to_write
1140
+ write_relations_line(node1_to_write, node2_to_write, degree_add, constraints_add)
1141
+
1142
+ file.write("\n# Unchanged relations\n")
1143
+ for node1, node2, degree, constraints, _ in self._all_relations.itertuples(index=False):
1144
+ if (node1, node2, degree, constraints) not in removed_relations_set:
1145
+ assert (node2, node1, degree, constraints) not in removed_relations_set
1146
+ write_relations_line(node1, node2, degree, constraints)
1147
+
1148
+ def _write_constant_relations(self, path: Path | str) -> None:
1149
+ """
1150
+ Write relations that are identical across all final pedigrees.
1151
+ """
1152
+ path = Path(path)
1153
+ node_sets = [pedigree.get_non_placeholder_nodes() for pedigree in self._final_pedigrees]
1154
+ nodes = next(iter(node_sets))
1155
+ assert all(node_set == nodes for node_set in node_sets)
1156
+
1157
+ nodes = sorted(nodes)
1158
+ with path.open("w") as file:
1159
+ file.write("# Constant kinship relations across all output pedigrees\n")
1160
+ file.write("node1,node2,relation\n")
1161
+ for i in range(len(nodes)):
1162
+ node1 = nodes[i]
1163
+ for j in range(i + 1, len(nodes)):
1164
+ node2 = nodes[j]
1165
+ shared_relations: dict[str, int] | None = None
1166
+ for pedigree in self._final_pedigrees:
1167
+ relations = pedigree.get_relations_between_nodes(node1, node2, include_maternal_paternal=True)
1168
+ if shared_relations is None:
1169
+ shared_relations = dict(relations)
1170
+ else:
1171
+ to_remove = [
1172
+ relation
1173
+ for relation, count in shared_relations.items()
1174
+ if relations.get(relation, 0) != count
1175
+ ]
1176
+ for relation in to_remove:
1177
+ del shared_relations[relation]
1178
+ if not shared_relations:
1179
+ break
1180
+
1181
+ if not shared_relations:
1182
+ continue
1183
+
1184
+ for relation, count in shared_relations.items():
1185
+ for _ in range(count):
1186
+ file.write(f"{node1},{node2},{relation}\n")