repare-0.0.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of repare might be problematic. Click here for more details.

Files changed (57) hide show
  1. repare-0.0.2/.gitignore +181 -0
  2. repare-0.0.2/.pre-commit-config.yaml +9 -0
  3. repare-0.0.2/LICENSE +7 -0
  4. repare-0.0.2/PKG-INFO +35 -0
  5. repare-0.0.2/README.md +15 -0
  6. repare-0.0.2/benchmarks/published/comparator/__init__.py +0 -0
  7. repare-0.0.2/benchmarks/published/comparator/relation_comparison.py +342 -0
  8. repare-0.0.2/benchmarks/published/data/blocher/citation.bib +16 -0
  9. repare-0.0.2/benchmarks/published/data/blocher/inferred_relations_KIN.csv +115 -0
  10. repare-0.0.2/benchmarks/published/data/blocher/inferred_relations_custom.csv +115 -0
  11. repare-0.0.2/benchmarks/published/data/blocher/nodes.csv +34 -0
  12. repare-0.0.2/benchmarks/published/data/blocher/published_exact_relations.csv +83 -0
  13. repare-0.0.2/benchmarks/published/data/fowler/citation.bib +15 -0
  14. repare-0.0.2/benchmarks/published/data/fowler/inferred_relations_coeffs.csv +124 -0
  15. repare-0.0.2/benchmarks/published/data/fowler/inferred_relations_custom.csv +126 -0
  16. repare-0.0.2/benchmarks/published/data/fowler/nodes.csv +37 -0
  17. repare-0.0.2/benchmarks/published/data/fowler/published_exact_relations.csv +71 -0
  18. repare-0.0.2/benchmarks/published/data/rivollat/citation.bib +34 -0
  19. repare-0.0.2/benchmarks/published/data/rivollat/inferred_relations_READv2.csv +423 -0
  20. repare-0.0.2/benchmarks/published/data/rivollat/nodes.csv +96 -0
  21. repare-0.0.2/benchmarks/published/data/rivollat/published_exact_relations.csv +266 -0
  22. repare-0.0.2/benchmarks/published/run_comparisons.py +26 -0
  23. repare-0.0.2/benchmarks/simulations/plot_data.py +99 -0
  24. repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.0_error_rate_scale=0.0.csv +101 -0
  25. repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.0_error_rate_scale=0.5.csv +101 -0
  26. repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.0_error_rate_scale=1.0.csv +101 -0
  27. repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.0_error_rate_scale=2.csv +101 -0
  28. repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.2_error_rate_scale=0.0.csv +101 -0
  29. repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.2_error_rate_scale=0.5.csv +101 -0
  30. repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.2_error_rate_scale=1.0.csv +101 -0
  31. repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.2_error_rate_scale=2.csv +101 -0
  32. repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.4_error_rate_scale=0.0.csv +101 -0
  33. repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.4_error_rate_scale=0.5.csv +101 -0
  34. repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.4_error_rate_scale=1.0.csv +101 -0
  35. repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.4_error_rate_scale=2.csv +101 -0
  36. repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.6_error_rate_scale=0.0.csv +101 -0
  37. repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.6_error_rate_scale=0.5.csv +101 -0
  38. repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.6_error_rate_scale=1.0.csv +101 -0
  39. repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.6_error_rate_scale=2.csv +101 -0
  40. repare-0.0.2/benchmarks/simulations/results/plots/degree_f1_heatmap.png +0 -0
  41. repare-0.0.2/benchmarks/simulations/results/plots/pedigree_summary_statistics.png +0 -0
  42. repare-0.0.2/benchmarks/simulations/results/plots/relation_f1_heatmap.png +0 -0
  43. repare-0.0.2/benchmarks/simulations/run_simulations.py +52 -0
  44. repare-0.0.2/benchmarks/simulations/simulator/__init__.py +0 -0
  45. repare-0.0.2/benchmarks/simulations/simulator/simulated_pedigree.py +612 -0
  46. repare-0.0.2/pyproject.toml +50 -0
  47. repare-0.0.2/setup.cfg +4 -0
  48. repare-0.0.2/src/repare/__init__.py +4 -0
  49. repare-0.0.2/src/repare/main.py +64 -0
  50. repare-0.0.2/src/repare/pedigree.py +1214 -0
  51. repare-0.0.2/src/repare/pedigree_reconstructor.py +874 -0
  52. repare-0.0.2/src/repare.egg-info/PKG-INFO +35 -0
  53. repare-0.0.2/src/repare.egg-info/SOURCES.txt +55 -0
  54. repare-0.0.2/src/repare.egg-info/dependency_links.txt +1 -0
  55. repare-0.0.2/src/repare.egg-info/entry_points.txt +2 -0
  56. repare-0.0.2/src/repare.egg-info/requires.txt +11 -0
  57. repare-0.0.2/src/repare.egg-info/top_level.txt +1 -0
@@ -0,0 +1,181 @@
1
+ # Data directories
2
+ *raw_data
3
+ *sandbox_data
4
+
5
+ # .DS_Store files
6
+ .DS_Store
7
+ ._.DS_Store
8
+ **/.DS_Store
9
+ **/._.DS_Store
10
+
11
+ # Byte-compiled / optimized / DLL files
12
+ __pycache__/
13
+ *.py[cod]
14
+ *$py.class
15
+
16
+ # C extensions
17
+ *.so
18
+
19
+ # Distribution / packaging
20
+ .Python
21
+ build/
22
+ develop-eggs/
23
+ dist/
24
+ downloads/
25
+ eggs/
26
+ .eggs/
27
+ lib/
28
+ lib64/
29
+ parts/
30
+ sdist/
31
+ var/
32
+ wheels/
33
+ share/python-wheels/
34
+ *.egg-info/
35
+ .installed.cfg
36
+ *.egg
37
+ MANIFEST
38
+
39
+ # PyInstaller
40
+ # Usually these files are written by a python script from a template
41
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
42
+ *.manifest
43
+ *.spec
44
+
45
+ # Installer logs
46
+ pip-log.txt
47
+ pip-delete-this-directory.txt
48
+
49
+ # Unit test / coverage reports
50
+ htmlcov/
51
+ .tox/
52
+ .nox/
53
+ .coverage
54
+ .coverage.*
55
+ .cache
56
+ nosetests.xml
57
+ coverage.xml
58
+ *.cover
59
+ *.py,cover
60
+ .hypothesis/
61
+ .pytest_cache/
62
+ cover/
63
+
64
+ # Translations
65
+ *.mo
66
+ *.pot
67
+
68
+ # Django stuff:
69
+ *.log
70
+ local_settings.py
71
+ db.sqlite3
72
+ db.sqlite3-journal
73
+
74
+ # Flask stuff:
75
+ instance/
76
+ .webassets-cache
77
+
78
+ # Scrapy stuff:
79
+ .scrapy
80
+
81
+ # Sphinx documentation
82
+ docs/_build/
83
+
84
+ # PyBuilder
85
+ .pybuilder/
86
+ target/
87
+
88
+ # Jupyter Notebook
89
+ .ipynb_checkpoints
90
+
91
+ # IPython
92
+ profile_default/
93
+ ipython_config.py
94
+
95
+ # pyenv
96
+ # For a library or package, you might want to ignore these files since the code is
97
+ # intended to run in multiple environments; otherwise, check them in:
98
+ # .python-version
99
+
100
+ # pipenv
101
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
102
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
103
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
104
+ # install all needed dependencies.
105
+ #Pipfile.lock
106
+
107
+ # UV
108
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
109
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
110
+ # commonly ignored for libraries.
111
+ #uv.lock
112
+
113
+ # poetry
114
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
115
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
116
+ # commonly ignored for libraries.
117
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
118
+ #poetry.lock
119
+
120
+ # pdm
121
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
122
+ #pdm.lock
123
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
124
+ # in version control.
125
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
126
+ .pdm.toml
127
+ .pdm-python
128
+ .pdm-build/
129
+
130
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
131
+ __pypackages__/
132
+
133
+ # Celery stuff
134
+ celerybeat-schedule
135
+ celerybeat.pid
136
+
137
+ # SageMath parsed files
138
+ *.sage.py
139
+
140
+ # Environments
141
+ .env
142
+ .venv
143
+ env/
144
+ venv/
145
+ ENV/
146
+ env.bak/
147
+ venv.bak/
148
+
149
+ # Spyder project settings
150
+ .spyderproject
151
+ .spyproject
152
+
153
+ # Rope project settings
154
+ .ropeproject
155
+
156
+ # mkdocs documentation
157
+ /site
158
+
159
+ # mypy
160
+ .mypy_cache/
161
+ .dmypy.json
162
+ dmypy.json
163
+
164
+ # Pyre type checker
165
+ .pyre/
166
+
167
+ # pytype static type analyzer
168
+ .pytype/
169
+
170
+ # Cython debug symbols
171
+ cython_debug/
172
+
173
+ # PyCharm
174
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
175
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
176
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
177
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
178
+ #.idea/
179
+
180
+ # PyPI configuration file
181
+ .pypirc
@@ -0,0 +1,9 @@
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ # Ruff version.
4
+ rev: v0.11.5
5
+ hooks:
6
+ # Run the linter.
7
+ - id: ruff
8
+ # Run the formatter.
9
+ - id: ruff-format
repare-0.0.2/LICENSE ADDED
@@ -0,0 +1,7 @@
1
+ Copyright 2024 (c) Edward Huang
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
repare-0.0.2/PKG-INFO ADDED
@@ -0,0 +1,35 @@
1
+ Metadata-Version: 2.4
2
+ Name: repare
3
+ Version: 0.0.2
4
+ Summary: Reconstruct ancient pedigrees.
5
+ Author-email: Edward Huang <edwardhuang02@gmail.com>
6
+ License-Expression: MIT
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: matplotlib
11
+ Requires-Dist: networkx
12
+ Requires-Dist: pandas
13
+ Requires-Dist: tqdm
14
+ Provides-Extra: benchmark
15
+ Requires-Dist: scikit-learn; extra == "benchmark"
16
+ Requires-Dist: seaborn; extra == "benchmark"
17
+ Provides-Extra: plot
18
+ Requires-Dist: pygraphviz; extra == "plot"
19
+ Dynamic: license-file
20
+
21
+ **repare** is a Python package for (ancient) pedigree reconstruction.
22
+
23
+ ## Installation
24
+
25
+ ### Recommended
26
+ ```
27
+ conda create -n "repare" -c conda-forge python=3.13 pygraphviz
28
+ conda activate repare
29
+ pip install repare
30
+ ```
31
+ repare uses PyGraphviz to plot reconstructed pedigrees. Since PyGraphviz relies on Graphviz which cannot be installed using `pip`, we recommend installing repare and its dependencies in a fresh conda environment.
32
+
33
+ If you don't need to plot reconstructed pedigrees, you can install repare directly with `pip install repare`. If you need to plot reconstructed pedigrees and have your own Graphviz installation, you can install repare and PyGraphviz with `pip install repare[plot]`.
34
+
35
+ To install conda, see [this page](https://www.anaconda.com/docs/getting-started/miniconda/install). To install PyGraphviz and Graphviz (yourself), see [this page](https://pygraphviz.github.io/documentation/stable/install.html).
repare-0.0.2/README.md ADDED
@@ -0,0 +1,15 @@
1
+ **repare** is a Python package for (ancient) pedigree reconstruction.
2
+
3
+ ## Installation
4
+
5
+ ### Recommended
6
+ ```
7
+ conda create -n "repare" -c conda-forge python=3.13 pygraphviz
8
+ conda activate repare
9
+ pip install repare
10
+ ```
11
+ repare uses PyGraphviz to plot reconstructed pedigrees. Since PyGraphviz relies on Graphviz which cannot be installed using `pip`, we recommend installing repare and its dependencies in a fresh conda environment.
12
+
13
+ If you don't need to plot reconstructed pedigrees, you can install repare directly with `pip install repare`. If you need to plot reconstructed pedigrees and have your own Graphviz installation, you can install repare and PyGraphviz with `pip install repare[plot]`.
14
+
15
+ To install conda, see [this page](https://www.anaconda.com/docs/getting-started/miniconda/install). To install PyGraphviz and Graphviz (yourself), see [this page](https://pygraphviz.github.io/documentation/stable/install.html).
@@ -0,0 +1,342 @@
1
+ import logging
2
+ import tempfile
3
+ from collections import defaultdict
4
+ from itertools import combinations
5
+
6
+ import pandas as pd
7
+ from sklearn.metrics import r2_score
8
+
9
+ from repare.pedigree import Pedigree
10
+ from repare.pedigree_reconstructor import PedigreeReconstructor
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class RelationComparison:
    """
    Generates an algorithm-reconstructed pedigree and compares it to a published/ground-truth pedigree.

    Published relations are read from a CSV (columns: id1, id2, relation). The algorithm
    pedigree is reconstructed from node/relation CSVs with `PedigreeReconstructor`.
    Pairwise relation multisets are then compared to produce accuracy, precision, recall,
    F1, connectivity, and kinship-inference-error metrics.
    """

    def __init__(self, published_relations_path: str, algorithm_nodes_path: str, algorithm_relations_path: str) -> None:
        """
        :param published_relations_path: CSV of published pairwise relations.
        :param algorithm_nodes_path: CSV of node metadata for the reconstructor.
        :param algorithm_relations_path: CSV of inferred pairwise relations for the reconstructor.
        """
        self._published_relations_path = published_relations_path
        self._algorithm_nodes_path = algorithm_nodes_path
        self._algorithm_relations_path = algorithm_relations_path

        # Both maps are keyed by a lexicographically sorted (id1, id2) pair and map
        # relation label -> multiplicity.
        self._published_relation_counts: defaultdict[tuple[str, str], defaultdict[str, int]] = (
            self._load_published_relations(self._published_relations_path)
        )
        self._algorithm_relation_counts: defaultdict[tuple[str, str], defaultdict[str, int]] = (
            self._load_algorithm_relations(self._algorithm_nodes_path, self._algorithm_relations_path)
        )
        self._fill_uncertain_relations()

    def _load_published_relations(self, path: str) -> defaultdict[tuple[str, str], defaultdict[str, int]]:
        """Load the published relations CSV into sorted-pair -> relation -> count form."""
        published_relations_df = pd.read_csv(path, comment="#")
        relation_counts: defaultdict[tuple[str, str], defaultdict[str, int]] = defaultdict(lambda: defaultdict(int))
        for id1, id2, relation in published_relations_df.itertuples(index=False):
            id1, id2, relation = self._sort_relation(id1, id2, relation)
            relation_counts[(id1, id2)][relation] += 1
        return relation_counts

    def _load_algorithm_relations(
        self, nodes_path: str, relations_path: str
    ) -> defaultdict[tuple[str, str], defaultdict[str, int]]:
        """Reconstruct the pedigree and collect its pairwise relation counts (placeholder nodes skipped)."""
        self._algorithm_pedigree: "Pedigree" = self._run_algorithm(nodes_path, relations_path)
        algorithm_relations: defaultdict[tuple[str, str], defaultdict[str, int]] = defaultdict(lambda: defaultdict(int))

        for id1, id2 in combinations(self._algorithm_pedigree.node_to_data, 2):
            if not id1.isnumeric() and not id2.isnumeric():  # Numeric IDs are placeholder nodes
                relations_between_nodes = self._algorithm_pedigree.get_relations_between_nodes(
                    id1, id2, include_maternal_paternal=True
                )
                for relation, count in relations_between_nodes.items():
                    # _sort_relation is idempotent, so rebinding id1/id2 inside the loop is safe.
                    id1, id2, relation = self._sort_relation(id1, id2, relation)
                    algorithm_relations[(id1, id2)][relation] += count
        return algorithm_relations

    @staticmethod
    def _sort_relation(id1: str, id2: str, relation: str) -> tuple[str, str, str]:
        """Return the relation with IDs in lexicographic order, flipping directional labels as needed."""
        flipped_relations = {
            "parent-child": "child-parent",
            "child-parent": "parent-child",
            "siblings": "siblings",  # Symmetric
            "maternal aunt/uncle-nephew/niece": "maternal nephew/niece-aunt/uncle",
            "maternal nephew/niece-aunt/uncle": "maternal aunt/uncle-nephew/niece",
            "paternal aunt/uncle-nephew/niece": "paternal nephew/niece-aunt/uncle",
            "paternal nephew/niece-aunt/uncle": "paternal aunt/uncle-nephew/niece",
            "maternal grandparent-grandchild": "maternal grandchild-grandparent",
            "maternal grandchild-grandparent": "maternal grandparent-grandchild",
            "paternal grandparent-grandchild": "paternal grandchild-grandparent",
            "paternal grandchild-grandparent": "paternal grandparent-grandchild",
            "maternal half-siblings": "maternal half-siblings",  # Symmetric
            "paternal half-siblings": "paternal half-siblings",  # Symmetric
            "1": "1",  # Symmetric
            "2": "2",  # Symmetric
        }
        if id2 < id1:
            return id2, id1, flipped_relations[relation]
        else:
            return id1, id2, relation

    @staticmethod
    def _run_algorithm(nodes_path: str, relations_path: str) -> "Pedigree":
        """Run the reconstructor (outputs to a throwaway temp dir) and return the best pedigree."""
        with tempfile.TemporaryDirectory() as temp_dir:
            pedigree_reconstructor = PedigreeReconstructor(
                relations_path, nodes_path, outputs_dir=temp_dir, max_candidate_pedigrees=1000, plot=False
            )
            return pedigree_reconstructor.find_best_pedigree()

    def _fill_uncertain_relations(self) -> None:
        """
        Redistribute uncertain published degree-only relations ("1"/"2") onto the exact
        relations the algorithm found, so exact-relation comparisons are not penalized
        for uncertainty in the published pedigree.
        """
        uncertain_to_exact_relations = {
            "1": ["parent-child", "child-parent", "siblings"],
            "2": [
                "maternal aunt/uncle-nephew/niece",
                "maternal nephew/niece-aunt/uncle",
                "paternal aunt/uncle-nephew/niece",
                "paternal nephew/niece-aunt/uncle",
                "maternal grandparent-grandchild",
                "maternal grandchild-grandparent",
                "paternal grandparent-grandchild",
                "paternal grandchild-grandparent",
                "maternal half-siblings",
                "paternal half-siblings",
            ],
        }

        for (id1, id2), relation_counts_between_nodes in self._published_relation_counts.items():
            for uncertain_relation, count in list(relation_counts_between_nodes.items()):  # Cast to list to copy items
                if uncertain_relation not in uncertain_to_exact_relations:
                    continue

                # Greedily assign the uncertain count to exact relations the algorithm produced.
                for exact_relation in uncertain_to_exact_relations[uncertain_relation]:
                    available_count = self._algorithm_relation_counts[(id1, id2)][exact_relation]
                    assign_count = min(count, available_count)
                    self._published_relation_counts[(id1, id2)][exact_relation] += assign_count
                    self._published_relation_counts[(id1, id2)][uncertain_relation] -= assign_count

                    count -= assign_count
                    if count == 0:
                        break

    def get_metrics(self) -> dict[str, float]:
        """Return all comparison metrics keyed by human-readable metric name."""
        metrics: dict[str, float] = dict()
        pairwise_relation_accuracy, relation_precision, relation_recall, relation_f1 = (
            self._calculate_relation_metrics()
        )
        pairwise_degree_accuracy, degree_precision, degree_recall, degree_f1 = self._calculate_degree_metrics()

        metrics["Pairwise Relation Accuracy"] = pairwise_relation_accuracy
        metrics["Relation Precision"] = relation_precision
        metrics["Relation Recall"] = relation_recall
        metrics["Relation F1"] = relation_f1
        metrics["Pairwise Degree Accuracy"] = pairwise_degree_accuracy
        metrics["Degree Precision"] = degree_precision
        metrics["Degree Recall"] = degree_recall
        metrics["Degree F1"] = degree_f1
        metrics["Connectivity R-squared"] = self._calculate_connectivity_r_squared()
        metrics["Kinship Inference Errors"] = self._calculate_kinship_inference_errors()
        return metrics

    @staticmethod
    def _calculate_tp_fp_fn(
        published_counts: defaultdict[str, int], algorithm_counts: defaultdict[str, int], nodes: tuple[str, str]
    ) -> tuple[int, int, int]:
        """
        Compare two relation-count multisets for one node pair and return (tp, fp, fn).

        Fix: the parameter annotations were `defaultdict(int)` — a call that instantiated
        defaultdicts at definition time rather than naming a type.
        """
        tp = 0  # True positives
        fp = 0  # False positives
        fn = 0  # False negatives
        relations = published_counts.keys() | algorithm_counts.keys()
        for relation in relations:
            true_count = published_counts[relation]
            algorithm_count = algorithm_counts[relation]

            if true_count == algorithm_count:
                tp += true_count
            elif true_count > algorithm_count:
                tp += algorithm_count
                fn += true_count - algorithm_count
                logger.info(f"False Negative: {nodes[0]} - {nodes[1]}: {relation} ({true_count} > {algorithm_count})")
            else:
                tp += true_count
                fp += algorithm_count - true_count
                logger.info(f"False Positive: {nodes[0]} - {nodes[1]}: {relation} ({true_count} < {algorithm_count})")
        return tp, fp, fn

    def _calculate_relation_metrics(self) -> tuple[float, float, float, float]:
        """Return (pairwise accuracy, precision, recall, F1) over exact relation labels."""
        correct_node_pairs: int = 0
        total_node_pairs: int = 0
        relation_tp: int = 0
        relation_fp: int = 0
        relation_fn: int = 0

        nodes = [node for node in self._algorithm_pedigree.node_to_data if not node.isnumeric()]
        for id1, id2 in combinations(sorted(nodes), 2):
            published_relations_between_nodes = self._published_relation_counts[(id1, id2)]
            algorithm_relations_between_nodes = self._algorithm_relation_counts[(id1, id2)]

            if published_relations_between_nodes == algorithm_relations_between_nodes:
                correct_node_pairs += 1
            total_node_pairs += 1

            tp, fp, fn = self._calculate_tp_fp_fn(
                published_relations_between_nodes, algorithm_relations_between_nodes, (id1, id2)
            )
            relation_tp += tp
            relation_fp += fp
            relation_fn += fn

        # Guard all divisions; the original also computed an unguarded F1 first (dead code
        # that could raise ZeroDivisionError) before immediately recomputing the guarded form.
        pairwise_relation_accuracy = correct_node_pairs / total_node_pairs if total_node_pairs else 0.0
        relation_precision = relation_tp / (relation_tp + relation_fp) if relation_tp + relation_fp else 0.0
        relation_recall = relation_tp / (relation_tp + relation_fn) if relation_tp + relation_fn else 0.0
        relation_f1 = (
            (2 * relation_precision * relation_recall) / (relation_precision + relation_recall)
            if relation_precision + relation_recall > 0
            else 0
        )
        return pairwise_relation_accuracy, relation_precision, relation_recall, relation_f1

    def _calculate_degree_metrics(self) -> tuple[float, float, float, float]:
        """Return (pairwise accuracy, precision, recall, F1) over kinship degrees ("1"/"2")."""
        correct_node_pairs: int = 0
        total_node_pairs: int = 0
        degree_tp: int = 0
        degree_fp: int = 0
        degree_fn: int = 0

        nodes = [node for node in self._algorithm_pedigree.node_to_data if not node.isnumeric()]
        for id1, id2 in combinations(sorted(nodes), 2):
            published_relations_between_nodes = self._published_relation_counts[(id1, id2)]
            algorithm_relations_between_nodes = self._algorithm_relation_counts[(id1, id2)]

            # Collapse exact relations down to their kinship degree before comparing.
            published_degrees_between_nodes = defaultdict(int)
            algorithm_degrees_between_nodes = defaultdict(int)
            for relation in ["parent-child", "child-parent", "siblings"]:
                published_degrees_between_nodes["1"] += published_relations_between_nodes[relation]
                algorithm_degrees_between_nodes["1"] += algorithm_relations_between_nodes[relation]

            for relation in [
                "maternal aunt/uncle-nephew/niece",
                "paternal aunt/uncle-nephew/niece",
                "maternal nephew/niece-aunt/uncle",
                "paternal nephew/niece-aunt/uncle",
                "maternal grandparent-grandchild",
                "paternal grandparent-grandchild",
                "maternal grandchild-grandparent",
                "paternal grandchild-grandparent",
                "maternal half-siblings",
                "paternal half-siblings",
            ]:
                published_degrees_between_nodes["2"] += published_relations_between_nodes[relation]
                algorithm_degrees_between_nodes["2"] += algorithm_relations_between_nodes[relation]

            if published_degrees_between_nodes == algorithm_degrees_between_nodes:
                correct_node_pairs += 1
            total_node_pairs += 1

            tp, fp, fn = self._calculate_tp_fp_fn(
                published_degrees_between_nodes, algorithm_degrees_between_nodes, (id1, id2)
            )
            degree_tp += tp
            degree_fp += fp
            degree_fn += fn

        # Guard divisions against empty pedigrees / zero totals.
        pairwise_degree_accuracy = correct_node_pairs / total_node_pairs if total_node_pairs else 0.0
        degree_precision = degree_tp / (degree_tp + degree_fp) if degree_tp + degree_fp else 0.0
        degree_recall = degree_tp / (degree_tp + degree_fn) if degree_tp + degree_fn else 0.0
        degree_f1 = (
            (2 * degree_precision * degree_recall) / (degree_precision + degree_recall)
            if degree_precision + degree_recall > 0
            else 0
        )
        return pairwise_degree_accuracy, degree_precision, degree_recall, degree_f1

    def _calculate_connectivity_r_squared(self) -> float:
        """
        R-squared between per-node connectivity (number of relations each node participates
        in) in the published vs. algorithm pedigrees.

        Fix: the counters were previously keyed by relation label but read back by node ID,
        so every connectivity value was 0; connectivity is now accumulated per node.
        """
        published_connectivity: defaultdict[str, int] = defaultdict(int)
        algorithm_connectivity: defaultdict[str, int] = defaultdict(int)

        nodes = [node for node in self._algorithm_pedigree.node_to_data if not node.isnumeric()]
        for node1, node2 in combinations(sorted(nodes), 2):
            # Each relation between the pair contributes to both endpoints' connectivity.
            for count in self._published_relation_counts[(node1, node2)].values():
                published_connectivity[node1] += count
                published_connectivity[node2] += count
            for count in self._algorithm_relation_counts[(node1, node2)].values():
                algorithm_connectivity[node1] += count
                algorithm_connectivity[node2] += count

        published_connectivities: list[int] = [published_connectivity[node] for node in nodes]
        algorithm_connectivities: list[int] = [algorithm_connectivity[node] for node in nodes]
        return r2_score(published_connectivities, algorithm_connectivities)

    def _calculate_kinship_inference_errors(self) -> int:
        """
        Calculate the number of node pairs that share a different inferred kinship degree than in the published pedigree
        or share a relation constraint not consistent with the published pedigree.
        """
        published_exact_relations = pd.read_csv(
            self._published_relations_path, dtype=str, comment="#", keep_default_na=False
        )
        inferred_relations = pd.read_csv(self._algorithm_relations_path, dtype=str, comment="#", keep_default_na=False)

        first_degree_relations = {"parent-child", "child-parent", "siblings", "1"}
        second_degree_relations = {
            "maternal aunt/uncle-nephew/niece",
            "paternal aunt/uncle-nephew/niece",
            "maternal nephew/niece-aunt/uncle",
            "paternal nephew/niece-aunt/uncle",
            "maternal grandparent-grandchild",
            "paternal grandparent-grandchild",
            "maternal grandchild-grandparent",
            "paternal grandchild-grandparent",
            "maternal half-siblings",
            "paternal half-siblings",
            "2",
        }

        pair_to_published_degree: dict[tuple[str, str], str] = {}
        for id1, id2, relation in published_exact_relations.itertuples(index=False):
            # Fix: `degree` must be reset every row; previously an unrecognized relation
            # inherited the previous row's degree (or raised NameError on the first row).
            if relation in first_degree_relations:
                degree = "1"
            elif relation in second_degree_relations:
                degree = "2"
            else:
                degree = None

            if degree:
                pair_to_published_degree[tuple(sorted((id1, id2)))] = degree

        pair_to_inferred_degree: dict[tuple[str, str], str] = {}
        pair_to_inferred_constraints: dict[tuple[str, str], set[str]] = {}
        for id1, id2, degree, constraints in inferred_relations.itertuples(index=False):
            if degree == "1" or degree == "2":
                pair_to_inferred_degree[tuple(sorted((id1, id2)))] = degree
            if constraints:
                pair_to_inferred_constraints[tuple(sorted((id1, id2)))] = set(constraints.split(";"))

        # Compare the degree dicts
        kinship_inference_errors = 0
        for pair, algorithm_degree in pair_to_inferred_degree.items():
            if pair not in pair_to_published_degree:
                kinship_inference_errors += 1
                continue

            published_degree = pair_to_published_degree[pair]
            if algorithm_degree != published_degree:
                kinship_inference_errors += 1
                continue

        for pair in pair_to_published_degree:
            if pair not in pair_to_inferred_degree:
                kinship_inference_errors += 1

        # Count first-degree exact relation inference errors
        for id1, id2, relation in published_exact_relations.itertuples(index=False):
            if relation == "1" or relation == "2":  # Skip "dotted lines"
                continue

            pair = tuple(sorted((id1, id2)))
            if pair in pair_to_inferred_constraints and relation not in pair_to_inferred_constraints[pair]:
                assert relation in first_degree_relations
                kinship_inference_errors += 1
        return kinship_inference_errors
@@ -0,0 +1,16 @@
1
+ @article{Blöcher2023,
2
+ author = {Blöcher, Jens and Brami, Maxime and Feinauer, Isabelle Sofie
3
+ and Stolarczyk, Eliza and Diekmann, Yoan and Vetterdietz, Lisa and
4
+ Karapetian, Marina and Winkelbach, Laura and Kokot, Vanessa and
5
+ Vallini, Leonardo and Stobbe, Astrid and Haak, Wolfgang and
6
+ Papageorgopoulou, Christina and Krause, Rüdiger and Sharapova,
7
+ Svetlana and Burger, Joachim},
8
+ publisher = {Proceedings of the National Academy of Sciences},
9
+ title = {Descent, Marriage, and Residence Practices of a
10
+ 3,800-Year-Old Pastoral Community in {Central} {Eurasia}},
11
+ journal = {Proceedings of the National Academy of Sciences},
12
+ volume = {120},
13
+ number = {36},
14
+ pages = {e2303574120},
15
+ url = {https://doi.org/10.1073/pnas.2303574120}
16
+ }