repare-0.0.2.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of repare has been flagged as possibly problematic.
- repare-0.0.2/.gitignore +181 -0
- repare-0.0.2/.pre-commit-config.yaml +9 -0
- repare-0.0.2/LICENSE +7 -0
- repare-0.0.2/PKG-INFO +35 -0
- repare-0.0.2/README.md +15 -0
- repare-0.0.2/benchmarks/published/comparator/__init__.py +0 -0
- repare-0.0.2/benchmarks/published/comparator/relation_comparison.py +342 -0
- repare-0.0.2/benchmarks/published/data/blocher/citation.bib +16 -0
- repare-0.0.2/benchmarks/published/data/blocher/inferred_relations_KIN.csv +115 -0
- repare-0.0.2/benchmarks/published/data/blocher/inferred_relations_custom.csv +115 -0
- repare-0.0.2/benchmarks/published/data/blocher/nodes.csv +34 -0
- repare-0.0.2/benchmarks/published/data/blocher/published_exact_relations.csv +83 -0
- repare-0.0.2/benchmarks/published/data/fowler/citation.bib +15 -0
- repare-0.0.2/benchmarks/published/data/fowler/inferred_relations_coeffs.csv +124 -0
- repare-0.0.2/benchmarks/published/data/fowler/inferred_relations_custom.csv +126 -0
- repare-0.0.2/benchmarks/published/data/fowler/nodes.csv +37 -0
- repare-0.0.2/benchmarks/published/data/fowler/published_exact_relations.csv +71 -0
- repare-0.0.2/benchmarks/published/data/rivollat/citation.bib +34 -0
- repare-0.0.2/benchmarks/published/data/rivollat/inferred_relations_READv2.csv +423 -0
- repare-0.0.2/benchmarks/published/data/rivollat/nodes.csv +96 -0
- repare-0.0.2/benchmarks/published/data/rivollat/published_exact_relations.csv +266 -0
- repare-0.0.2/benchmarks/published/run_comparisons.py +26 -0
- repare-0.0.2/benchmarks/simulations/plot_data.py +99 -0
- repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.0_error_rate_scale=0.0.csv +101 -0
- repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.0_error_rate_scale=0.5.csv +101 -0
- repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.0_error_rate_scale=1.0.csv +101 -0
- repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.0_error_rate_scale=2.csv +101 -0
- repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.2_error_rate_scale=0.0.csv +101 -0
- repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.2_error_rate_scale=0.5.csv +101 -0
- repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.2_error_rate_scale=1.0.csv +101 -0
- repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.2_error_rate_scale=2.csv +101 -0
- repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.4_error_rate_scale=0.0.csv +101 -0
- repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.4_error_rate_scale=0.5.csv +101 -0
- repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.4_error_rate_scale=1.0.csv +101 -0
- repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.4_error_rate_scale=2.csv +101 -0
- repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.6_error_rate_scale=0.0.csv +101 -0
- repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.6_error_rate_scale=0.5.csv +101 -0
- repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.6_error_rate_scale=1.0.csv +101 -0
- repare-0.0.2/benchmarks/simulations/results/data/p_mask_node=0.6_error_rate_scale=2.csv +101 -0
- repare-0.0.2/benchmarks/simulations/results/plots/degree_f1_heatmap.png +0 -0
- repare-0.0.2/benchmarks/simulations/results/plots/pedigree_summary_statistics.png +0 -0
- repare-0.0.2/benchmarks/simulations/results/plots/relation_f1_heatmap.png +0 -0
- repare-0.0.2/benchmarks/simulations/run_simulations.py +52 -0
- repare-0.0.2/benchmarks/simulations/simulator/__init__.py +0 -0
- repare-0.0.2/benchmarks/simulations/simulator/simulated_pedigree.py +612 -0
- repare-0.0.2/pyproject.toml +50 -0
- repare-0.0.2/setup.cfg +4 -0
- repare-0.0.2/src/repare/__init__.py +4 -0
- repare-0.0.2/src/repare/main.py +64 -0
- repare-0.0.2/src/repare/pedigree.py +1214 -0
- repare-0.0.2/src/repare/pedigree_reconstructor.py +874 -0
- repare-0.0.2/src/repare.egg-info/PKG-INFO +35 -0
- repare-0.0.2/src/repare.egg-info/SOURCES.txt +55 -0
- repare-0.0.2/src/repare.egg-info/dependency_links.txt +1 -0
- repare-0.0.2/src/repare.egg-info/entry_points.txt +2 -0
- repare-0.0.2/src/repare.egg-info/requires.txt +11 -0
- repare-0.0.2/src/repare.egg-info/top_level.txt +1 -0
repare-0.0.2/.gitignore
ADDED
@@ -0,0 +1,181 @@
+# Data directories
+*raw_data
+*sandbox_data
+
+# .DS_Store files
+.DS_Store
+._.DS_Store
+**/.DS_Store
+**/._.DS_Store
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# UV
+# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+#uv.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# PyPI configuration file
+.pypirc
repare-0.0.2/LICENSE
ADDED
@@ -0,0 +1,7 @@
+Copyright 2024 (c) Edward Huang
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
repare-0.0.2/PKG-INFO
ADDED
@@ -0,0 +1,35 @@
+Metadata-Version: 2.4
+Name: repare
+Version: 0.0.2
+Summary: Reconstruct ancient pedigrees.
+Author-email: Edward Huang <edwardhuang02@gmail.com>
+License-Expression: MIT
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: matplotlib
+Requires-Dist: networkx
+Requires-Dist: pandas
+Requires-Dist: tqdm
+Provides-Extra: benchmark
+Requires-Dist: scikit-learn; extra == "benchmark"
+Requires-Dist: seaborn; extra == "benchmark"
+Provides-Extra: plot
+Requires-Dist: pygraphviz; extra == "plot"
+Dynamic: license-file
+
+**repare** is a Python package for (ancient) pedigree reconstruction.
+
+## Installation
+
+### Recommended
+```
+conda create -n "repare" -c conda-forge python=3.13 pygraphviz
+conda activate repare
+pip install repare
+```
+repare uses PyGraphviz to plot reconstructed pedigrees. Since PyGraphviz relies on Graphviz, which cannot be installed using `pip`, we recommend installing repare and its dependencies in a fresh conda environment.
+
+If you don't need to plot reconstructed pedigrees, you can install repare directly with `pip install repare`. If you need to plot reconstructed pedigrees and have your own Graphviz installation, you can install repare and PyGraphviz with `pip install repare[plot]`.
+
+To install conda, see [this page](https://www.anaconda.com/docs/getting-started/miniconda/install). To install PyGraphviz and Graphviz yourself, see [this page](https://pygraphviz.github.io/documentation/stable/install.html).
repare-0.0.2/README.md
ADDED
@@ -0,0 +1,15 @@
+**repare** is a Python package for (ancient) pedigree reconstruction.
+
+## Installation
+
+### Recommended
+```
+conda create -n "repare" -c conda-forge python=3.13 pygraphviz
+conda activate repare
+pip install repare
+```
+repare uses PyGraphviz to plot reconstructed pedigrees. Since PyGraphviz relies on Graphviz, which cannot be installed using `pip`, we recommend installing repare and its dependencies in a fresh conda environment.
+
+If you don't need to plot reconstructed pedigrees, you can install repare directly with `pip install repare`. If you need to plot reconstructed pedigrees and have your own Graphviz installation, you can install repare and PyGraphviz with `pip install repare[plot]`.
+
+To install conda, see [this page](https://www.anaconda.com/docs/getting-started/miniconda/install). To install PyGraphviz and Graphviz yourself, see [this page](https://pygraphviz.github.io/documentation/stable/install.html).
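For orientation, the diff below shows that the package's own benchmarks drive reconstruction through `PedigreeReconstructor`. A minimal usage sketch along the same lines, mirroring the call in `relation_comparison.py` below; the input paths here are hypothetical:

```python
# Minimal sketch of driving repare, based on how the benchmark comparator
# below invokes it; "relations.csv" and "nodes.csv" are hypothetical paths.
from repare.pedigree_reconstructor import PedigreeReconstructor

reconstructor = PedigreeReconstructor(
    "relations.csv",               # pairwise kinship calls between sampled individuals
    "nodes.csv",                   # the sampled individuals themselves
    outputs_dir="outputs",         # where results (and plots, if enabled) are written
    max_candidate_pedigrees=1000,  # search breadth used by the benchmarks
    plot=False,                    # True requires PyGraphviz/Graphviz
)
best_pedigree = reconstructor.find_best_pedigree()
```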
repare-0.0.2/benchmarks/published/comparator/__init__.py
ADDED
File without changes

repare-0.0.2/benchmarks/published/comparator/relation_comparison.py
ADDED
@@ -0,0 +1,342 @@
+import logging
+import tempfile
+from collections import defaultdict
+from itertools import combinations
+
+import pandas as pd
+from sklearn.metrics import r2_score
+
+from repare.pedigree import Pedigree
+from repare.pedigree_reconstructor import PedigreeReconstructor
+
+logger = logging.getLogger(__name__)
+
+
+class RelationComparison:
+    """
+    Generates an algorithm-reconstructed pedigree and compares it to a published/ground-truth pedigree.
+    """
+
+    def __init__(self, published_relations_path: str, algorithm_nodes_path: str, algorithm_relations_path: str) -> None:
+        self._published_relations_path = published_relations_path
+        self._algorithm_nodes_path = algorithm_nodes_path
+        self._algorithm_relations_path = algorithm_relations_path
+
+        self._published_relation_counts: defaultdict[tuple[str, str], defaultdict[str, int]] = (
+            self._load_published_relations(self._published_relations_path)
+        )
+        self._algorithm_relation_counts: defaultdict[tuple[str, str], defaultdict[str, int]] = (
+            self._load_algorithm_relations(self._algorithm_nodes_path, self._algorithm_relations_path)
+        )
+        self._fill_uncertain_relations()
+
+    def _load_published_relations(self, path: str) -> defaultdict[tuple[str, str], defaultdict[str, int]]:
+        published_relations_df = pd.read_csv(path, comment="#")
+        relation_counts: defaultdict[tuple[str, str], defaultdict[str, int]] = defaultdict(lambda: defaultdict(int))
+        for id1, id2, relation in published_relations_df.itertuples(index=False):
+            id1, id2, relation = self._sort_relation(id1, id2, relation)
+            relation_counts[(id1, id2)][relation] += 1
+        return relation_counts
+
+    def _load_algorithm_relations(
+        self, nodes_path: str, relations_path: str
+    ) -> defaultdict[tuple[str, str], defaultdict[str, int]]:
+        self._algorithm_pedigree: Pedigree = self._run_algorithm(nodes_path, relations_path)
+        algorithm_relations: defaultdict[tuple[str, str], defaultdict[str, int]] = defaultdict(lambda: defaultdict(int))
+
+        for id1, id2 in combinations(self._algorithm_pedigree.node_to_data, 2):
+            if not id1.isnumeric() and not id2.isnumeric():  # Skip placeholder nodes
+                relations_between_nodes = self._algorithm_pedigree.get_relations_between_nodes(
+                    id1, id2, include_maternal_paternal=True
+                )
+                for relation, count in relations_between_nodes.items():
+                    # Use fresh names here: reassigning id1/id2 in place would flip the
+                    # orientation that later relations in this inner loop are expressed in
+                    key1, key2, sorted_relation = self._sort_relation(id1, id2, relation)
+                    algorithm_relations[(key1, key2)][sorted_relation] += count
+        return algorithm_relations
+
+    @staticmethod
+    def _sort_relation(id1: str, id2: str, relation: str) -> tuple[str, str, str]:
+        flipped_relations = {
+            "parent-child": "child-parent",
+            "child-parent": "parent-child",
+            "siblings": "siblings",  # Symmetric
+            "maternal aunt/uncle-nephew/niece": "maternal nephew/niece-aunt/uncle",
+            "maternal nephew/niece-aunt/uncle": "maternal aunt/uncle-nephew/niece",
+            "paternal aunt/uncle-nephew/niece": "paternal nephew/niece-aunt/uncle",
+            "paternal nephew/niece-aunt/uncle": "paternal aunt/uncle-nephew/niece",
+            "maternal grandparent-grandchild": "maternal grandchild-grandparent",
+            "maternal grandchild-grandparent": "maternal grandparent-grandchild",
+            "paternal grandparent-grandchild": "paternal grandchild-grandparent",
+            "paternal grandchild-grandparent": "paternal grandparent-grandchild",
+            "maternal half-siblings": "maternal half-siblings",  # Symmetric
+            "paternal half-siblings": "paternal half-siblings",  # Symmetric
+            "1": "1",  # Symmetric
+            "2": "2",  # Symmetric
+        }
+        if id2 < id1:
+            return id2, id1, flipped_relations[relation]
+        else:
+            return id1, id2, relation
+
+    @staticmethod
+    def _run_algorithm(nodes_path: str, relations_path: str) -> Pedigree:
+        with tempfile.TemporaryDirectory() as temp_dir:
+            pedigree_reconstructor = PedigreeReconstructor(
+                relations_path, nodes_path, outputs_dir=temp_dir, max_candidate_pedigrees=1000, plot=False
+            )
+            return pedigree_reconstructor.find_best_pedigree()
+
+    def _fill_uncertain_relations(self) -> None:
+        uncertain_to_exact_relations = {
+            "1": ["parent-child", "child-parent", "siblings"],
+            "2": [
+                "maternal aunt/uncle-nephew/niece",
+                "maternal nephew/niece-aunt/uncle",
+                "paternal aunt/uncle-nephew/niece",
+                "paternal nephew/niece-aunt/uncle",
+                "maternal grandparent-grandchild",
+                "maternal grandchild-grandparent",
+                "paternal grandparent-grandchild",
+                "paternal grandchild-grandparent",
+                "maternal half-siblings",
+                "paternal half-siblings",
+            ],
+        }
+
+        for (id1, id2), relation_counts_between_nodes in self._published_relation_counts.items():
+            for uncertain_relation, count in list(relation_counts_between_nodes.items()):  # Cast to list to copy items
+                if uncertain_relation not in uncertain_to_exact_relations:
+                    continue
+
+                for exact_relation in uncertain_to_exact_relations[uncertain_relation]:
+                    available_count = self._algorithm_relation_counts[(id1, id2)][exact_relation]
+                    assign_count = min(count, available_count)
+                    self._published_relation_counts[(id1, id2)][exact_relation] += assign_count
+                    self._published_relation_counts[(id1, id2)][uncertain_relation] -= assign_count
+
+                    count -= assign_count
+                    if count == 0:
+                        break
+
+    def get_metrics(self) -> dict[str, float]:
+        metrics: dict[str, float] = dict()
+        pairwise_relation_accuracy, relation_precision, relation_recall, relation_f1 = (
+            self._calculate_relation_metrics()
+        )
+        pairwise_degree_accuracy, degree_precision, degree_recall, degree_f1 = self._calculate_degree_metrics()
+
+        metrics["Pairwise Relation Accuracy"] = pairwise_relation_accuracy
+        metrics["Relation Precision"] = relation_precision
+        metrics["Relation Recall"] = relation_recall
+        metrics["Relation F1"] = relation_f1
+        metrics["Pairwise Degree Accuracy"] = pairwise_degree_accuracy
+        metrics["Degree Precision"] = degree_precision
+        metrics["Degree Recall"] = degree_recall
+        metrics["Degree F1"] = degree_f1
+        metrics["Connectivity R-squared"] = self._calculate_connectivity_r_squared()
+        metrics["Kinship Inference Errors"] = self._calculate_kinship_inference_errors()
+        return metrics
+
+    @staticmethod
+    def _calculate_tp_fp_fn(
+        published_counts: defaultdict[str, int], algorithm_counts: defaultdict[str, int], nodes: tuple[str, str]
+    ) -> tuple[int, int, int]:
+        tp = 0  # True positives
+        fp = 0  # False positives
+        fn = 0  # False negatives
+        relations = published_counts.keys() | algorithm_counts.keys()
+        for relation in relations:
+            true_count = published_counts[relation]
+            algorithm_count = algorithm_counts[relation]
+
+            if true_count == algorithm_count:
+                tp += true_count
+            elif true_count > algorithm_count:
+                tp += algorithm_count
+                fn += true_count - algorithm_count
+                logger.info(f"False Negative: {nodes[0]} - {nodes[1]}: {relation} ({true_count} > {algorithm_count})")
+            else:
+                tp += true_count
+                fp += algorithm_count - true_count
+                logger.info(f"False Positive: {nodes[0]} - {nodes[1]}: {relation} ({true_count} < {algorithm_count})")
+        return tp, fp, fn
+
+    def _calculate_relation_metrics(self) -> tuple[float, float, float, float]:
+        correct_node_pairs: int = 0
+        total_node_pairs: int = 0
+        relation_tp: int = 0
+        relation_fp: int = 0
+        relation_fn: int = 0
+
+        nodes = [node for node in self._algorithm_pedigree.node_to_data if not node.isnumeric()]
+        for id1, id2 in combinations(sorted(nodes), 2):
+            published_relations_between_nodes = self._published_relation_counts[(id1, id2)]
+            algorithm_relations_between_nodes = self._algorithm_relation_counts[(id1, id2)]
+
+            if published_relations_between_nodes == algorithm_relations_between_nodes:
+                correct_node_pairs += 1
+            total_node_pairs += 1
+
+            tp, fp, fn = self._calculate_tp_fp_fn(
+                published_relations_between_nodes, algorithm_relations_between_nodes, (id1, id2)
+            )
+            relation_tp += tp
+            relation_fp += fp
+            relation_fn += fn
+
+        pairwise_relation_accuracy = correct_node_pairs / total_node_pairs
+        relation_precision = relation_tp / (relation_tp + relation_fp)
+        relation_recall = relation_tp / (relation_tp + relation_fn)
+        relation_f1 = (
+            (2 * relation_precision * relation_recall) / (relation_precision + relation_recall)
+            if relation_precision + relation_recall > 0
+            else 0
+        )
+        return pairwise_relation_accuracy, relation_precision, relation_recall, relation_f1
+
+    def _calculate_degree_metrics(self) -> tuple[float, float, float, float]:
+        correct_node_pairs: int = 0
+        total_node_pairs: int = 0
+        degree_tp: int = 0
+        degree_fp: int = 0
+        degree_fn: int = 0
+
+        nodes = [node for node in self._algorithm_pedigree.node_to_data if not node.isnumeric()]
+        for id1, id2 in combinations(sorted(nodes), 2):
+            published_relations_between_nodes = self._published_relation_counts[(id1, id2)]
+            algorithm_relations_between_nodes = self._algorithm_relation_counts[(id1, id2)]
+
+            published_degrees_between_nodes = defaultdict(int)
+            algorithm_degrees_between_nodes = defaultdict(int)
+            for relation in ["parent-child", "child-parent", "siblings"]:
+                published_degrees_between_nodes["1"] += published_relations_between_nodes[relation]
+                algorithm_degrees_between_nodes["1"] += algorithm_relations_between_nodes[relation]
+
+            for relation in [
+                "maternal aunt/uncle-nephew/niece",
+                "paternal aunt/uncle-nephew/niece",
+                "maternal nephew/niece-aunt/uncle",
+                "paternal nephew/niece-aunt/uncle",
+                "maternal grandparent-grandchild",
+                "paternal grandparent-grandchild",
+                "maternal grandchild-grandparent",
+                "paternal grandchild-grandparent",
+                "maternal half-siblings",
+                "paternal half-siblings",
+            ]:
+                published_degrees_between_nodes["2"] += published_relations_between_nodes[relation]
+                algorithm_degrees_between_nodes["2"] += algorithm_relations_between_nodes[relation]
+
+            if published_degrees_between_nodes == algorithm_degrees_between_nodes:
+                correct_node_pairs += 1
+            total_node_pairs += 1
+
+            tp, fp, fn = self._calculate_tp_fp_fn(
+                published_degrees_between_nodes, algorithm_degrees_between_nodes, (id1, id2)
+            )
+            degree_tp += tp
+            degree_fp += fp
+            degree_fn += fn
+
+        pairwise_degree_accuracy = correct_node_pairs / total_node_pairs
+        degree_precision = degree_tp / (degree_tp + degree_fp)
+        degree_recall = degree_tp / (degree_tp + degree_fn)
+        degree_f1 = (
+            (2 * degree_precision * degree_recall) / (degree_precision + degree_recall)
+            if degree_precision + degree_recall > 0
+            else 0
+        )
+        return pairwise_degree_accuracy, degree_precision, degree_recall, degree_f1
+
+    def _calculate_connectivity_r_squared(self) -> float:
+        # Count how many relations each node participates in (its connectivity);
+        # counters are keyed by node ID so they can be read back per node below
+        published_connectivity_counter: defaultdict[str, int] = defaultdict(int)
+        algorithm_connectivity_counter: defaultdict[str, int] = defaultdict(int)
+
+        nodes = [node for node in self._algorithm_pedigree.node_to_data if not node.isnumeric()]
+        for node1, node2 in combinations(sorted(nodes), 2):
+            for count in self._published_relation_counts[(node1, node2)].values():
+                published_connectivity_counter[node1] += count
+                published_connectivity_counter[node2] += count
+
+            for count in self._algorithm_relation_counts[(node1, node2)].values():
+                algorithm_connectivity_counter[node1] += count
+                algorithm_connectivity_counter[node2] += count
+
+        published_connectivities: list[int] = []
+        algorithm_connectivities: list[int] = []
+        for node in nodes:
+            published_connectivities.append(published_connectivity_counter[node])
+            algorithm_connectivities.append(algorithm_connectivity_counter[node])
+        return r2_score(published_connectivities, algorithm_connectivities)
+
+    def _calculate_kinship_inference_errors(self) -> int:
+        """
+        Calculate the number of node pairs that share a different inferred kinship degree than in the published
+        pedigree, or that share a relation constraint not consistent with the published pedigree.
+        """
+        published_exact_relations = pd.read_csv(
+            self._published_relations_path, dtype=str, comment="#", keep_default_na=False
+        )
+        inferred_relations = pd.read_csv(self._algorithm_relations_path, dtype=str, comment="#", keep_default_na=False)
+
+        first_degree_relations = {"parent-child", "child-parent", "siblings", "1"}
+        second_degree_relations = {
+            "maternal aunt/uncle-nephew/niece",
+            "paternal aunt/uncle-nephew/niece",
+            "maternal nephew/niece-aunt/uncle",
+            "paternal nephew/niece-aunt/uncle",
+            "maternal grandparent-grandchild",
+            "paternal grandparent-grandchild",
+            "maternal grandchild-grandparent",
+            "paternal grandchild-grandparent",
+            "maternal half-siblings",
+            "paternal half-siblings",
+            "2",
+        }
+
+        pair_to_published_degree = {}
+        for id1, id2, relation in published_exact_relations.itertuples(index=False):
+            if relation in first_degree_relations:
+                degree = "1"
+            elif relation in second_degree_relations:
+                degree = "2"
+            else:
+                degree = None  # Guard against carrying over the previous row's degree
+
+            if degree:
+                pair_to_published_degree[tuple(sorted((id1, id2)))] = degree
+
+        pair_to_inferred_degree = {}
+        pair_to_inferred_constraints = {}
+        for id1, id2, degree, constraints in inferred_relations.itertuples(index=False):
+            if degree == "1" or degree == "2":
+                pair_to_inferred_degree[tuple(sorted((id1, id2)))] = degree
+            if constraints:
+                pair_to_inferred_constraints[tuple(sorted((id1, id2)))] = set(constraints.split(";"))
+
+        # Compare the degree dicts
+        kinship_inference_errors = 0
+        for pair, algorithm_degree in pair_to_inferred_degree.items():
+            if pair not in pair_to_published_degree:
+                kinship_inference_errors += 1
+                continue
+
+            published_degree = pair_to_published_degree[pair]
+            if algorithm_degree != published_degree:
+                kinship_inference_errors += 1
+
+        for pair in pair_to_published_degree:
+            if pair not in pair_to_inferred_degree:
+                kinship_inference_errors += 1
+
+        # Count first-degree exact relation inference errors
+        for id1, id2, relation in published_exact_relations.itertuples(index=False):
+            if relation == "1" or relation == "2":  # Skip "dotted lines"
+                continue
+
+            pair = tuple(sorted((id1, id2)))
+            if pair in pair_to_inferred_constraints and relation not in pair_to_inferred_constraints[pair]:
+                assert relation in first_degree_relations
+                kinship_inference_errors += 1
+        return kinship_inference_errors
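A sketch of how this comparator might be invoked, assuming the `blocher` benchmark files from the file list above (`run_comparisons.py` presumably does something similar; the relative paths here are illustrative):

```python
# Hypothetical driver for RelationComparison using the blocher benchmark data;
# paths are relative to benchmarks/published/ and are assumptions, not taken
# from run_comparisons.py itself.
from comparator.relation_comparison import RelationComparison

comparison = RelationComparison(
    published_relations_path="data/blocher/published_exact_relations.csv",
    algorithm_nodes_path="data/blocher/nodes.csv",
    algorithm_relations_path="data/blocher/inferred_relations_KIN.csv",
)
# get_metrics() reports pairwise accuracy, precision/recall/F1 at the relation
# and degree level, connectivity R-squared, and kinship inference errors.
for metric, value in comparison.get_metrics().items():
    print(f"{metric}: {value}")
```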
repare-0.0.2/benchmarks/published/data/blocher/citation.bib
ADDED
@@ -0,0 +1,16 @@
+@article{Blöcher2023,
+    author = {Blöcher, Jens and Brami, Maxime and Feinauer, Isabelle Sofie
+        and Stolarczyk, Eliza and Diekmann, Yoan and Vetterdietz, Lisa and
+        Karapetian, Marina and Winkelbach, Laura and Kokot, Vanessa and
+        Vallini, Leonardo and Stobbe, Astrid and Haak, Wolfgang and
+        Papageorgopoulou, Christina and Krause, Rüdiger and Sharapova,
+        Svetlana and Burger, Joachim},
+    publisher = {Proceedings of the National Academy of Sciences},
+    title = {Descent, Marriage, and Residence Practices of a
+        3,800-Year-Old Pastoral Community in {Central} {Eurasia}},
+    journal = {Proceedings of the National Academy of Sciences},
+    volume = {120},
+    number = {36},
+    pages = {e2303574120},
+    url = {https://doi.org/10.1073/pnas.2303574120}
+}