ibdpainting 0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Tom Ellis
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,18 @@
1
+ Metadata-Version: 2.1
2
+ Name: ibdpainting
3
+ Version: 0.1
4
+ Summary: Identify parents of a crossed individual by comparing identity in windows across their genomes
5
+ Home-page: https://github.com/ellisztamas/ibdpainting
6
+ Author: Tom Ellis
7
+ Author-email: thomas.ellis@gmi.oeaw.ac.at
8
+ License: MIT
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE
11
+ Requires-Dist: numpy
12
+ Requires-Dist: pandas
13
+ Requires-Dist: plotly
14
+ Requires-Dist: h5py
15
+ Requires-Dist: scikit-allel
16
+
17
+ # ibdpainting
18
+ Identify parents of a crossed individual by comparing identity in windows across their genomes
@@ -0,0 +1,2 @@
1
+ # ibdpainting
2
+ Identify parents of a crossed individual by comparing identity in windows across their genomes
@@ -0,0 +1,8 @@
1
"""Top-level package for ibdpainting.

Re-exports the public class and functions defined in the submodules so that
they can be imported directly from ``ibdpainting``.
"""

__author__ = """Tom Ellis"""
__email__ = 'thomas.ellis@gmi.oeaw.ac.at'
__version__ = '0.1'

# A package star-importing from itself binds none of the submodule names, so
# the public API is imported explicitly instead.
# Import order matters: load_genomes.py does
# `from ibdpainting import geneticDistance` and ibd_table.py does
# `from ibdpainting import load_genomes`, so each of those names has to be
# bound on the package before the submodule that needs it is imported.
from ibdpainting.genetic_distance import geneticDistance
from ibdpainting.load_genomes import load_genotypes
from ibdpainting.ibs_scores import ibd_scores
from ibdpainting.plot_ibd_table import plot_ibd_table
from ibdpainting.ibd_table import ibd_table
@@ -0,0 +1,127 @@
1
+ import numpy as np
2
+ from warnings import warn
3
+ import numpy.ma as ma
4
+
5
class geneticDistance(object):
    """
    Compare genotype data between one input individual and a reference panel.

    Parameters
    ==========
    samples: array
        Vector of length m giving names for each sample. The first sample is
        the input individual; the remaining samples are the reference
        individuals it is compared to.
    chr: array
        Vector of length n giving chromosome labels for each SNP.
    pos: array
        Vector of length n giving SNP positions. These are compared directly
        with window boundaries in split_into_windows, so they need to be in
        the same units as the window size.
        NOTE(review): an earlier version of this docstring said positions are
        VCF row numbers rather than base-pair positions - confirm against the
        code that builds these objects.
    geno: array
        n x m x 2 array of genotype calls, where the axes index SNPs, samples
        and homologous chromosomes, in that order. Negative values are
        treated as missing data.

    Attributes
    ==========
    samples: array
        Vector of m sample names, stored as given.
    chr: array
        Vector of n chromosome labels, stored as given.
    pos: array
        Vector of n SNP positions, stored as given.
    geno: array
        n x m x 2 array of genotype data, stored as given.

    Methods
    =======
    split_into_windows
        Split a geneticDistance object into windows.
    pairwise_distance
        Calculate pairwise genetic distance between an input individual and all
        reference individuals.

    """
    def __init__(self, samples, chr, pos, geno):
        self.samples = samples
        self.chr = chr
        self.pos = pos
        self.geno = geno

    def split_into_windows(self, window_size: int) -> dict:
        """
        Split a geneticDistance object into windows.

        Splits the geneticDistance object into chromosomes, then into
        consecutive windows on each chromosome, starting from position zero.

        Parameters
        ==========
        window_size: int
            Window size, in the same units as `pos` (base pairs). Must be
            positive.

        Returns
        =======
        A dictionary of geneticDistance objects with an element for each window.
        Keys are in the form "Chr:start-stop", where a window holds the SNPs
        with start <= position < stop. Windows that contain no SNPs are still
        present, with empty `chr`, `pos` and `geno` arrays.

        Raises
        ======
        ValueError
            If `window_size` is not positive.
        """
        if window_size <= 0:
            raise ValueError("window_size must be a positive number of base pairs.")

        windows = {}

        for label in np.unique(self.chr):
            # Subset everything to this chromosome once, rather than once per window.
            chr_ix = self.chr == label
            chr_labels = self.chr[chr_ix]
            chr_pos = self.pos[chr_ix]
            chr_geno = self.geno[chr_ix]

            # Window starts run up to and including the largest position.
            # The +1 matters when the largest position is an exact multiple
            # of window_size: without it that SNP falls in no window.
            start_positions = np.arange(0, chr_pos.max() + 1, window_size)
            for start in start_positions:
                stop = start + window_size
                # Half-open interval so that neighbouring windows never share a SNP.
                window_ix = (chr_pos >= start) & (chr_pos < stop)
                window_name = str(label) + ":" + str(start) + "-" + str(stop)
                windows[window_name] = geneticDistance(
                    samples=self.samples,
                    chr=chr_labels[window_ix],
                    pos=chr_pos[window_ix],
                    geno=chr_geno[window_ix],
                )

        return windows

    def pairwise_distance(self, warn_about_missing_data=False):
        """
        Calculate pairwise genetic distance between an input individual and all
        reference individuals.

        The input individual is always the first in the list of samples. For
        each SNP the alleles of each sample are summed, and the distance to a
        reference sample is the absolute difference between the two sums,
        divided by two. These per-SNP values are averaged over SNPs. Negative
        genotype calls are masked, so they do not contribute to sums or means.

        Parameters
        ==========
        warn_about_missing_data: bool
            If True, issue a warning naming the reference samples for which no
            distance could be calculated.

        Returns
        =======
        Vector of distances with one element per reference sample. Elements
        are -9 where no distance could be calculated, which happens when there
        is no SNP at which both the input and the reference sample have data.
        """
        masked_geno = ma.masked_array(self.geno, self.geno < 0)

        # Sum over the two homologous chromosomes to get one value per SNP and sample.
        allele_counts = masked_geno.sum(2)
        # Column 0 is the input individual; broadcast it against every reference column.
        per_locus_difference = abs(allele_counts[:, [0]] - allele_counts[:, 1:]) / 2
        # Average over loci
        dxy = per_locus_difference.mean(0)

        # getmaskarray always gives a full boolean array, even if nothing is masked.
        missing = ma.getmaskarray(dxy)
        if warn_about_missing_data and missing.any():
            message = (
                "\n\n"
                "Pairwise distance could not be calculated for one or more comparisons,\n"
                "probably because there is missing data at all SNPs.\n"
                "The following samples in the reference panel are affected:\n"
                "{}\n\n"
            )
            warn(message.format(np.asarray(self.samples)[1:][missing]))

        # The values stored underneath masked entries are arbitrary filler, so
        # they are replaced with the -9 missing-data code rather than returned.
        return ma.filled(dxy, -9)
@@ -0,0 +1,47 @@
1
+ import pandas as pd
2
+
3
+ from ibdpainting import load_genomes
4
+
5
def ibd_table(input:str, reference:str, sample_name:str, window_size:int):
    """
    Compare allele sharing across the genome.

    Calculate genetic distance between a test individual and a panel of
    reference genomes, separately for each window along the genome.

    Parameters
    ==========
    input: str
        Path to a VCF file containing genotype data for one or more samples to
        test
    reference: str
        Path to an HDF5 file containing genotype data for a panel of reference
        individuals to compare the test individual against.
    sample_name: str
        Sample name for the individual to check.
        This must be present in the samples in the input VCF.
    window_size: int
        Window size in base pairs.

    Returns
    =======
    DataFrame with a row for each window in the genome. The first column,
    'window', holds the window label; the remaining columns are one per sample
    in the reference panel. Elements show genetic distance between the test
    individual and each reference individual in a single window.
    """
    # `load_genomes` is the imported module; the loader function is an
    # attribute of it and is not itself a name in this module.
    genetic_distance = load_genomes.load_genotypes(
        input=input,
        reference=reference,
        sample_name=sample_name,
    )
    # Divide the genome into windows
    distances_in_windows = genetic_distance.split_into_windows(window_size)

    # One row per window, one column per reference sample. The first sample
    # is the test individual itself, so it is left out of the column names.
    distance_array = pd.DataFrame(
        [window.pairwise_distance() for window in distances_in_windows.values()],
        columns=genetic_distance.samples[1:],
    )
    # Dicts keep insertion order, so the labels line up with the rows above.
    distance_array.insert(0, 'window', list(distances_in_windows))

    return distance_array
@@ -0,0 +1,36 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+
4
def ibd_scores(ibd_table):
    """
    Mean IBD across the genome.

    Calculate mean genetic distance from a test individual to each of a panel of
    reference samples, ignoring windows where there was only missing data.

    Parameters
    ==========
    ibd_table: pd.DataFrame
        DataFrame with a row for each window in the genome and a column for each
        sample in the reference panel. Elements show genetic distance between the
        test individual and each reference individual in a single window, with
        -9 standing for missing data. The first column is skipped, as it is
        expected to hold the window labels.
        This is generated by ibd_table().

    Returns
    =======
    A DataFrame with a row for each candidate in the reference panel, and
    columns 'candidate' (the sample name) and 'score' (mean genetic distance
    over windows across the genome). The score is NaN for a candidate with
    missing data in every window.
    Values closer to zero indicate that the sample is more likely to be a match.
    """
    # Coerce missing data to NaN so it is left out of the column means.
    # np.nan is used because the capitalised alias np.NaN was removed in NumPy 2.0.
    ibd_table = ibd_table.replace(-9, np.nan)

    # Every column after the first is a candidate.
    candidates = ibd_table.keys()[1:]
    # Column means, skipping NaN.
    mean_distance = ibd_table[candidates].mean(axis=0, skipna=True)

    scores = pd.DataFrame({
        'candidate': candidates,
        'score': mean_distance.to_numpy(),
    })
    return scores
@@ -0,0 +1,99 @@
1
+ import allel
2
+ import h5py
3
+ import numpy as np
4
+ from ibdpainting import geneticDistance
5
+
6
def load_genotypes(input, reference, sample_name):
    """
    Import and merge test and reference data files.

    Import genotype data for one input sample and a panel of reference samples
    to compare to. Subset each so that only markers present in both files are
    kept. Merge the arrays of genotype calls so that the data for the input
    individual come first on the sample axis.

    Parameters
    ==========
    input: str
        Path to a VCF file containing genotype data for one or more samples to check
    reference: str
        Path to a HDF5 file containing genotype data for a panel of reference individuals
        to compare the input individual against.
    sample_name: str
        Sample name for the individual to check. This must be present in the samples
        in the input VCF.

    Return
    ======
    An object of class geneticDistance. Its chromosome labels and positions are
    taken from the reference panel.

    Raises
    ======
    ValueError
        If `sample_name` is not among the samples in the input VCF, or if the
        unique contig labels differ between the input and the reference.
    """
    # Read in the data files
    input_vcf = allel.read_vcf(input, samples=[sample_name])

    # The context manager closes the HDF5 file even when one of the checks
    # below raises.
    with h5py.File(reference, mode="r") as ref_hdf5:
        ref_samples = [x.decode('utf-8') for x in ref_hdf5['samples'][:]]
        ref_chr = np.array([x.decode('utf-8') for x in ref_hdf5['variants/CHROM'][:]])
        # Read positions once; they are needed for SNP names and for the output.
        ref_pos = ref_hdf5['variants/POS'][:]

        if sample_name not in input_vcf['samples']:
            raise ValueError("The sample name is not in the list of samples in the input VCF file.")

        # Find the position of the individual to test
        sample_ix = np.where(input_vcf['samples'] == sample_name)[0][0]
        # Join vectors of sample names, with the test individual first
        new_samples = np.append(
            input_vcf['samples'][None, sample_ix],
            ref_samples
        )

        # Check that contig labels match
        chr_labels = {
            'input' : np.unique(input_vcf['variants/CHROM']),
            'ref' : np.unique(ref_chr)
        }
        if len(chr_labels['input']) != len(chr_labels['ref']):
            raise ValueError(
                "The number of unique contig labels do not match: the input VCF has {}, but the reference panel has {}.".
                format( chr_labels['input'], chr_labels['ref'] )
            )
        elif np.any( chr_labels['input'] != chr_labels['ref'] ):
            raise ValueError(
                "Contig labels do not match between the input and reference files."
            )

        # Make sure we only compare SNPs that are found in both datasets.
        # Concatenate chromosome labels and SNP positions
        snp_names = {
            'input' : [ str(chr) + ":" + str(pos) for chr,pos in zip(input_vcf['variants/CHROM'], input_vcf['variants/POS']) ],
            'ref' : [ str(chr) + ":" + str(pos) for chr,pos in zip(ref_chr, ref_pos) ]
        }
        # Boolean masks marking, for each file, the SNPs that also occur in
        # the other file. np.isin does this in one vectorised pass rather
        # than scanning an array once per SNP.
        keep_input = np.isin(snp_names['input'], snp_names['ref'])
        keep_ref = np.isin(snp_names['ref'], snp_names['input'])

        # Append the genotype data for the test individual to the array of the reference panel
        # NOTE(review): the HDF5 dataset is deliberately indexed with a plain
        # list of booleans; h5py may treat a NumPy boolean array differently,
        # so confirm before changing the index type.
        new_geno = np.concatenate(
            (input_vcf['calldata/GT'][keep_input, sample_ix][:, np.newaxis],
             ref_hdf5['calldata/GT'][keep_ref.tolist()]),
            axis=1
        )

    # Everything needed from the HDF5 file is in memory by this point.
    return geneticDistance(
        samples = new_samples,
        chr = ref_chr[keep_ref],
        pos = ref_pos[keep_ref],
        geno = new_geno
    )
@@ -0,0 +1,80 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
+ import plotly.express as px
5
+
6
+
7
def plot_ibd_table(ibd_table:pd.DataFrame, sample_name:str, expected_match:list=None, max_to_plot=20):
    """
    Plot allele sharing across the genome.

    Create an interactive line graph showing genetic distance from a test
    individual to each sample in a panel of reference individuals.

    Parameters
    ==========
    ibd_table: pd.DataFrame
        DataFrame with a row for each window in the genome and a column for each
        sample in the reference panel. Elements show genetic distance between the
        test individual and each reference individual in a single window, with
        -9 standing for missing data. The first column must be called 'window'
        and hold labels of the form "Chr:start-stop".
        This is generated by ibd_table().
    sample_name: str
        Sample name for the individual to check. Used as the figure title.
    expected_match: list
        List of sample names in the reference panel that are expected to be
        ancestors of the test individual. Defaults to no expected matches.
    max_to_plot: int
        Maximum number of candidates to draw. If the table holds more
        candidates than this, only the `max_to_plot` candidates with the
        smallest mean distance across windows are kept.

    Returns
    =======
    Plotly figure object with subplots for each chromosome, showing window
    position along the x-axis and genetic distance from the test individual to
    each reference sample on the y-axis. Line colour indicates whether a sample
    is an expected parent or not. Rolling over the lines shows which sample is
    which.
    """
    # None rather than a list literal as the default, so that no list object
    # is shared between calls.
    if expected_match is None:
        expected_match = []

    # Coerce missing data to NaN for correct column means.
    # np.nan is used because the capitalised alias np.NaN was removed in NumPy 2.0.
    ibd_table = ibd_table.replace(-9, np.nan)

    # Every column after 'window' is a candidate.
    candidates = ibd_table.keys()[1:]
    if max_to_plot < len(candidates):
        # Column-mean distance for each candidate, skipping missing data.
        mean_distance = ibd_table[candidates].mean(axis=0, skipna=True).to_numpy()
        # argpartition puts the `max_to_plot` smallest means first; everything
        # from that point on is a candidate to drop.
        ix = np.argpartition(mean_distance, max_to_plot)[max_to_plot:]
        ibd_table = ibd_table.drop(columns=candidates[ix].to_list())

    # Make the table long
    ibd_table = ibd_table.melt(id_vars=['window'], var_name='candidate', value_name='distance')
    # Column indicating which candidates should be plotted a different colour.
    ibd_table['expected'] = ibd_table['candidate'].isin(expected_match)
    # Split the 'window' column up into separate columns for chromosome, start and stop positions.
    # Splitting on the last ":" keeps chromosome labels that contain a colon intact.
    ibd_table[['chr', 'window']] = ibd_table['window'].str.rsplit(":", n=1, expand=True)
    ibd_table[['start', 'stop']] = ibd_table['window'].str.split("-", n=1, expand=True)
    # start and stop positions should be integers for sensible plotting.
    ibd_table['start'] = ibd_table['start'].astype(int)
    ibd_table['stop'] = ibd_table['stop'].astype(int)
    ibd_table['midpoint'] = (ibd_table['start'] + ibd_table['stop']) / 2

    fig = px.line(
        ibd_table,
        x="midpoint", y="distance", color="expected",
        # One line per candidate. Without this, all candidates that share a
        # colour are drawn as a single trace, which joins the end of one
        # candidate's line to the start of the next.
        line_group="candidate",
        title=sample_name,
        labels={
            'midpoint' : 'Position (bp)',
            'distance' : 'Genetic distance'
        },
        hover_data=['candidate'],
        category_orders={'expected': [False, True]},
        facet_row = "chr"
    )
    fig.update_traces(mode="markers+lines")

    return fig
@@ -0,0 +1,18 @@
1
+ Metadata-Version: 2.1
2
+ Name: ibdpainting
3
+ Version: 0.1
4
+ Summary: Identify parents of a crossed individual by comparing identity in windows across their genomes
5
+ Home-page: https://github.com/ellisztamas/ibdpainting
6
+ Author: Tom Ellis
7
+ Author-email: thomas.ellis@gmi.oeaw.ac.at
8
+ License: MIT
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE
11
+ Requires-Dist: numpy
12
+ Requires-Dist: pandas
13
+ Requires-Dist: plotly
14
+ Requires-Dist: h5py
15
+ Requires-Dist: scikit-allel
16
+
17
+ # ibdpainting
18
+ Identify parents of a crossed individual by comparing identity in windows across their genomes
@@ -0,0 +1,15 @@
1
+ LICENSE
2
+ README.md
3
+ setup.py
4
+ ibdpainting/__init__.py
5
+ ibdpainting/genetic_distance.py
6
+ ibdpainting/ibd_table.py
7
+ ibdpainting/ibs_scores.py
8
+ ibdpainting/load_genomes.py
9
+ ibdpainting/plot_ibd_table.py
10
+ ibdpainting.egg-info/PKG-INFO
11
+ ibdpainting.egg-info/SOURCES.txt
12
+ ibdpainting.egg-info/dependency_links.txt
13
+ ibdpainting.egg-info/not-zip-safe
14
+ ibdpainting.egg-info/requires.txt
15
+ ibdpainting.egg-info/top_level.txt
@@ -0,0 +1,5 @@
1
+ numpy
2
+ pandas
3
+ plotly
4
+ h5py
5
+ scikit-allel
@@ -0,0 +1 @@
1
+ ibdpainting
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,41 @@
1
+ from setuptools import setup
2
+ import codecs
3
+ import os.path
4
+ from pathlib import Path
5
+
6
+ # Functions to pull the package version from init.py
7
def read(rel_path):
    """Return the contents of the file at `rel_path`.

    The path is resolved relative to the directory containing this script,
    not the current working directory.
    """
    script_dir = os.path.abspath(os.path.dirname(__file__))
    target = os.path.join(script_dir, rel_path)
    with codecs.open(target, 'r') as handle:
        contents = handle.read()
    return contents
11
+
12
def get_version(rel_path):
    """Return the version string assigned to `__version__` in a source file.

    The file at `rel_path` is read with read(). The first line that starts
    with `__version__` is split on its quote character (a double quote if the
    line contains one, otherwise a single quote), and the text after the
    first quote is returned.

    Raises RuntimeError if no line starts with `__version__`.
    """
    version_lines = (
        text_line
        for text_line in read(rel_path).splitlines()
        if text_line.startswith('__version__')
    )
    first_match = next(version_lines, None)
    if first_match is None:
        raise RuntimeError("Unable to find version string.")

    quote = '"' if '"' in first_match else "'"
    return first_match.split(quote)[1]
19
+
20
# Read the README so it can be passed to setup() as the long description.
# The encoding is given explicitly so that the build does not depend on the
# default encoding of the machine it runs on.
this_directory = Path(__file__).parent
long_description = (this_directory / "README.md").read_text(encoding="utf-8")


setup(
    name='ibdpainting',
    # The version is parsed from the package source so it is defined in one place only.
    version=get_version("ibdpainting/__init__.py"),
    description='Identify parents of a crossed individual by comparing identity in windows across their genomes',
    url='https://github.com/ellisztamas/ibdpainting',
    long_description=long_description,
    long_description_content_type='text/markdown',
    author='Tom Ellis',
    author_email='thomas.ellis@gmi.oeaw.ac.at',
    license='MIT',
    packages=['ibdpainting'],
    install_requires=[
        'numpy', 'pandas', 'plotly', 'h5py', 'scikit-allel'
    ],
    zip_safe=False
)
41
+