PyPI - snplib - Versions diffs - 1.0.0__py3-none-any.whl - Mend

snplib 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

finalreport/__init__.py +7 -0
finalreport/_finalreport.py +251 -0
finalreport/tests/__init__.py +7 -0
finalreport/tests/test_finalreport.py +215 -0
format/__init__.py +19 -0
format/__settings.py +7 -0
format/_plink.py +305 -0
format/_snp.py +113 -0
format/tests/__init__.py +7 -0
format/tests/test_plink_fam.py +121 -0
format/tests/test_plink_lgen.py +106 -0
format/tests/test_plink_map.py +42 -0
format/tests/test_plink_ped.py +136 -0
format/tests/test_snp.py +128 -0
parentage/__init__.py +15 -0
parentage/_discov.py +102 -0
parentage/_isagmark.py +15 -0
parentage/_verif.py +91 -0
parentage/tests/__init__.py +7 -0
parentage/tests/test_discov.py +164 -0
parentage/tests/test_verif.py +160 -0
snplib-1.0.0.dist-info/LICENSE +674 -0
snplib-1.0.0.dist-info/METADATA +89 -0
snplib-1.0.0.dist-info/RECORD +36 -0
snplib-1.0.0.dist-info/WHEEL +5 -0
snplib-1.0.0.dist-info/top_level.txt +4 -0
statistics/__init__.py +16 -0
statistics/_callrate.py +59 -0
statistics/_freq.py +67 -0
statistics/_snphwe.py +132 -0
statistics/tests/__init__.py +7 -0
statistics/tests/test_callrate.py +171 -0
statistics/tests/test_freq_allele.py +87 -0
statistics/tests/test_freq_maf.py +17 -0
statistics/tests/test_hwe_t.py +41 -0
statistics/tests/test_snphwe.py +41 -0

format/tests/test_plink_ped.py ADDED Viewed

@@ -0,0 +1,136 @@
+#!/usr/bin/env python
+# coding: utf-8
+__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
+from . import DIR_FILES
+from .. import make_ped
+import pytest
+import pandas as pd
+@pytest.fixture
+def data_ped(request) -> pd.DataFrame | None:
+	"""Load the pickled frame under ``fplink/ped`` named by ``request.param``."""
+	pickle_path = DIR_FILES / f"fplink/ped/{request.param}"
+	return pd.read_pickle(pickle_path)
+class TestPlinkFormatPed(object):
+	"""Tests for ``make_ped``."""
+	@pytest.mark.parametrize("data_ped", ["file.pl"], indirect=True)
+	def test_ped_true(self, data_ped: pd.DataFrame) -> None:
+		"""A populated frame gives a non-empty result with and without ``fid_col``."""
+		assert not make_ped(
+			data_ped,
+			"SAMPLE_ID",
+			"SNP",
+			fid_col="SAMPLE_ID"
+		).empty
+		assert not make_ped(
+			data_ped,
+			"SAMPLE_ID",
+			"SNP"
+		).empty
+	def test_ped_empty(self) -> None:
+		"""A frame with the right columns but no rows gives an empty result."""
+		assert make_ped(
+			pd.DataFrame(columns=["SAMPLE_ID", "SNP"]),
+			"SAMPLE_ID",
+			"SNP"
+		).empty
+		assert make_ped(
+			pd.DataFrame(columns=["SAMPLE_ID", "SNP"]),
+			"SAMPLE_ID",
+			"SNP",
+			fid_col="SAMPLE_ID"
+		).empty
+	@pytest.mark.parametrize("data_ped", ["file.pl"], indirect=True)
+	def test_ped_raise_columns(self, data_ped: pd.DataFrame) -> None:
+		"""An unknown sample, SNP or family column name raises ``KeyError``."""
+		# SID_COL
+		with pytest.raises(
+				KeyError, match="Data has not in name columns!"
+		):
+			make_ped(
+				data=data_ped,
+				sid_col="SAMPLE_ID1",
+				fid_col="SAMPLE_ID",
+				snp_col="SNP"
+			)
+		# SNP_COL
+		with pytest.raises(
+				KeyError, match="Data has not in name columns!"
+		):
+			make_ped(
+				data_ped,
+				"SAMPLE_ID",
+				"SNP1",
+				fid_col="SAMPLE_ID"
+			)
+		# FID_COL
+		with pytest.raises(
+				KeyError, match="Data has not in name columns SAMPLE_ID1!"
+		):
+			make_ped(
+				data_ped,
+				"SAMPLE_ID",
+				"SNP",
+				fid_col="SAMPLE_ID1"
+			)
+	@pytest.mark.parametrize("data_ped", ["file2.pl"], indirect=True)
+	def test_ped_raises_underscope_sid(self, data_ped: pd.DataFrame) -> None:
+		"""``file2.pl`` must trigger the 'Sample ID' underscore error."""
+		# SID_COL
+		with pytest.raises(
+				Exception,
+				match="Replace in 'Sample ID' columns '_' on another a simbols"
+		):
+			# The return value is irrelevant: the call is expected to raise.
+			make_ped(
+				data_ped,
+				"SAMPLE_ID",
+				"SNP"
+			)
+	@pytest.mark.parametrize("data_ped", ["file3.pl"], indirect=True)
+	def test_ped_raises_underscope_fid(self, data_ped: pd.DataFrame) -> None:
+		"""``file3.pl`` must trigger the 'Family ID' underscore error."""
+		# FID_COL
+		with pytest.raises(
+				Exception,
+				match="Replace in 'Family ID' columns '_' on another a simbols"
+		):
+			# The return value is irrelevant: the call is expected to raise.
+			make_ped(
+				data_ped,
+				"SAMPLE_ID",
+				"SNP",
+				fid_col="FAMILY_ID"
+			)
+	@pytest.mark.parametrize("data_ped", ["file4.pl"], indirect=True)
+	def test_ped_check_data(self, data_ped: pd.DataFrame) -> None:
+		"""
+		Father, mother and sex values are taken from the named columns when
+		given and are all '0' when those columns are not passed.
+		"""
+		res = make_ped(
+			data_ped,
+			"SAMPLE_ID",
+			"SNP",
+			fid_col="FAMILY_ID",
+			father_col="father",
+			mother_col="mother",
+			sex_col="sex"
+		)
+		res2 = make_ped(
+			data_ped,
+			"SAMPLE_ID",
+			"SNP",
+			fid_col="FAMILY_ID",
+		)
+		assert all(res.father.values == list('1234'))
+		assert all(res.mother.values == list('5678'))
+		assert all(res.sex.values == list('1210'))
+		assert all(res2.father.values == list('0000'))
+		assert all(res2.mother.values == list('0000'))
+		assert all(res2.sex.values == list('0000'))

format/tests/test_snp.py ADDED Viewed

@@ -0,0 +1,128 @@
+#!/usr/bin/env python
+# coding: utf-8
+__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
+from . import DIR_FILES
+from .. import Snp
+import pytest
+import pandas as pd
+@pytest.fixture
+def data_fr(request) -> pd.DataFrame:
+	"""Read the tab-separated ``fsnp`` file named by ``request.param``."""
+	source_file = DIR_FILES / f"fsnp/{request.param}"
+	return pd.read_csv(source_file, sep="\t")
+@pytest.fixture
+def obj_snp(request) -> Snp:
+	"""Create a ``Snp`` whose ``fmt`` is ``request.param``."""
+	chosen_fmt = request.param
+	return Snp(fmt=chosen_fmt)
+class TestSNP(object):
+	"""Tests for ``Snp.process`` and ``Snp.to_file``."""
+	# Column layout shared by the empty-frame tests
+	_REPORT_COLUMNS = [
+		'SNP Name', 'Sample ID', 'Allele1 - AB', 'Allele2 - AB',
+		'GC Score', 'GT Score'
+	]
+	@pytest.mark.parametrize(
+		"obj_snp, data_fr", [("uga", 'file1.txt')], indirect=True
+	)
+	def test_snp_process_uga_true(
+			self, obj_snp: Snp, data_fr: pd.DataFrame
+	) -> None:
+		"""Processing ``file1.txt`` as "uga" yields the two expected SNP strings."""
+		expected_snp = ['02011015010000500', '01110152120222512']
+		obj_snp.process(data_fr)
+		assert obj_snp.data is not None
+		assert not obj_snp.data.empty
+		assert obj_snp.data.SNP.isin(expected_snp).all()
+	@pytest.mark.parametrize("obj_snp", ["uga"], indirect=True)
+	def test_snp_process_uga_empty(self, obj_snp: Snp) -> None:
+		"""An empty frame with valid columns leaves empty, non-None data ("uga")."""
+		empty_report = pd.DataFrame(columns=self._REPORT_COLUMNS)
+		obj_snp.process(empty_report)
+		assert obj_snp.data is not None
+		assert obj_snp.data.empty
+	@pytest.mark.parametrize(
+		"obj_snp, data_fr",
+		[("uga", 'file1.txt'), (None, 'file1.txt')],
+		indirect=True
+	)
+	def test_snp_process_raises(
+			self, obj_snp: Snp, data_fr: pd.DataFrame
+	) -> None:
+		"""Wrong column names raise ``KeyError`` and leave ``data`` as None."""
+		wrong_columns = [
+			'SNP Name1', 'Sample ID1', 'Allele1 - AB', 'Allele2 - AB',
+			'GC Score', 'GT Score'
+		]
+		broken_report = pd.DataFrame(columns=wrong_columns)
+		with pytest.raises(KeyError):
+			obj_snp.process(broken_report)
+		assert obj_snp.data is None
+	@pytest.mark.parametrize(
+		"obj_snp, data_fr", [(None, 'file1.txt')], indirect=True
+	)
+	def test_snp_process_df(
+			self, obj_snp: Snp, data_fr: pd.DataFrame
+	) -> None:
+		"""Processing ``file1.txt`` with ``fmt=None`` yields non-empty data."""
+		obj_snp.process(data_fr)
+		assert obj_snp.data is not None
+		assert not obj_snp.data.empty
+	@pytest.mark.parametrize("obj_snp", [None], indirect=True)
+	def test_snp_process_df_empty(self, obj_snp: Snp) -> None:
+		"""An empty frame with valid columns leaves empty, non-None data (None fmt)."""
+		empty_report = pd.DataFrame(columns=self._REPORT_COLUMNS)
+		obj_snp.process(empty_report)
+		assert obj_snp.data is not None
+		assert obj_snp.data.empty
+	@pytest.mark.parametrize(
+		"obj_snp, data_fr", [("uga", 'file1.txt')], indirect=True
+	)
+	def test_snp_to_file_uga1(
+			self, obj_snp: Snp, data_fr: pd.DataFrame, tmp_path
+	) -> None:
+		"""
+		Sample ids all have the same length.
+		"""
+		out_dir = tmp_path / "sub"
+		out_dir.mkdir()
+		target = out_dir / "data_snp.csv"
+		obj_snp.process(data_fr)
+		assert obj_snp.data is not None
+		assert not obj_snp.data.empty
+		obj_snp.to_file(target)
+		assert target.is_file()
+		assert target.exists()
+		written = target.read_text()
+		assert (
+			written ==
+			"14814 02011015010000500\n14815 01110152120222512\n"
+		)
+	@pytest.mark.parametrize(
+		"obj_snp, data_fr", [("uga", 'file2.txt')], indirect=True
+	)
+	def test_snp_to_file_uga2(
+			self, obj_snp: Snp, data_fr: pd.DataFrame, tmp_path
+	) -> None:
+		"""
+		Sample ids have different lengths.
+		"""
+		out_dir = tmp_path / "sub"
+		out_dir.mkdir()
+		target = out_dir / "data_snp.csv"
+		obj_snp.process(data_fr)
+		assert obj_snp.data is not None
+		assert not obj_snp.data.empty
+		obj_snp.to_file(target)
+		assert target.is_file()
+		assert target.exists()
+		written = target.read_text()
+		assert (
+			written ==
+			"14814qwert 02011015010000500\n14815      01110152120222512\n"
+		)

parentage/__init__.py ADDED Viewed

@@ -0,0 +1,15 @@
+#!/usr/bin/env python
+# coding: utf-8
+__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
+from ._discov import Discovery
+from ._verif import Verification
+from ._isagmark import isag_verif, isag_disc
+# Names re-exported by this package: the two parentage classes and the two
+# ISAG marker loaders imported above.
+__all__ = [
+	"Discovery",
+	"Verification",
+	"isag_disc",
+	"isag_verif"
+]

parentage/_discov.py ADDED Viewed

@@ -0,0 +1,102 @@
+#!/usr/bin/env python
+# coding: utf-8
+__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
+import pandas as pd
+"""
+Search for paternity according to ICAR recommendations
+https://www.icar.org/Documents/GenoEx/ICAR%20Guidelines%20for%20Parentage%20Verification%20and%20Parentage%20Discovery%20based%20on%20SNP.pdf
+"""
+class Discovery(object):
+    """ Search for paternity according to ICAR recommendations.
+    After :meth:`search_parent` completes, the number of conflicts, the
+    percentage of conflicts and the derived status can be read through the
+    properties of the same names; all three are ``None`` before that.
+    :argument isag_markers: Fixed sample of markers to confirm paternity.
+    """
+    def __init__(
+            self, isag_markers: pd.Series | list | set | None = None
+    ) -> None:
+        self.__isag_markers = isag_markers
+        self.__num_conflicts = None  # Number of conflicts
+        self.__perc_conflicts = None  # Conflicts as a percentage
+    @property
+    def status(self) -> None | str:
+        """ The status of each parent discovered.
+        :return: ``'Discovered'`` when the conflict percentage is in
+            [0, 1), ``'Doubtful'`` when it is in [1, 3), ``'Excluded'``
+            when it is 3 or more, and ``None`` when no search has completed
+            or the percentage matches none of these ranges (e.g. NaN).
+        """
+        perc = self.__perc_conflicts
+        if perc is None:
+            return None
+        if 0 <= perc < 1:
+            return 'Discovered'
+        # The lower bound is inclusive: a percentage of exactly 1 used to
+        # match no branch at all and produced None.
+        if 1 <= perc < 3:
+            return 'Doubtful'
+        if perc >= 3:
+            return 'Excluded'
+        return None
+    @property
+    def num_conflicts(self) -> None | int:
+        """ Number of conflicts found by the last completed search. """
+        return self.__num_conflicts
+    @property
+    def perc_conflicts(self) -> None | float:
+        """ Conflicts as a percentage, rounded to two decimals. """
+        return self.__perc_conflicts
+    def search_parent(
+            self,
+            data: pd.DataFrame,
+            descendant: str,
+            parents: str,
+            snp_name_col: str
+    ) -> None:
+        """ Search for paternity.
+        :param data: SNP data for descendant and parent.
+        :param descendant: Columns name of the descendant in the data.
+        :param parents: Columns name or list name of the parents in the data.
+        :param snp_name_col: SNP columns name is data.
+        :raises ValueError: No isag markers were given to the constructor.
+        :raises Exception: Fewer than 450 marker rows remain for the
+            descendant after rows coded 5 are dropped.
+        """
+        if self.__isag_markers is None:
+            raise ValueError("Error. No array of snp names to verify")
+        # Keep only the rows whose SNP name is one of the isag markers
+        sample_by_markers = data.loc[
+            data[snp_name_col].isin(self.__isag_markers),
+            [snp_name_col, descendant, parents]
+        ]
+        # Filtering 5s from a descendent
+        desc_marks = sample_by_markers.loc[
+            sample_by_markers[descendant] != 5, [snp_name_col, descendant]
+        ]
+        # According to ICAR, the number of available markers must be
+        # above 450
+        if len(desc_marks) < 450:
+            raise Exception("Calf call rate is low.")
+        # Common after filtering markers of potential ancestors
+        sample_parents = sample_by_markers.loc[
+            sample_by_markers[snp_name_col].isin(desc_marks[snp_name_col]),
+            parents
+        ]
+        # Number of available markers in potential ancestors
+        prob_parents_same_n_markers = (sample_parents < 5).sum()
+        # number of conflicts: rows where parent and descendant differ by 2
+        self.__num_conflicts = (
+            abs(sample_parents.sub(desc_marks[descendant], axis=0)) == 2
+        ).sum()
+        # Percentage of conflicts
+        # NOTE(review): when the parent has no value below 5 in these rows
+        # the divisor is 0 - presumably the result is NaN and status is
+        # None; confirm that this is the intended handling.
+        self.__perc_conflicts = (
+            (self.__num_conflicts / prob_parents_same_n_markers) * 100
+        ).round(2)
+    def __status_define(self) -> None:
+        """ Placeholder without an implementation. """
+        ...

parentage/_isagmark.py ADDED Viewed

@@ -0,0 +1,15 @@
+#!/usr/bin/env python
+# coding: utf-8
+__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
+from pathlib import Path
+import pandas as pd
+def isag_disc() -> pd.DataFrame:
+	"""Load the ``isag_disc.pl`` pickle stored next to this module."""
+	pickle_file = Path(__file__).parent / "isag_disc.pl"
+	return pd.read_pickle(pickle_file)
+def isag_verif() -> pd.DataFrame:
+	"""Load the ``isag_verif.pl`` pickle stored next to this module."""
+	pickle_file = Path(__file__).parent / "isag_verif.pl"
+	return pd.read_pickle(pickle_file)

parentage/_verif.py ADDED Viewed

@@ -0,0 +1,91 @@
+#!/usr/bin/env python
+# coding: utf-8
+__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
+import numpy as np
+import pandas as pd
+"""
+https://www.icar.org/Documents/GenoEx/ICAR%20Guidelines%20for%20Parentage%20Verification%20and%20Parentage%20Discovery%20based%20on%20SNP.pdf
+"""
+class Verification(object):
+    """
+    Paternity verification following the ICAR recommendations.
+    :argument isag_marks: Fixed sample of markers to confirm paternity.
+    """
+    def __init__(
+            self, isag_marks: pd.Series | list | set | None = None
+    ) -> None:
+        self.__isag_marks = isag_marks
+        # Share of the marker panel (95%) that the count of available SNP
+        # of the animal and of the potential parent is compared against
+        self.__min_num_snp = 0.95
+        # Number of conflicts; stays None until check_on() completes
+        self.__num_conflicts = None
+    @property
+    def status(self) -> None | str:
+        """ Verdict derived from the number of conflicts.
+        :return: ``'Accept'`` for at most 2 conflicts, ``'Doubtful'`` for
+            3 to 5, ``'Excluded'`` for more than 5, ``None`` when no check
+            has completed.
+        """
+        conflicts = self.__num_conflicts
+        if conflicts is None:
+            return None
+        if conflicts <= 2:
+            return 'Accept'
+        if 3 <= conflicts <= 5:
+            return 'Doubtful'
+        if conflicts > 5:
+            return 'Excluded'
+        return None
+    @property
+    def num_conflicts(self) -> None | int:
+        """ Number of conflicts counted by the last completed check. """
+        return self.__num_conflicts
+    def check_on(
+            self,
+            data: pd.DataFrame,
+            descendant: str,
+            parent: str,
+            snp_name_col: str
+    ) -> None:
+        """ Verify paternity for one descendant/parent pair.
+        :param data: SNP data for descendant and parent.
+        :param descendant: Columns name of the descendant in the data.
+        :param parent: Columns name of the parent in the data.
+        :param snp_name_col: SNP column name in data.
+        :raises ValueError: No isag markers were given to the constructor.
+        :raises Exception: The call-rate checks below are not satisfied.
+        """
+        markers = self.__isag_marks
+        if markers is None:
+            raise ValueError('Error. No array of snp names to verify')
+        panel_size = len(markers)
+        # Fewest rows where both animals must have a value other than 5
+        required_common = int(panel_size - (2 * (panel_size * 0.05)))
+        pair = data.loc[
+            data[snp_name_col].isin(markers), [descendant, parent]
+        ]
+        # Per-animal count of markers whose value is below 5
+        desc_called = (pair[descendant] < 5).sum()
+        parent_called = (pair[parent] < 5).sum()
+        # NOTE(review): this raises only when BOTH animals are under the
+        # 95% mark; the constructor comment speaks of "each animal", which
+        # would suggest `or` - confirm against the ICAR guideline.
+        if (desc_called < panel_size * self.__min_num_snp) and \
+                (parent_called < panel_size * self.__min_num_snp):
+            raise Exception('Calf and parent have low call rate')
+        # Drop every row where either animal is coded 5
+        complete_pairs = pair.replace(5, np.nan).dropna()
+        if len(complete_pairs) < required_common:
+            raise Exception('Pair call rate is low')
+        difference = complete_pairs[descendant] - complete_pairs[parent]
+        self.__num_conflicts = (abs(difference) == 2).sum()

parentage/tests/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+#!/usr/bin/env python
+# coding: utf-8
+__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
+from pathlib import Path
+# "data" directory located next to this file.
+DIR_DATA = Path(__file__).parent.joinpath("data")

parentage/tests/test_discov.py ADDED Viewed

@@ -0,0 +1,164 @@
+#!/usr/bin/env python
+# coding: utf-8
+__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
+from . import DIR_DATA
+from .. import Discovery, isag_disc
+import pytest
+import pandas as pd
+@pytest.fixture
+def data() -> pd.DataFrame:
+	"""Read the space-separated ``parentage_test_disc.csv`` from ``DIR_DATA``."""
+	csv_path = DIR_DATA / "parentage_test_disc.csv"
+	return pd.read_csv(csv_path, sep=" ")
+@pytest.fixture
+def obj_discovery() -> Discovery:
+	"""Build a ``Discovery`` from the ``markers`` of ``isag_disc()``."""
+	marker_names = isag_disc().markers
+	return Discovery(isag_markers=marker_names)
+class TestDiscovery(object):
+	"""Tests for ``Discovery.search_parent`` and its result properties."""
+	# Column names used throughout the tests
+	CALF = "BY000041988163"
+	PARENT = "EE10512586"
+	SNP_COL = "SNP_Name"
+	@staticmethod
+	def _assert_nothing_stored(finder: Discovery) -> None:
+		"""Assert that ``finder`` holds no status, count or percentage."""
+		assert finder.status is None
+		assert finder.num_conflicts is None
+		assert finder.perc_conflicts is None
+	def test_search_parent_successfully(
+		self, data: pd.DataFrame, obj_discovery: Discovery
+	) -> None:
+		"""The unmodified test table gives 77 conflicts (14.86%), "Excluded"."""
+		outcome = obj_discovery.search_parent(
+			data=data,
+			descendant=self.CALF,
+			parents=self.PARENT,
+			snp_name_col=self.SNP_COL
+		)
+		assert outcome is None
+		assert obj_discovery.num_conflicts == 77
+		assert obj_discovery.status == "Excluded"
+		assert obj_discovery.perc_conflicts == 14.86
+	def test_search_parent_1(self, data: pd.DataFrame) -> None:
+		"""
+		A ``Discovery`` built without isag markers raises ``ValueError``.
+		"""
+		finder = Discovery()
+		with pytest.raises(
+			ValueError, match="Error. No array of snp names to verify"
+		):
+			finder.search_parent(
+				data=data,
+				descendant=self.CALF,
+				parents=self.PARENT,
+				snp_name_col=self.SNP_COL
+			)
+		self._assert_nothing_stored(finder)
+	def test_search_parent_2(
+		self, data: pd.DataFrame, obj_discovery: Discovery
+	) -> None:
+		"""
+		Dropping the last 100 rows leaves too few markers, so the call
+		raises.
+		"""
+		shortened = data[:-100]
+		with pytest.raises(
+			Exception, match="Calf call rate is low."
+		):
+			obj_discovery.search_parent(
+				data=shortened,
+				descendant=self.CALF,
+				parents=self.PARENT,
+				snp_name_col=self.SNP_COL
+			)
+		self._assert_nothing_stored(obj_discovery)
+	def test_search_parent_3(
+		self, data: pd.DataFrame, obj_discovery: Discovery
+	) -> None:
+		"""
+		Animal names that are not columns of the frame raise ``KeyError``.
+		"""
+		# For descendant
+		with pytest.raises(KeyError):
+			obj_discovery.search_parent(
+				data=data,
+				descendant="BY00004198816",
+				parents=self.PARENT,
+				snp_name_col=self.SNP_COL
+			)
+		self._assert_nothing_stored(obj_discovery)
+		# For parents
+		with pytest.raises(KeyError):
+			obj_discovery.search_parent(
+				data=data,
+				descendant=self.CALF,
+				parents="EE105125864",
+				snp_name_col=self.SNP_COL
+			)
+		self._assert_nothing_stored(obj_discovery)
+	def test_search_parent_4(
+		self, data: pd.DataFrame, obj_discovery: Discovery
+	) -> None:
+		"""
+		Every value of both animals set to 5 (not read) makes the call raise.
+		"""
+		data[[self.CALF, self.PARENT]] = 5
+		with pytest.raises(
+			Exception, match="Calf call rate is low."
+		):
+			obj_discovery.search_parent(
+				data=data,
+				descendant=self.CALF,
+				parents=self.PARENT,
+				snp_name_col=self.SNP_COL
+			)
+		self._assert_nothing_stored(obj_discovery)
+	def test_search_parent_5(
+			self, data: pd.DataFrame, obj_discovery: Discovery
+	) -> None:
+		"""
+		Identical values for both animals give zero conflicts, "Discovered".
+		"""
+		data[[self.CALF, self.PARENT]] = 2
+		obj_discovery.search_parent(
+			data=data,
+			descendant=self.CALF,
+			parents=self.PARENT,
+			snp_name_col=self.SNP_COL
+		)
+		assert obj_discovery.status == "Discovered"
+		assert obj_discovery.num_conflicts == 0
+		assert obj_discovery.perc_conflicts == 0.0
+	def test_search_parent_6(
+			self, data: pd.DataFrame, obj_discovery: Discovery
+	) -> None:
+		"""
+		Parent values overwritten with 1 from row label 202 onward give 14
+		conflicts (2.70%), "Doubtful".
+		"""
+		data.loc[202:, self.PARENT] = 1
+		obj_discovery.search_parent(
+			data=data,
+			descendant=self.CALF,
+			parents=self.PARENT,
+			snp_name_col=self.SNP_COL
+		)
+		assert obj_discovery.status == "Doubtful"
+		assert obj_discovery.num_conflicts == 14
+		assert obj_discovery.perc_conflicts == 2.70