PyPI - snplib - Versions diffs - 1.0.0__py3-none-any.whl - Mend

snplib 1.0.0__py3-none-any.whl

Files changed (36) hide show

finalreport/__init__.py +7 -0
finalreport/_finalreport.py +251 -0
finalreport/tests/__init__.py +7 -0
finalreport/tests/test_finalreport.py +215 -0
format/__init__.py +19 -0
format/__settings.py +7 -0
format/_plink.py +305 -0
format/_snp.py +113 -0
format/tests/__init__.py +7 -0
format/tests/test_plink_fam.py +121 -0
format/tests/test_plink_lgen.py +106 -0
format/tests/test_plink_map.py +42 -0
format/tests/test_plink_ped.py +136 -0
format/tests/test_snp.py +128 -0
parentage/__init__.py +15 -0
parentage/_discov.py +102 -0
parentage/_isagmark.py +15 -0
parentage/_verif.py +91 -0
parentage/tests/__init__.py +7 -0
parentage/tests/test_discov.py +164 -0
parentage/tests/test_verif.py +160 -0
snplib-1.0.0.dist-info/LICENSE +674 -0
snplib-1.0.0.dist-info/METADATA +89 -0
snplib-1.0.0.dist-info/RECORD +36 -0
snplib-1.0.0.dist-info/WHEEL +5 -0
snplib-1.0.0.dist-info/top_level.txt +4 -0
statistics/__init__.py +16 -0
statistics/_callrate.py +59 -0
statistics/_freq.py +67 -0
statistics/_snphwe.py +132 -0
statistics/tests/__init__.py +7 -0
statistics/tests/test_callrate.py +171 -0
statistics/tests/test_freq_allele.py +87 -0
statistics/tests/test_freq_maf.py +17 -0
statistics/tests/test_hwe_t.py +41 -0
statistics/tests/test_snphwe.py +41 -0

finalreport/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+#!/usr/bin/env python
+# coding: utf-8
+__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
+from ._finalreport import FinalReport
+__all__ = ["FinalReport"]

finalreport/_finalreport.py ADDED Viewed

@@ -0,0 +1,251 @@
+#!/usr/bin/env python
+# coding: utf-8
+__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
+__all__ = ("FinalReport",)
+from pathlib import Path
+from functools import reduce
+import re
+import pandas as pd
+class FinalReport(object):
+	""" File that contains SNP information.
+	The file is split into a ``[Header]`` section with meta information
+	(one key/value pair per line) and a ``[Data]`` section holding a
+	delimited table whose first line carries the column names.
+	:argument allele: A variant form of a single nucleotide polymorphism
+		(SNP), a specific polymorphic site or a whole gene detectable at
+		a locus.  Type: 'AB', 'Forward', 'Top', 'Plus', 'Design'. Either a
+		single name or a list/tuple/set of names; None keeps every column.
+	:argument sep: Delimiter to use. Default value: "\\t"
+	Example:
+		[Header]
+		GSGT Version	2.0.4
+		Processing Date	10/14/2021 4:02 PM
+		Content		BovineSNP50_v3_A1.bpm
+		Num SNPs	53218
+		Total SNPs	53218
+		Num Samples	3
+		Total Samples	3
+		[Data]
+		SNP Name  Sample ID  Allele1 - AB  Allele2 - AB  GC Score  GT Score
+		ABCA12	1	A	A	0.4048	0.8164
+		APAF1	1	B	B	0.9067	0.9155
+		...
+	"""
+	__PATTERN_HEADER = re.compile(r'(^\[Header\])')
+	__PATTERN_DATA = re.compile(r'(^\[Data\])')
+	def __init__(
+			self,
+			allele: str | list | tuple | set | None = None,
+			sep: str = "\t"
+	) -> None:
+		self._delimiter = sep
+		# Raw lines of the report file; stays None until read() finds data.
+		self._full_data = None
+		self.__header = {}
+		self.__snp_data = None
+		self.__allele = allele
+		# Conversion table (SID, UNIQ_KEY, SEX); None when not supplied/empty.
+		self._map_rn = None
+	@property
+	def header(self) -> dict:
+		""" Meta information collected from the lines before ``[Data]``. """
+		return self.__header
+	@property
+	def snp_data(self) -> pd.DataFrame | None:
+		""" Table from the ``[Data]`` section, None until handle() succeeds. """
+		return self.__snp_data
+	def handle(
+			self, file_rep: Path | str, conv_file: Path | str = None
+	) -> bool:
+		""" Processes the FinalReport.txt file. Highlights meta information
+		and data.
+		:param file_rep: The file FinalReport.txt or another name.
+		:param conv_file: The file that contains IDs of registration numbers
+			of animals.
+		:return: Returns true if file processing was successful, false if
+			one of the files is missing or the report cannot be read.
+		:raises Exception: If the report is empty, has no ``[Data]`` section
+			with field names, the requested allele is absent or the unique
+			keys of the conversion file contain Cyrillic letters.
+		"""
+		file_rep = Path(file_rep)
+		# is_file() is False both for a missing path and for a directory.
+		if not file_rep.is_file():
+			return False
+		# Processing conversion file
+		if conv_file is not None:
+			conv_file = Path(conv_file)
+			if not conv_file.is_file():
+				return False
+			self.__convert_s_id(conv_file)
+		# Processing report file
+		if not self.read(file_rep):
+			return False
+		if self._full_data is None:
+			raise Exception("Not data in file FinalReport.txt")
+		self.__handler_header()
+		self.__handler_data()
+		if self._map_rn is not None:
+			# Sample IDs without an entry in the conversion table become NaN.
+			id_to_key = dict(zip(self._map_rn.SID, self._map_rn.UNIQ_KEY))
+			self.__snp_data['Sample ID'] = \
+				self.__snp_data['Sample ID'].map(id_to_key)
+		return True
+	def read(self, file_rep: Path) -> bool:
+		""" Reading data from the final_report file
+		:param file_rep: path, pointer to the file to be read.
+		:return: Returns true if the read was successful, false if it failed.
+			An empty (or whitespace-only) file is a successful read that
+			leaves no data behind.
+		"""
+		try:
+			data = Path(file_rep).read_text()
+		except (OSError, ValueError, TypeError):
+			return False
+		# A file holding only whitespace carries no report lines either.
+		content = data.strip()
+		self._full_data = content.split("\n") if content else None
+		return True
+	def __handler_header(self) -> None:
+		""" Processes data from a file, selects meta-information.
+		Every line before ``[Data]`` (except the ``[Header]`` marker) is
+		split by the delimiter: the first field is the key and the first
+		non-empty field after it is the value.
+		"""
+		for line in self._full_data:
+			if self.__class__.__PATTERN_DATA.match(line):
+				return
+			if self.__class__.__PATTERN_HEADER.match(line):
+				continue
+			fields = line.strip().split(self._delimiter)
+			key = fields[0]
+			if not key:
+				continue
+			# Skip empty fields so "Content<sep><sep>value" keeps its value;
+			# a key without any value is stored with an empty string.
+			self.__header[key] = next(
+				(field for field in fields[1:] if field), ""
+			)
+	def __handler_data(self) -> None:
+		""" Processes data and forms an array for further processing.
+		:raises Exception: If there is no ``[Data]`` marker followed by a
+			line of field names, or the requested allele is not present.
+		"""
+		marker = next(
+			(
+				idx for idx, line in enumerate(self._full_data)
+				if self.__class__.__PATTERN_DATA.match(line)
+			),
+			None
+		)
+		# The field names sit on the line right after the [Data] marker.
+		if marker is None or marker + 1 >= len(self._full_data):
+			raise Exception(
+				"Error. Section [Data] with field names not found in file."
+			)
+		columns = self._full_data[marker + 1].split(self._delimiter)
+		names_col = self.__sample_by_allele(columns)
+		if names_col is None:
+			raise Exception(f"Error. Allele {self.__allele} not in data.")
+		self.__snp_data = pd.DataFrame(
+			[
+				item_data.split(self._delimiter)
+				for item_data in self._full_data[marker + 2:]
+			],
+			columns=columns
+		)[names_col]
+	def __sample_by_allele(self, names: list[str]) -> list[str] | None:
+		""" Method that generates a list of field names choosing which alleles
+		to keep
+		:param names: List of field names in the report file.
+		:return: Returns a filtered list of fields by alleles, or None when
+			no field matches the requested allele(s) or the allele argument
+			has an unsupported type.
+		"""
+		match self.__allele:
+			case None:
+				return names
+			case str():
+				wanted = [self.__allele]
+			case list() | tuple() | set():
+				wanted = list(self.__allele)
+			case _:
+				return None
+		# The alternatives are grouped so that "|" cannot detach them from
+		# the "AlleleN - " prefix, and escaped so they are matched literally.
+		allele_pattern = re.compile(
+			r'^Allele\d\s[:-]\s(?:{})\b'.format(
+				"|".join(re.escape(item) for item in wanted)
+			)
+		)
+		selected = {
+			item for item in names
+			if wanted and allele_pattern.search(item)
+		}
+		if not selected:
+			return None
+		# Non-allele fields are always kept, allele fields only if selected.
+		return [
+			item for item in names
+			if not item.startswith("Allele") or item in selected
+		]
+	def __convert_s_id(self, path_file: Path) -> None:
+		"""Converts sample id which is in FinalReport to animal registration
+		number.
+		:param path_file: xlsx file with animal numbers label
+		:raises Exception: If a unique key contains Cyrillic letters.
+		"""
+		self._map_rn = pd.read_excel(
+			path_file,
+			header=None,
+			names=['SID', 'UNIQ_KEY', 'SEX'],
+			dtype={'SID': str},
+			index_col=False
+		)
+		if self._map_rn.empty:
+			self._map_rn = None
+			return
+		self._map_rn.SID = self._map_rn.SID.str.strip()
+		# Only text keys are stripped; missing or numeric keys pass through
+		# untouched instead of breaking the ".str" accessor.
+		self._map_rn.UNIQ_KEY = self._map_rn.UNIQ_KEY.map(
+			lambda key: key.strip() if isinstance(key, str) else key
+		)
+		if self._check_on_ru_symbols(self._map_rn.UNIQ_KEY):
+			raise Exception("Error. Unique keys contain Cyrillic alphabet.")
+		if self._map_rn.UNIQ_KEY.isna().any():
+			self._map_rn.fillna('unknown', inplace=True)
+	@staticmethod
+	def _check_on_ru_symbols(seq: pd.Series) -> bool | None:
+		""" Checks whether any text value holds a Russian Cyrillic letter.
+		:param seq: Values to inspect; non-string entries (for example NaN
+			for a missing key) are ignored.
+		:return: True if at least one string contains a letter from
+			а-я, А-Я, ё or Ё, otherwise False.
+		"""
+		return any(
+			isinstance(item, str)
+			and re.search('[а-яА-ЯёЁ]', item) is not None
+			for item in seq
+		)

finalreport/tests/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+#!/usr/bin/env python
+# coding: utf-8
+__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
+from pathlib import Path
+# Directory "files" located next to this package's __init__.py.
+DIR_FILES = Path(__file__).parent / "files"

finalreport/tests/test_finalreport.py ADDED Viewed

@@ -0,0 +1,215 @@
+#!/usr/bin/env python
+# coding: utf-8
+__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
+from . import DIR_FILES
+from .._finalreport import FinalReport
+import pytest
+@pytest.fixture
+def report(request) -> FinalReport:
+	""" Builds a FinalReport whose allele filter is the indirect parameter. """
+	allele = request.param
+	return FinalReport(allele=allele)
+class TestFinalReport(object):
+	""" Checks of FinalReport.handle and of the allele column filter. """
+	@pytest.mark.parametrize("report", [None], indirect=True)
+	def test_handle_1(self, report: FinalReport) -> None:
+		""" If both files do not exist """
+		fr_dir = DIR_FILES / "fr"
+		processed = report.handle(fr_dir / "f.txt", fr_dir / "f.xlsx")
+		assert not processed
+	@pytest.mark.parametrize("report", [None], indirect=True)
+	def test_handle_2(self, report: FinalReport) -> None:
+		""" If the file to convert does not exist """
+		fr_dir = DIR_FILES / "fr"
+		processed = report.handle(fr_dir / "file1.txt", fr_dir / "f.xlsx")
+		assert not processed
+	@pytest.mark.parametrize("report", [None], indirect=True)
+	def test_handle_3(self, report: FinalReport) -> None:
+		""" If the data does not contain header data """
+		fr_dir = DIR_FILES / "fr"
+		report.handle(fr_dir / "file2.txt", fr_dir / "file2.xlsx")
+		assert len(report.header) == 0
+		assert not report.snp_data.empty
+	@pytest.mark.parametrize("report", [None], indirect=True)
+	def test_handle_4(self, report: FinalReport) -> None:
+		""" If the file contains only header and field names """
+		fr_dir = DIR_FILES / "fr"
+		report.handle(fr_dir / "file3.txt", fr_dir / "file3.xlsx")
+		assert report.snp_data is not None
+		assert report.snp_data.empty
+	@pytest.mark.parametrize("report", [None], indirect=True)
+	def test_handle_5(self, report: FinalReport) -> None:
+		""" If the data file is empty """
+		fr_dir = DIR_FILES / "fr"
+		with pytest.raises(
+				Exception, match="Not data in file FinalReport.txt"
+		):
+			report.handle(fr_dir / "file5.txt", fr_dir / "file5.xlsx")
+		assert report.snp_data is None
+	@pytest.mark.parametrize("report", [None], indirect=True)
+	def test_handle_6(self, report: FinalReport) -> None:
+		""" If the conversion file is empty """
+		fr_dir = DIR_FILES / "fr"
+		processed = report.handle(fr_dir / "file6.txt", fr_dir / "file6.xlsx")
+		assert processed
+		assert not report.snp_data.empty
+		assert len(report.header) != 0
+	@pytest.mark.parametrize("report", [None], indirect=True)
+	def test_handle_7(self, report: FinalReport) -> None:
+		""" If the data file is not needed to convert ID name """
+		report.handle(DIR_FILES / "fr" / "file4.txt", None)
+		assert not report.snp_data.empty
+		assert len(report.header) != 0
+	@pytest.mark.parametrize("report", [None], indirect=True)
+	def test_handle_8(self, report: FinalReport) -> None:
+		""" If files exist """
+		fr_dir = DIR_FILES / "fr"
+		processed = report.handle(fr_dir / "file1.txt", fr_dir / "file1.xlsx")
+		assert processed
+	@pytest.mark.parametrize("report", [None], indirect=True)
+	def test_allele_none(self, report: FinalReport) -> None:
+		""" Without an allele filter no column falls outside the full set """
+		report.handle(DIR_FILES / "fr" / "file4.txt", None)
+		allowed = [
+			'SNP Name', 'Sample ID', 'Allele1 - Forward', 'Allele2 - Forward',
+			'Allele1 - Top', 'Allele2 - Top', 'Allele1 - AB', 'Allele2 - AB',
+			'GC Score', 'X', 'Y'
+		]
+		unexpected = report.snp_data.columns.difference(allowed)
+		assert unexpected.empty
+	@pytest.mark.parametrize("report", ["AB"], indirect=True)
+	def test_sample_allele_ab(self, report: FinalReport) -> None:
+		""" Filter "AB" leaves no column outside the AB set """
+		report.handle(DIR_FILES / "fr" / "file4.txt", None)
+		allowed = [
+			'SNP Name', 'Sample ID', 'Allele1 - AB', 'Allele2 - AB',
+			'GC Score', 'X', 'Y'
+		]
+		unexpected = report.snp_data.columns.difference(allowed)
+		assert unexpected.empty
+	@pytest.mark.parametrize("report", ["Forward"], indirect=True)
+	def test_sample_allele_forward(self, report: FinalReport) -> None:
+		""" Filter "Forward" leaves no column outside the Forward set """
+		report.handle(DIR_FILES / "fr" / "file4.txt", None)
+		allowed = [
+			'SNP Name', 'Sample ID', 'Allele1 - Forward', 'Allele2 - Forward',
+			'GC Score', 'X', 'Y'
+		]
+		unexpected = report.snp_data.columns.difference(allowed)
+		assert unexpected.empty
+	@pytest.mark.parametrize("report", ["Top"], indirect=True)
+	def test_sample_allele_top(self, report: FinalReport) -> None:
+		""" Filter "Top" leaves no column outside the Top set """
+		report.handle(DIR_FILES / "fr" / "file4.txt", None)
+		allowed = [
+			'SNP Name', 'Sample ID', 'Allele1 - Top', 'Allele2 - Top',
+			'GC Score', 'X', 'Y'
+		]
+		unexpected = report.snp_data.columns.difference(allowed)
+		assert unexpected.empty
+	@pytest.mark.parametrize("report", [["AB", "Top"]], indirect=True)
+	def test_sample_allele_list1(self, report: FinalReport) -> None:
+		""" A list of two alleles leaves no column outside the AB/Top set """
+		report.handle(DIR_FILES / "fr" / "file4.txt", None)
+		allowed = [
+			'SNP Name', 'Sample ID', 'Allele1 - Top', 'Allele2 - Top',
+			'Allele1 - AB', 'Allele2 - AB', 'GC Score', 'X', 'Y'
+		]
+		unexpected = report.snp_data.columns.difference(allowed)
+		assert unexpected.empty
+	@pytest.mark.parametrize("report", [["AB"]], indirect=True)
+	def test_sample_allele_list2(self, report: FinalReport) -> None:
+		""" A one-item list leaves no column outside the AB set """
+		report.handle(DIR_FILES / "fr" / "file4.txt", None)
+		allowed = [
+			'SNP Name', 'Sample ID', 'Allele1 - AB', 'Allele2 - AB',
+			'GC Score', 'X', 'Y'
+		]
+		unexpected = report.snp_data.columns.difference(allowed)
+		assert unexpected.empty
+	@pytest.mark.parametrize("report", [("AB", "Top")], indirect=True)
+	def test_sample_allele_tuple(self, report: FinalReport) -> None:
+		""" A tuple of alleles leaves no column outside the AB/Top set """
+		report.handle(DIR_FILES / "fr" / "file4.txt", None)
+		allowed = [
+			'SNP Name', 'Sample ID', 'Allele1 - Top', 'Allele2 - Top',
+			'Allele1 - AB', 'Allele2 - AB', 'GC Score', 'X', 'Y'
+		]
+		unexpected = report.snp_data.columns.difference(allowed)
+		assert unexpected.empty
+	@pytest.mark.parametrize("report", [{"AB", "Top"}], indirect=True)
+	def test_sample_allele_set(self, report: FinalReport) -> None:
+		""" A set of alleles leaves no column outside the AB/Top set """
+		report.handle(DIR_FILES / "fr" / "file4.txt", None)
+		allowed = [
+			'SNP Name', 'Sample ID', 'Allele1 - Top', 'Allele2 - Top',
+			'Allele1 - AB', 'Allele2 - AB', 'GC Score', 'X', 'Y'
+		]
+		unexpected = report.snp_data.columns.difference(allowed)
+		assert unexpected.empty
+	@pytest.mark.parametrize("report", ["GG"], indirect=True)
+	def test_sample_allele_not_exist(self, report: FinalReport) -> None:
+		""" An allele that is absent from the data raises an error """
+		with pytest.raises(
+				Exception, match="Error. Allele GG not in data."
+		):
+			report.handle(DIR_FILES / "fr" / "file4.txt", None)
+	@pytest.mark.parametrize("report", ["AB"], indirect=True)
+	def test_7(self, report: FinalReport) -> None:
+		""" Cyrillic letters in the unique keys raise an error """
+		fr_dir = DIR_FILES / "fr"
+		with pytest.raises(
+			Exception, match="Error. Unique keys contain Cyrillic alphabet."
+		):
+			report.handle(fr_dir / "file7.txt", fr_dir / "file7.xlsx")
+	# TODO: placeholder cases test_8, test_9 and test_10 (allele "AB") were
+	# stubbed out here with empty bodies and are not implemented yet.

format/__init__.py ADDED Viewed

@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+# coding: utf-8
+__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
+from ._snp import Snp
+from ._plink import (
+	make_map,
+	make_ped,
+	make_lgen,
+	make_fam
+)
+__all__ = [
+	"Snp",
+	"make_map",
+	"make_ped",
+	"make_fam",
+	"make_lgen"
+]

format/__settings.py ADDED Viewed

@@ -0,0 +1,7 @@
+#!/usr/bin/env python
+# coding: utf-8
+__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
+# Source column names (FIELDS_ILLUMIN) paired, in order, with their
+# replacement names (RENAME_FIELDS).
+MAP_FIELDS = {
+	'SNP Name': 'SNP_NAME',
+	'Sample ID': 'SAMPLE_ID',
+	'Allele1 - AB': 'ALLELE1',
+	'Allele2 - AB': 'ALLELE2',
+}
+FIELDS_ILLUMIN = list(MAP_FIELDS.keys())
+RENAME_FIELDS = list(MAP_FIELDS.values())