PyPI - snplib - Versions diffs - 1.0.10__py3-none-any.whl → 1.2.10__py3-none-any.whl - Mend

snplib 1.0.10__py3-none-any.whl → 1.2.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

snplib/finalreport/_finalreport.py CHANGED Viewed

@@ -3,20 +3,27 @@
 __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
 __all__ = ("FinalReport",)
-from pathlib import Path
+import re
 from functools import reduce
+from pathlib import Path
-import re
 import pandas as pd
+from numpy import nan
 class FinalReport(object):
-	""" File that contains SNP information.
-	:argument allele: A variant form of a single nucleotide polymorphism
-		(SNP), a specific polymorphic site or a whole gene detectable at
-		a locus.  Type: 'AB', 'Forward', 'Top', 'Plus', 'Design'
-	:argument sep: Delimiter to use. Default value: "\\t"
+	""" File that contains SNP information. File processing is triggered by the
+	handle method. If values in 'SID' or 'UNIQ_KEY' were missing in the xlsx
+	conversion file, the processed data will contain NAN values.
+	:param allele: A variant form of a single nucleotide polymorphism (SNP), a
+		specific polymorphic site or a whole gene detectable at a locus. Type:
+		'AB', 'Forward', 'Top', 'Plus', 'Design'.
+	:param sep: Delimiter to use. Default value: "\\t".
+	:param usecols: Selection of fields for reading. Accelerates processing
+		and reduces memory.
+	:param dtype: Data type(s) to apply to either the whole dataset or
+		individual columns. E.g., {'a': np.float64, 'b': np.int32, 'c': 'Int64'}.
 	Example:
 		[Header]
@@ -34,20 +41,34 @@ class FinalReport(object):
 		...
 	"""
-	__PATTERN_HEADER = re.compile(r'(^\[Header\])')
-	__PATTERN_DATA = re.compile(r'(^\[Data\])')
+	__PATTERN_HEADER = re.compile(r'(^\[Header])')
+	__PATTERN_DATA = re.compile(r'(^\[Data])')
+	__slots__ = (
+		"_delimiter",
+		"__allele",
+		"__usecols",
+		"__dtype",
+		"__snp_data",
+		"__header",
+		"_map_rn",
+	)
 	def __init__(
 			self,
 			allele: str | list | None = None,
+			usecols: list[str] | None = None,
+			dtype: dict | None = None,
 			sep: str = "\t"
 	) -> None:
 		self._delimiter = sep
-		self._full_data = None
+		self.__allele = allele
+		self.__usecols = usecols
+		self.__dtype = dtype
+		# self._full_data = None
+		self.__snp_data: pd.DataFrame | None = None
 		self.__header = {}
-		self.__snp_data = None
-		self.__allele = allele
 		self._map_rn = None
 	@property
@@ -73,6 +94,9 @@ class FinalReport(object):
 		try:
+			if self.__allele is not None and self.__usecols is not None:
+				raise Exception("Error. Usecols is used for allele is none.")
 			if isinstance(file_rep, str):
 				file_rep = Path(file_rep)
@@ -89,17 +113,11 @@ class FinalReport(object):
 				self.__convert_s_id(conv_file)
-			# Processing report file
-			if not self.read(file_rep):
-				return False
-			if self._full_data is None:
-				raise Exception("Not data in file FinalReport.txt")
-			self.__handler_header()
-			self.__handler_data()
+			# # Processing report file
+			self.__handler_header(file_rep)
+			self.__handler_data(file_rep)
-			if self._map_rn is not None:
+			if not self.__snp_data.empty and self._map_rn is not None:
 				self.__snp_data['Sample ID'] = \
 					self.__snp_data['Sample ID'].map(
 						dict(zip(self._map_rn.SID, self._map_rn.UNIQ_KEY))
@@ -110,62 +128,99 @@ class FinalReport(object):
 		return True
-	def read(self, file_rep: Path) -> bool:
-		""" Reading data from the final_report file
+	def __handler_header(self, file_rep: Path) -> None:
+		""" Processes data from a file, selects meta-information.
 		:param file_rep: path, pointer to the file to be read.
-		:return: Returns true if the read was successful, false if it failed.
 		"""
-		try:
-			if len(data := file_rep.read_text()) != 0:
-				self._full_data = data.strip().split("\n")
-				return True
-			self._full_data = None
+		with open(file_rep, 'r') as file:
-		except Exception as e:
-			return False
+			for line in file:
+				if self.__class__.__PATTERN_DATA.findall(line.strip()):
+					return
-		return True
+				if self.__class__.__PATTERN_HEADER.findall(line.strip()) or\
+					len(line.strip()) == 0:
+					continue
-	def __handler_header(self) -> None:
-		""" Processes data from a file, selects meta-information. """
+				key = line.strip().split("\t")[0]
+				value = line.strip().split("\t")[1]
+				self.__header[key] = value
+	def __handler_data(self, file_rep: Path) -> None:
+		""" Processes data and forms an array for further processing.
+		:param file_rep: path, pointer to the file to be read.
+		"""
+		with open(file_rep, 'r') as file:
+			# Search for the data start index and skip
+			for line in file:
+				if self.__class__.__PATTERN_DATA.findall(line.strip()):
+					break
+			# line column
+			orig_name_col = file.readline().strip().split(self._delimiter)
+			if self.__allele is None and self.__usecols is None:
+				self.__snp_data = pd.read_csv(
+					file,
+					sep=self._delimiter,
+					header=None,
+					names=orig_name_col,
+					dtype=self.__dtype,
+					low_memory=True,
+					na_filter=True
+				)
-		for line in self._full_data:
-			if self.__class__.__PATTERN_DATA.findall(line):
 				return
-			if self.__class__.__PATTERN_HEADER.findall(line):
-				continue
+			sub_n_col = self.__processing_columns(orig_name_col)
+			self.__snp_data = pd.read_csv(
+				file,
+				sep=self._delimiter,
+				header=None,
+				names=orig_name_col,
+				usecols=sub_n_col,
+				dtype=self.__dtype,
+				low_memory=True,
+				na_filter=True
+			)
-			key = line.strip().split("\t")[0]
-			value = line.strip().split("\t")[1]
+			return
-			self.__header[key] = value
+	def __processing_columns(self, lst_col: list[str]) -> list[str] | None:
+		""" Processing the line with all the names of the fields and the
+		sample of them.
-	def __handler_data(self) -> None:
-		""" Processes data and forms an array for further processing. """
+		:param lst_col: List of all fields.
+		:return: Returns a tuple with a list of names of selected fields.
+		"""
-		temp = 1
-		for line in self._full_data:
-			if self.__class__.__PATTERN_DATA.findall(line):
-				break
-			temp += 1
+		if self.__usecols is not None:
+			check_n_col = [
+				item for item in self.__usecols if item in lst_col
+			]
-		names_col = self.__sample_by_allele(
-			self._full_data[temp].split(f"{self._delimiter}")
-		)
+			# Check on empty list
+			if check_n_col:
+				return self.__usecols
-		if names_col is None:
-			raise Exception(f"Error. Allele {self.__allele} not in data.")
+			raise Exception(
+				f"Error. The USECOLS list contains not true fields."
+			)
-		self.__snp_data = pd.DataFrame(
-			[
-				item_data.split(f"{self._delimiter}")
-				for item_data in self._full_data[temp + 1:]
-			],
-			columns=self._full_data[temp].split(f"{self._delimiter}")
-		)[names_col]
+		# processing alleles
+		sample_n_col = self.__sample_by_allele(lst_col)
+		if sample_n_col is None:
+			raise Exception(
+				f"Error. Allele {self.__allele} not in data."
+			)
+		return sample_n_col
 	def __sample_by_allele(self, names: list[str]) -> list[str] | None:
 		""" Method that generates a list of field names choosing which alleles
@@ -235,17 +290,15 @@ class FinalReport(object):
 		if self._check_on_ru_symbols(self._map_rn.UNIQ_KEY):
 			raise Exception("Error. Unique keys contain Cyrillic alphabet.")
-		if self._map_rn.UNIQ_KEY.isna().any():
-			self._map_rn.fillna('unknown', inplace=True)
 	@staticmethod
 	def _check_on_ru_symbols(seq: pd.Series) -> bool | None:
-		"""
+		""" Checial verification of the Cyrillic
-		:param seq:
-		:return:
+		:param seq: Squeezed for verification.
+		:return: Truth if there are no symbols of Cyril and there is a lie if
+			there is.
 		"""
-		return any(seq.apply(lambda x: bool(re.search('[а-яА-Я]', x))))
+		return seq.apply(
+			lambda x: bool(re.search('[а-яА-Я]', x)) if x is not nan else x
+		).any()

{snplib-1.0.10.dist-info → snplib-1.2.10.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: snplib
-Version: 1.0.10
+Version: 1.2.10
 Summary: Snptools is a tool for Single Nucleotide Polymorphism (SNP) data processing
 Author-email: Igor <igor.loschinin@gmail.com>
 License: GNU
@@ -10,14 +10,14 @@ Classifier: Operating System :: OS Independent
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: numpy>=1.26.1
-Requires-Dist: pandas>=2.1.1
-Requires-Dist: six>=1.16.0
+Requires-Dist: numpy>=2.2.3
+Requires-Dist: pandas>=2.2.3
+Requires-Dist: six>=1.17.0
 Requires-Dist: swifter>=1.4.0
 Requires-Dist: xlrd>=2.0.1
-Requires-Dist: XlsxWriter>=3.1.9
-Requires-Dist: openpyxl>=3.1.2
-Requires-Dist: pydantic>=2.4.2
+Requires-Dist: XlsxWriter>=3.2.2
+Requires-Dist: openpyxl>=3.1.5
+Requires-Dist: pydantic>=2.10.6
 # snptools
 <p align="center">

{snplib-1.0.10.dist-info → snplib-1.2.10.dist-info}/RECORD RENAMED Viewed

@@ -1,6 +1,6 @@
 snplib/__init__.py,sha256=xhjj4ZywdwCq91GBh1zfBP_TwFW26-KpHcCUUVvMdgI,196
 snplib/finalreport/__init__.py,sha256=Yk49x8t-STIfsdP6QLMtaGm1gTj_n-XS8kchPguvW1g,161
-snplib/finalreport/_finalreport.py,sha256=el_d8MVmpic3wKCRJ-J52VZYSmMuNSf4p_tmPkgh0Z0,5876
+snplib/finalreport/_finalreport.py,sha256=_VXv8ayTIJBGGkpXYtrvBEp2HNuQ8Dh3zqS8HadlnHo,7501
 snplib/format/__init__.py,sha256=3W_l_sP1u9HV3HWwnsJxPGw9anrVknstqLaJmWQaG0k,261
 snplib/format/__settings.py,sha256=kyAVZ4tiU61sNr3jQhjXbLXRyBA3pjFfCw3fOfSkY14,289
 snplib/format/_plink.py,sha256=cjT6PkvDJr8KwvQo76i7_Hm1Og4bASYCDN9G7CHsQ00,10372
@@ -15,8 +15,8 @@ snplib/statistics/__init__.py,sha256=XJFU7mEwAJJ2M187jEkO8rFNYKoxF-g9KF_stS7eFFw
 snplib/statistics/_callrate.py,sha256=yfHxnNVpcDfV3qxZVwrk2RWPgy9dTf7NHWczDUORwtY,1866
 snplib/statistics/_freq.py,sha256=ZPZBZM3xq9EseOxuMzRVvzkjjFfaaA4ZvF7XI8ctON0,1623
 snplib/statistics/_snphwe.py,sha256=KcoRGwovMCc53-GJ8VfYs_3ZEHObgt8B0EvrW5nFnmM,3353
-snplib-1.0.10.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-snplib-1.0.10.dist-info/METADATA,sha256=pKp9XmfbbogkNsdC_SEpsx1hLTdFxWaD9YgrIq7j0e4,2184
-snplib-1.0.10.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-snplib-1.0.10.dist-info/top_level.txt,sha256=CGCrLXuCSyXPCTwMFQjPxQR7b93FFFft56sAPPun_2g,7
-snplib-1.0.10.dist-info/RECORD,,
+snplib-1.2.10.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+snplib-1.2.10.dist-info/METADATA,sha256=ZjG7lspM2kiKEQZCTFHQDNY3yrU6zyfTnLBaYGAzRU4,2184
+snplib-1.2.10.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+snplib-1.2.10.dist-info/top_level.txt,sha256=CGCrLXuCSyXPCTwMFQjPxQR7b93FFFft56sAPPun_2g,7
+snplib-1.2.10.dist-info/RECORD,,

{snplib-1.0.10.dist-info → snplib-1.2.10.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.8.0)
+Generator: setuptools (75.8.2)
 Root-Is-Purelib: true
 Tag: py3-none-any

{snplib-1.0.10.dist-info → snplib-1.2.10.dist-info}/LICENSE RENAMED Viewed

File without changes

{snplib-1.0.10.dist-info → snplib-1.2.10.dist-info}/top_level.txt RENAMED Viewed

File without changes

snplib 1.0.10__py3-none-any.whl → 1.2.10__py3-none-any.whl

snplib 1.0.10__py3-none-any.whl → 1.2.10__py3-none-any.whl