snplib 1.0.7__py3-none-any.whl → 1.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. snplib/__init__.py +8 -8
  2. snplib/finalreport/__init__.py +7 -7
  3. snplib/finalreport/_finalreport.py +251 -251
  4. snplib/format/__init__.py +19 -19
  5. snplib/format/__settings.py +7 -7
  6. snplib/format/_plink.py +305 -305
  7. snplib/format/_snp.py +113 -113
  8. snplib/parentage/__init__.py +15 -15
  9. snplib/parentage/_discov.py +102 -102
  10. snplib/parentage/_isagmark.py +15 -15
  11. snplib/parentage/_verif.py +91 -91
  12. snplib/parentage/isag_disc.pl +0 -0
  13. snplib/parentage/isag_verif.pl +0 -0
  14. snplib/statistics/__init__.py +16 -16
  15. snplib/statistics/_callrate.py +59 -59
  16. snplib/statistics/_freq.py +67 -67
  17. snplib/statistics/_snphwe.py +132 -132
  18. {snplib-1.0.7.dist-info → snplib-1.0.8.dist-info}/LICENSE +674 -674
  19. {snplib-1.0.7.dist-info → snplib-1.0.8.dist-info}/METADATA +80 -97
  20. snplib-1.0.8.dist-info/RECORD +22 -0
  21. snplib/finalreport/tests/__init__.py +0 -7
  22. snplib/finalreport/tests/test_finalreport.py +0 -215
  23. snplib/format/tests/__init__.py +0 -7
  24. snplib/format/tests/test_plink_fam.py +0 -121
  25. snplib/format/tests/test_plink_lgen.py +0 -106
  26. snplib/format/tests/test_plink_map.py +0 -42
  27. snplib/format/tests/test_plink_ped.py +0 -136
  28. snplib/format/tests/test_snp.py +0 -128
  29. snplib/parentage/tests/__init__.py +0 -7
  30. snplib/parentage/tests/test_discov.py +0 -164
  31. snplib/parentage/tests/test_verif.py +0 -160
  32. snplib/statistics/tests/__init__.py +0 -7
  33. snplib/statistics/tests/test_callrate.py +0 -171
  34. snplib/statistics/tests/test_freq_allele.py +0 -87
  35. snplib/statistics/tests/test_freq_maf.py +0 -17
  36. snplib/statistics/tests/test_hwe_t.py +0 -41
  37. snplib/statistics/tests/test_snphwe.py +0 -41
  38. snplib-1.0.7.dist-info/RECORD +0 -37
  39. {snplib-1.0.7.dist-info → snplib-1.0.8.dist-info}/WHEEL +0 -0
  40. {snplib-1.0.7.dist-info → snplib-1.0.8.dist-info}/top_level.txt +0 -0
snplib/format/_snp.py CHANGED
@@ -1,113 +1,113 @@
1
- # !/usr/bin/env python
2
- # coding: utf-8
3
-
4
- __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
5
-
6
- from pathlib import Path
7
- from .__settings import FIELDS_ILLUMIN, MAP_FIELDS
8
-
9
- import pandas as pd
10
-
11
-
12
- class Snp(object):
13
- """ The process of converting genomic map data - FinalReport.txt obtained
14
- from Illumin. Recoding allele data into quantitative data, saving in the
15
- format necessary for calculating gblup on blupf90.
16
-
17
- :argument fmt: Data format to use snp in plink and blupf90. Default
18
- value "uga". """
19
-
20
- _ALLELE_CODE = {
21
- 'AA': 0, 'AB': 1, 'BA': 1, 'BB': 2, '--': 5
22
- }
23
-
24
- _FIELDS = ['SNP_NAME', 'SAMPLE_ID', 'SNP']
25
- _F_DTYPE = dict(zip(_FIELDS, (str for _ in range(len(_FIELDS)))))
26
-
27
- def __init__(self, fmt: str | None = "uga") -> None:
28
- self._format_data = fmt
29
- self.__data_snp = None
30
-
31
- @property
32
- def data(self) -> pd.DataFrame | None:
33
- return self.__data_snp
34
-
35
- def process(self, data: pd.DataFrame) -> None:
36
- """ Data processing and formatting. Calculation of statistical
37
- information
38
-
39
- :param data: Data from FinalReport file. Example:
40
- SNP Name Sample ID Allele1 - AB Allele2 - AB GC Score GT Score
41
- ABCA12 14814 A A 0.4048 0.8164
42
- ARS-BFGL-BAC-13031 14814 B B 0.9083 0.8712
43
- ARS-BFGL-BAC-13039 14814 A A 0.9005 0.9096
44
- ARS-BFGL-BAC-13049 14814 A B 0.9295 0.8926
45
-
46
- :return: Returns true if the data was formatted successfully and
47
- statistical information was calculated, false if an error.
48
- """
49
-
50
- if not all(list(map(lambda x: x in data.columns, FIELDS_ILLUMIN))):
51
- raise KeyError(
52
- 'The name of the fields does not match the finalreport.txt '
53
- 'file from Illumina'
54
- )
55
-
56
- self.__data_snp = data.rename(columns=MAP_FIELDS)
57
- self.__data_snp['SNP'] = \
58
- self.__data_snp[['ALLELE1', 'ALLELE2']].\
59
- sum(axis=1).\
60
- map(Snp._ALLELE_CODE)
61
-
62
- self.__data_snp = self.__data_snp[Snp._FIELDS].astype(Snp._F_DTYPE)
63
-
64
- if self._format_data is not None and self._format_data == "uga":
65
- self.__data_snp = self._format_uga(
66
- self.__data_snp[['SAMPLE_ID', 'SNP']]
67
- )
68
-
69
- @staticmethod
70
- def _format_uga(data: pd.DataFrame) -> pd.DataFrame:
71
- """ Data format to use snp in plink and blupf90. """
72
-
73
- return data.groupby(by='SAMPLE_ID').sum().reset_index()
74
-
75
- def to_file(self, file_path: str | Path) -> None:
76
- """ Saving data to a file.
77
-
78
- :param file_path: Path to file
79
- """
80
-
81
- if isinstance(file_path, str):
82
- file_path = Path(file_path)
83
-
84
- if self._format_data is not None and self._format_data == "uga":
85
-
86
- max_len = self.__data_snp["SAMPLE_ID"].str.len().max()
87
-
88
- self.__data_snp.\
89
- apply(
90
- lambda x: " ".join([
91
- self._add_space(x.iloc[0], max_len), x.iloc[1]
92
- ]),
93
- axis=1
94
- ).\
95
- to_csv(file_path, index=False, header=False)
96
-
97
- self.__data_snp["SAMPLE_ID"] = \
98
- self.__data_snp["SAMPLE_ID"].str.strip()
99
-
100
- return None
101
-
102
- self.__data_snp.to_csv(file_path, sep=" ", index=False)
103
-
104
- @staticmethod
105
- def _add_space(value: str, max_len: int) -> str:
106
- """ Adding spaces up to the maximum length of the value in the
107
- sample_id data.
108
-
109
- :param value: Sample_id value
110
- :param max_len: Max len sample_id value
111
- :return: Return replacing value
112
- """
113
- return "".join([value, " " * (max_len - len(value))])
1
+ # !/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
5
+
6
+ from pathlib import Path
7
+ from .__settings import FIELDS_ILLUMIN, MAP_FIELDS
8
+
9
+ import pandas as pd
10
+
11
+
12
+ class Snp(object):
13
+ """ The process of converting genomic map data - FinalReport.txt obtained
14
+ from Illumin. Recoding allele data into quantitative data, saving in the
15
+ format necessary for calculating gblup on blupf90.
16
+
17
+ :argument fmt: Data format to use snp in plink and blupf90. Default
18
+ value "uga". """
19
+
20
+ _ALLELE_CODE = {
21
+ 'AA': 0, 'AB': 1, 'BA': 1, 'BB': 2, '--': 5
22
+ }
23
+
24
+ _FIELDS = ['SNP_NAME', 'SAMPLE_ID', 'SNP']
25
+ _F_DTYPE = dict(zip(_FIELDS, (str for _ in range(len(_FIELDS)))))
26
+
27
+ def __init__(self, fmt: str | None = "uga") -> None:
28
+ self._format_data = fmt
29
+ self.__data_snp = None
30
+
31
+ @property
32
+ def data(self) -> pd.DataFrame | None:
33
+ return self.__data_snp
34
+
35
+ def process(self, data: pd.DataFrame) -> None:
36
+ """ Data processing and formatting. Calculation of statistical
37
+ information
38
+
39
+ :param data: Data from FinalReport file. Example:
40
+ SNP Name Sample ID Allele1 - AB Allele2 - AB GC Score GT Score
41
+ ABCA12 14814 A A 0.4048 0.8164
42
+ ARS-BFGL-BAC-13031 14814 B B 0.9083 0.8712
43
+ ARS-BFGL-BAC-13039 14814 A A 0.9005 0.9096
44
+ ARS-BFGL-BAC-13049 14814 A B 0.9295 0.8926
45
+
46
+ :return: Returns true if the data was formatted successfully and
47
+ statistical information was calculated, false if an error.
48
+ """
49
+
50
+ if not all(list(map(lambda x: x in data.columns, FIELDS_ILLUMIN))):
51
+ raise KeyError(
52
+ 'The name of the fields does not match the finalreport.txt '
53
+ 'file from Illumina'
54
+ )
55
+
56
+ self.__data_snp = data.rename(columns=MAP_FIELDS)
57
+ self.__data_snp['SNP'] = \
58
+ self.__data_snp[['ALLELE1', 'ALLELE2']].\
59
+ sum(axis=1).\
60
+ map(Snp._ALLELE_CODE)
61
+
62
+ self.__data_snp = self.__data_snp[Snp._FIELDS].astype(Snp._F_DTYPE)
63
+
64
+ if self._format_data is not None and self._format_data == "uga":
65
+ self.__data_snp = self._format_uga(
66
+ self.__data_snp[['SAMPLE_ID', 'SNP']]
67
+ )
68
+
69
+ @staticmethod
70
+ def _format_uga(data: pd.DataFrame) -> pd.DataFrame:
71
+ """ Data format to use snp in plink and blupf90. """
72
+
73
+ return data.groupby(by='SAMPLE_ID').sum().reset_index()
74
+
75
+ def to_file(self, file_path: str | Path) -> None:
76
+ """ Saving data to a file.
77
+
78
+ :param file_path: Path to file
79
+ """
80
+
81
+ if isinstance(file_path, str):
82
+ file_path = Path(file_path)
83
+
84
+ if self._format_data is not None and self._format_data == "uga":
85
+
86
+ max_len = self.__data_snp["SAMPLE_ID"].str.len().max()
87
+
88
+ self.__data_snp.\
89
+ apply(
90
+ lambda x: " ".join([
91
+ self._add_space(x.iloc[0], max_len), x.iloc[1]
92
+ ]),
93
+ axis=1
94
+ ).\
95
+ to_csv(file_path, index=False, header=False)
96
+
97
+ self.__data_snp["SAMPLE_ID"] = \
98
+ self.__data_snp["SAMPLE_ID"].str.strip()
99
+
100
+ return None
101
+
102
+ self.__data_snp.to_csv(file_path, sep=" ", index=False)
103
+
104
+ @staticmethod
105
+ def _add_space(value: str, max_len: int) -> str:
106
+ """ Adding spaces up to the maximum length of the value in the
107
+ sample_id data.
108
+
109
+ :param value: Sample_id value
110
+ :param max_len: Max len sample_id value
111
+ :return: Return replacing value
112
+ """
113
+ return "".join([value, " " * (max_len - len(value))])
snplib/parentage/__init__.py CHANGED
@@ -1,15 +1,15 @@
1
- #!/usr/bin/env python
2
- # coding: utf-8
3
- __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
-
5
- from ._discov import Discovery
6
- from ._verif import Verification
7
- from ._isagmark import isag_verif, isag_disc
8
-
9
-
10
- __all__ = [
11
- "Discovery",
12
- "Verification",
13
- "isag_disc",
14
- "isag_verif"
15
- ]
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from ._discov import Discovery
6
+ from ._verif import Verification
7
+ from ._isagmark import isag_verif, isag_disc
8
+
9
+
10
+ __all__ = [
11
+ "Discovery",
12
+ "Verification",
13
+ "isag_disc",
14
+ "isag_verif"
15
+ ]
snplib/parentage/_discov.py CHANGED
@@ -1,102 +1,102 @@
1
- #!/usr/bin/env python
2
- # coding: utf-8
3
- __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
-
5
- import pandas as pd
6
-
7
- """
8
- Search for paternity according to ICAR recommendations
9
- https://www.icar.org/Documents/GenoEx/ICAR%20Guidelines%20for%20Parentage%20Verification%20and%20Parentage%20Discovery%20based%20on%20SNP.pdf
10
- """
11
-
12
-
13
- class Discovery(object):
14
- """ Search for paternity according to ICAR recommendations
15
-
16
- :argument isag_markers: Fixed sample of markers to confirm paternity.
17
- """
18
-
19
- def __init__(
20
- self, isag_markers: pd.Series | list | set | None = None
21
- ) -> None:
22
- self.__isag_markers = isag_markers
23
-
24
- self.__num_conflicts = None # Number of conflicts
25
- self.__perc_conflicts = None
26
-
27
- @property
28
- def status(self) -> None | str:
29
- """ The status of each parent discovered. """
30
-
31
- if self.__perc_conflicts is not None:
32
- if 0 <= self.__perc_conflicts < 1:
33
- return 'Discovered'
34
- elif 1 < self.__perc_conflicts < 3:
35
- return 'Doubtful'
36
- elif self.__perc_conflicts >= 3:
37
- return 'Excluded'
38
- else:
39
- return None
40
-
41
- @property
42
- def num_conflicts(self) -> None | int:
43
- return self.__num_conflicts
44
-
45
- @property
46
- def perc_conflicts(self) -> None | float:
47
- return self.__perc_conflicts
48
-
49
- def search_parent(
50
- self,
51
- data: pd.DataFrame,
52
- descendant: str,
53
- parents: str,
54
- snp_name_col: str
55
- ) -> None:
56
- """ Search for paternity.
57
-
58
- :param data: SNP data for descendant and parent.
59
- :param descendant: Columns name of the descendant in the data.
60
- :param parents: Columns name or list name of the parents in the data.
61
- :param snp_name_col: SNP columns name is data.
62
- """
63
-
64
- if self.__isag_markers is None:
65
- raise ValueError("Error. No array of snp names to verify")
66
-
67
- sample_by_markers = data.loc[
68
- data[snp_name_col].isin(self.__isag_markers),
69
- [snp_name_col, descendant, parents]
70
- ]
71
-
72
- # Filtering 5s from a descendent
73
- desc_marks = sample_by_markers.loc[
74
- sample_by_markers[descendant] != 5, [snp_name_col, descendant]
75
- ]
76
-
77
- # According to ICAR, the number of available markers must be
78
- # above 450
79
- if len(desc_marks) < 450:
80
- raise Exception("Calf call rate is low.")
81
-
82
- # Common after filtering markers of potential ancestors
83
- sample_parents = sample_by_markers.loc[
84
- sample_by_markers[snp_name_col].isin(desc_marks[snp_name_col]),
85
- parents
86
- ]
87
-
88
- # Number of available markers in potential ancestors
89
- prob_parents_same_n_markers = (sample_parents < 5).sum()
90
-
91
- # number of conflicts
92
- self.__num_conflicts = (
93
- abs(sample_parents.sub(desc_marks[descendant], axis=0)) == 2
94
- ).sum()
95
-
96
- # Percentage of conflicts
97
- self.__perc_conflicts = (
98
- (self.__num_conflicts / prob_parents_same_n_markers) * 100
99
- ).round(2)
100
-
101
- def __status_define(self) -> None:
102
- ...
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ import pandas as pd
6
+
7
+ """
8
+ Search for paternity according to ICAR recommendations
9
+ https://www.icar.org/Documents/GenoEx/ICAR%20Guidelines%20for%20Parentage%20Verification%20and%20Parentage%20Discovery%20based%20on%20SNP.pdf
10
+ """
11
+
12
+
13
+ class Discovery(object):
14
+ """ Search for paternity according to ICAR recommendations
15
+
16
+ :argument isag_markers: Fixed sample of markers to confirm paternity.
17
+ """
18
+
19
+ def __init__(
20
+ self, isag_markers: pd.Series | list | set | None = None
21
+ ) -> None:
22
+ self.__isag_markers = isag_markers
23
+
24
+ self.__num_conflicts = None # Number of conflicts
25
+ self.__perc_conflicts = None
26
+
27
+ @property
28
+ def status(self) -> None | str:
29
+ """ The status of each parent discovered. """
30
+
31
+ if self.__perc_conflicts is not None:
32
+ if 0 <= self.__perc_conflicts < 1:
33
+ return 'Discovered'
34
+ elif 1 < self.__perc_conflicts < 3:
35
+ return 'Doubtful'
36
+ elif self.__perc_conflicts >= 3:
37
+ return 'Excluded'
38
+ else:
39
+ return None
40
+
41
+ @property
42
+ def num_conflicts(self) -> None | int:
43
+ return self.__num_conflicts
44
+
45
+ @property
46
+ def perc_conflicts(self) -> None | float:
47
+ return self.__perc_conflicts
48
+
49
+ def search_parent(
50
+ self,
51
+ data: pd.DataFrame,
52
+ descendant: str,
53
+ parents: str,
54
+ snp_name_col: str
55
+ ) -> None:
56
+ """ Search for paternity.
57
+
58
+ :param data: SNP data for descendant and parent.
59
+ :param descendant: Columns name of the descendant in the data.
60
+ :param parents: Columns name or list name of the parents in the data.
61
+ :param snp_name_col: SNP columns name is data.
62
+ """
63
+
64
+ if self.__isag_markers is None:
65
+ raise ValueError("Error. No array of snp names to verify")
66
+
67
+ sample_by_markers = data.loc[
68
+ data[snp_name_col].isin(self.__isag_markers),
69
+ [snp_name_col, descendant, parents]
70
+ ]
71
+
72
+ # Filtering 5s from a descendent
73
+ desc_marks = sample_by_markers.loc[
74
+ sample_by_markers[descendant] != 5, [snp_name_col, descendant]
75
+ ]
76
+
77
+ # According to ICAR, the number of available markers must be
78
+ # above 450
79
+ if len(desc_marks) < 450:
80
+ raise Exception("Calf call rate is low.")
81
+
82
+ # Common after filtering markers of potential ancestors
83
+ sample_parents = sample_by_markers.loc[
84
+ sample_by_markers[snp_name_col].isin(desc_marks[snp_name_col]),
85
+ parents
86
+ ]
87
+
88
+ # Number of available markers in potential ancestors
89
+ prob_parents_same_n_markers = (sample_parents < 5).sum()
90
+
91
+ # number of conflicts
92
+ self.__num_conflicts = (
93
+ abs(sample_parents.sub(desc_marks[descendant], axis=0)) == 2
94
+ ).sum()
95
+
96
+ # Percentage of conflicts
97
+ self.__perc_conflicts = (
98
+ (self.__num_conflicts / prob_parents_same_n_markers) * 100
99
+ ).round(2)
100
+
101
+ def __status_define(self) -> None:
102
+ ...
snplib/parentage/_isagmark.py CHANGED
@@ -1,15 +1,15 @@
1
- #!/usr/bin/env python
2
- # coding: utf-8
3
- __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
-
5
- from pathlib import Path
6
-
7
- import pandas as pd
8
-
9
-
10
- def isag_disc() -> pd.DataFrame:
11
- return pd.read_pickle(Path(__file__).parent.joinpath("isag_disc.pl"))
12
-
13
-
14
- def isag_verif() -> pd.DataFrame:
15
- return pd.read_pickle(Path(__file__).parent.joinpath("isag_verif.pl"))
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from pathlib import Path
6
+
7
+ import pandas as pd
8
+
9
+
10
+ def isag_disc() -> pd.DataFrame:
11
+ return pd.read_pickle(Path(__file__).parent.joinpath("isag_disc.pl"))
12
+
13
+
14
+ def isag_verif() -> pd.DataFrame:
15
+ return pd.read_pickle(Path(__file__).parent.joinpath("isag_verif.pl"))