snplib 1.0.7__py3-none-any.whl → 1.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. snplib/__init__.py +8 -8
  2. snplib/finalreport/__init__.py +7 -7
  3. snplib/finalreport/_finalreport.py +251 -251
  4. snplib/format/__init__.py +19 -19
  5. snplib/format/__settings.py +7 -7
  6. snplib/format/_plink.py +305 -305
  7. snplib/format/_snp.py +113 -113
  8. snplib/parentage/__init__.py +15 -15
  9. snplib/parentage/_discov.py +102 -102
  10. snplib/parentage/_isagmark.py +15 -15
  11. snplib/parentage/_verif.py +91 -91
  12. snplib/parentage/isag_disc.pl +0 -0
  13. snplib/parentage/isag_verif.pl +0 -0
  14. snplib/statistics/__init__.py +16 -16
  15. snplib/statistics/_callrate.py +59 -59
  16. snplib/statistics/_freq.py +67 -67
  17. snplib/statistics/_snphwe.py +132 -132
  18. {snplib-1.0.7.dist-info → snplib-1.0.8.dist-info}/LICENSE +674 -674
  19. {snplib-1.0.7.dist-info → snplib-1.0.8.dist-info}/METADATA +80 -97
  20. snplib-1.0.8.dist-info/RECORD +22 -0
  21. snplib/finalreport/tests/__init__.py +0 -7
  22. snplib/finalreport/tests/test_finalreport.py +0 -215
  23. snplib/format/tests/__init__.py +0 -7
  24. snplib/format/tests/test_plink_fam.py +0 -121
  25. snplib/format/tests/test_plink_lgen.py +0 -106
  26. snplib/format/tests/test_plink_map.py +0 -42
  27. snplib/format/tests/test_plink_ped.py +0 -136
  28. snplib/format/tests/test_snp.py +0 -128
  29. snplib/parentage/tests/__init__.py +0 -7
  30. snplib/parentage/tests/test_discov.py +0 -164
  31. snplib/parentage/tests/test_verif.py +0 -160
  32. snplib/statistics/tests/__init__.py +0 -7
  33. snplib/statistics/tests/test_callrate.py +0 -171
  34. snplib/statistics/tests/test_freq_allele.py +0 -87
  35. snplib/statistics/tests/test_freq_maf.py +0 -17
  36. snplib/statistics/tests/test_hwe_t.py +0 -41
  37. snplib/statistics/tests/test_snphwe.py +0 -41
  38. snplib-1.0.7.dist-info/RECORD +0 -37
  39. {snplib-1.0.7.dist-info → snplib-1.0.8.dist-info}/WHEEL +0 -0
  40. {snplib-1.0.7.dist-info → snplib-1.0.8.dist-info}/top_level.txt +0 -0
@@ -1,91 +1,91 @@
1
- #!/usr/bin/env python
2
- # coding: utf-8
3
- __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
-
5
- import numpy as np
6
- import pandas as pd
7
-
8
-
9
- """
10
- https://www.icar.org/Documents/GenoEx/ICAR%20Guidelines%20for%20Parentage%20Verification%20and%20Parentage%20Discovery%20based%20on%20SNP.pdf
11
- """
12
-
13
-
14
- class Verification(object):
15
- """
16
- Verification of paternity according to ICAR recommendations.
17
-
18
- :argument isag_marks: Fixed sample of markers to confirm paternity.
19
- """
20
-
21
- def __init__(
22
- self, isag_marks: pd.Series | list | set | None = None
23
- ) -> None:
24
- self.__isag_marks = isag_marks
25
-
26
- # The minimum number of SNP available in the profile
27
- # of each animal and potential parent must be scaled (i.e.: 95%
28
- # truncated down)
29
- self.__min_num_snp = 0.95
30
- self.__num_conflicts = None # Number of conflicts
31
-
32
- @property
33
- def status(self) -> None | str:
34
- if self.__num_conflicts is not None:
35
- if self.__num_conflicts <= 2:
36
- return 'Accept'
37
- elif 3 <= self.__num_conflicts <= 5:
38
- return 'Doubtful'
39
- elif self.__num_conflicts > 5:
40
- return 'Excluded'
41
- else:
42
- return None
43
-
44
- @property
45
- def num_conflicts(self) -> None | int:
46
- return self.__num_conflicts
47
-
48
- def check_on(
49
- self,
50
- data: pd.DataFrame,
51
- descendant: str,
52
- parent: str,
53
- snp_name_col: str
54
- ) -> None:
55
- """ Verification of paternity according to ICAR recommendations.
56
-
57
- :param data: SNP data for descendant and parent.
58
- :param descendant: Columns name of the descendant in the data.
59
- :param parent: Columns name of the parent in the data.
60
- :param snp_name_col: SNP column name in data.
61
- """
62
-
63
- if self.__isag_marks is None:
64
- raise ValueError('Error. No array of snp names to verify')
65
-
66
- num_isag_mark = len(self.__isag_marks)
67
- min_num_comm_snp = int(num_isag_mark - (2 * (num_isag_mark * 0.05)))
68
-
69
- sample_mark = data.loc[
70
- data[snp_name_col].isin(self.__isag_marks), [descendant, parent]
71
- ]
72
-
73
- # The number of markers is not 5ok
74
- desc_n_markers = (sample_mark[descendant] < 5).sum()
75
- parent_n_markers = (sample_mark[parent] < 5).sum()
76
-
77
- # According to ICAR, the number of markers not 5ok should be more
78
- # than 95%
79
- if (desc_n_markers < num_isag_mark * self.__min_num_snp) and \
80
- (parent_n_markers < num_isag_mark * self.__min_num_snp):
81
- raise Exception('Calf and parent have low call rate')
82
-
83
- comm_snp_no_missing = sample_mark.replace(5, np.nan).dropna()
84
- num_comm_markers = len(comm_snp_no_missing)
85
-
86
- if num_comm_markers < min_num_comm_snp:
87
- raise Exception('Pair call rate is low')
88
-
89
- self.__num_conflicts = (abs(
90
- comm_snp_no_missing[descendant] - comm_snp_no_missing[parent]
91
- ) == 2).sum()
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+
9
+ """
10
+ https://www.icar.org/Documents/GenoEx/ICAR%20Guidelines%20for%20Parentage%20Verification%20and%20Parentage%20Discovery%20based%20on%20SNP.pdf
11
+ """
12
+
13
+
14
+ class Verification(object):
15
+ """
16
+ Verification of paternity according to ICAR recommendations.
17
+
18
+ :argument isag_marks: Fixed sample of markers to confirm paternity.
19
+ """
20
+
21
+ def __init__(
22
+ self, isag_marks: pd.Series | list | set | None = None
23
+ ) -> None:
24
+ self.__isag_marks = isag_marks
25
+
26
+ # The minimum number of SNP available in the profile
27
+ # of each animal and potential parent must be scaled (i.e.: 95%
28
+ # truncated down)
29
+ self.__min_num_snp = 0.95
30
+ self.__num_conflicts = None # Number of conflicts
31
+
32
+ @property
33
+ def status(self) -> None | str:
34
+ if self.__num_conflicts is not None:
35
+ if self.__num_conflicts <= 2:
36
+ return 'Accept'
37
+ elif 3 <= self.__num_conflicts <= 5:
38
+ return 'Doubtful'
39
+ elif self.__num_conflicts > 5:
40
+ return 'Excluded'
41
+ else:
42
+ return None
43
+
44
+ @property
45
+ def num_conflicts(self) -> None | int:
46
+ return self.__num_conflicts
47
+
48
+ def check_on(
49
+ self,
50
+ data: pd.DataFrame,
51
+ descendant: str,
52
+ parent: str,
53
+ snp_name_col: str
54
+ ) -> None:
55
+ """ Verification of paternity according to ICAR recommendations.
56
+
57
+ :param data: SNP data for descendant and parent.
58
+ :param descendant: Columns name of the descendant in the data.
59
+ :param parent: Columns name of the parent in the data.
60
+ :param snp_name_col: SNP column name in data.
61
+ """
62
+
63
+ if self.__isag_marks is None:
64
+ raise ValueError('Error. No array of snp names to verify')
65
+
66
+ num_isag_mark = len(self.__isag_marks)
67
+ min_num_comm_snp = int(num_isag_mark - (2 * (num_isag_mark * 0.05)))
68
+
69
+ sample_mark = data.loc[
70
+ data[snp_name_col].isin(self.__isag_marks), [descendant, parent]
71
+ ]
72
+
73
+ # The number of markers is not 5ok
74
+ desc_n_markers = (sample_mark[descendant] < 5).sum()
75
+ parent_n_markers = (sample_mark[parent] < 5).sum()
76
+
77
+ # According to ICAR, the number of markers not 5ok should be more
78
+ # than 95%
79
+ if (desc_n_markers < num_isag_mark * self.__min_num_snp) and \
80
+ (parent_n_markers < num_isag_mark * self.__min_num_snp):
81
+ raise Exception('Calf and parent have low call rate')
82
+
83
+ comm_snp_no_missing = sample_mark.replace(5, np.nan).dropna()
84
+ num_comm_markers = len(comm_snp_no_missing)
85
+
86
+ if num_comm_markers < min_num_comm_snp:
87
+ raise Exception('Pair call rate is low')
88
+
89
+ self.__num_conflicts = (abs(
90
+ comm_snp_no_missing[descendant] - comm_snp_no_missing[parent]
91
+ ) == 2).sum()
Binary file
Binary file
@@ -1,16 +1,16 @@
1
- #!/usr/bin/env python
2
- # coding: utf-8
3
- __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
-
5
- from ._snphwe import hwe, hwe_test
6
- from ._callrate import call_rate
7
- from ._freq import allele_freq, minor_allele_freq
8
-
9
-
10
- __all__ = [
11
- "call_rate",
12
- "allele_freq",
13
- "minor_allele_freq",
14
- "hwe",
15
- "hwe_test"
16
- ]
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from ._snphwe import hwe, hwe_test
6
+ from ._callrate import call_rate
7
+ from ._freq import allele_freq, minor_allele_freq
8
+
9
+
10
+ __all__ = [
11
+ "call_rate",
12
+ "allele_freq",
13
+ "minor_allele_freq",
14
+ "hwe",
15
+ "hwe_test"
16
+ ]
@@ -1,59 +1,59 @@
1
- #!/usr/bin/env python
2
- # coding: utf-8
3
- __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
-
5
- import pandas as pd
6
-
7
-
8
- def call_rate(
9
- data: pd.DataFrame | str,
10
- id_col: str = None,
11
- snp_col: str = None
12
- ) -> pd.DataFrame | float | None:
13
- """ The call rate for a given SNP is defined as the proportion of
14
- individuals in the study for which the corresponding SNP information is
15
- not missing. In the following example, we filter using a call rate of 95%,
16
- meaning we retain SNPs for which there is less than 5% missing data.
17
-
18
- Of the say, 54K markers in the chip, 50K have been genotyped for a
19
- particular animal, the “call rate animal” is 50K/54K=93%
20
- Of the say, 900 animals genotyped for marker CL635944_160.1, how many
21
- have actually been successfully read? Assume that 600 have been read, then
22
- the “call rate marker” is 600/900 = 67%
23
-
24
- :param data: Pre-processed data on which the call rate is calculated.
25
- :param id_col: The name of the column with the id of the animals or
26
- markers.
27
- :param snp_col: The name of the column with the snp sequence.
28
- :return: Return dataframe with call rates for each animal if a dataframe
29
- is transmitted. The number if the snp sequence is passed as a string.
30
- None if there were errors.
31
- """
32
-
33
- if isinstance(data, pd.DataFrame):
34
- try:
35
- if data[snp_col].dtype.hasobject:
36
- if not data[snp_col].str.isdigit().all():
37
- return None
38
-
39
- return data[[id_col, snp_col]].\
40
- groupby(by=id_col)[snp_col].\
41
- apply(lambda x: 1 - ((x == "5").sum() / len(x))).\
42
- reset_index()
43
-
44
- return data[[id_col, snp_col]]. \
45
- groupby(by=id_col)[snp_col]. \
46
- apply(lambda x: 1 - ((x == 5).sum() / len(x))). \
47
- reset_index()
48
-
49
- except Exception as e:
50
- raise e
51
-
52
- elif isinstance(data, str):
53
- if not data.isdigit():
54
- return None
55
-
56
- return round(1 - (data.count('5') / len(data)), 6)
57
-
58
- else:
59
- return None
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ import pandas as pd
6
+
7
+
8
+ def call_rate(
9
+ data: pd.DataFrame | str,
10
+ id_col: str = None,
11
+ snp_col: str = None
12
+ ) -> pd.DataFrame | float | None:
13
+ """ The call rate for a given SNP is defined as the proportion of
14
+ individuals in the study for which the corresponding SNP information is
15
+ not missing. In the following example, we filter using a call rate of 95%,
16
+ meaning we retain SNPs for which there is less than 5% missing data.
17
+
18
+ Of the say, 54K markers in the chip, 50K have been genotyped for a
19
+ particular animal, the “call rate animal” is 50K/54K=93%
20
+ Of the say, 900 animals genotyped for marker CL635944_160.1, how many
21
+ have actually been successfully read? Assume that 600 have been read, then
22
+ the “call rate marker” is 600/900 = 67%
23
+
24
+ :param data: Pre-processed data on which the call rate is calculated.
25
+ :param id_col: The name of the column with the id of the animals or
26
+ markers.
27
+ :param snp_col: The name of the column with the snp sequence.
28
+ :return: Return dataframe with call rates for each animal if a dataframe
29
+ is transmitted. The number if the snp sequence is passed as a string.
30
+ None if there were errors.
31
+ """
32
+
33
+ if isinstance(data, pd.DataFrame):
34
+ try:
35
+ if data[snp_col].dtype.hasobject:
36
+ if not data[snp_col].str.isdigit().all():
37
+ return None
38
+
39
+ return data[[id_col, snp_col]].\
40
+ groupby(by=id_col)[snp_col].\
41
+ apply(lambda x: 1 - ((x == "5").sum() / len(x))).\
42
+ reset_index()
43
+
44
+ return data[[id_col, snp_col]]. \
45
+ groupby(by=id_col)[snp_col]. \
46
+ apply(lambda x: 1 - ((x == 5).sum() / len(x))). \
47
+ reset_index()
48
+
49
+ except Exception as e:
50
+ raise e
51
+
52
+ elif isinstance(data, str):
53
+ if not data.isdigit():
54
+ return None
55
+
56
+ return round(1 - (data.count('5') / len(data)), 6)
57
+
58
+ else:
59
+ return None
@@ -1,67 +1,67 @@
1
- #!/usr/bin/env python
2
- # coding: utf-8
3
- __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
-
5
- import pandas as pd
6
-
7
-
8
- def allele_freq(
9
- data: pd.DataFrame | str, id_col: str = None, seq_col: str = None
10
- ) -> pd.DataFrame | float | None:
11
- """ The allele frequency represents the incidence of a gene variant in a
12
- population.
13
-
14
- :param data: Data array.
15
- :param id_col: Columns with snp names.
16
- :param seq_col: Columns with value snp in format ucg - 0, 1, 2, 5.
17
- :return: Return the alleles frequency.
18
- """
19
-
20
- if isinstance(data, pd.DataFrame):
21
- try:
22
- if data[seq_col].dtype.hasobject:
23
- if not data[seq_col].str.isdigit().all():
24
- return None
25
-
26
- return data.\
27
- loc[data[seq_col] != "5", [id_col, seq_col]]. \
28
- groupby(by=id_col)[seq_col]. \
29
- apply(lambda x: x.astype("int8").sum() / (2 * x.count())).\
30
- reset_index().\
31
- round(3)
32
-
33
- return data.\
34
- loc[data[seq_col] != 5, [id_col, seq_col]].\
35
- groupby(by=id_col)[seq_col].\
36
- apply(lambda x: x.sum() / (2 * x.count())).\
37
- reset_index().\
38
- round(3)
39
-
40
- except Exception as e:
41
- raise e
42
-
43
- elif isinstance(data, str):
44
- if not data.isdigit():
45
- return None
46
-
47
- sam_seq = tuple(
48
- map(int, filter(lambda x: x if x != "5" else None, data))
49
- )
50
- return round(sum(sam_seq) / (2 * len(sam_seq)), 3)
51
-
52
- else:
53
- return None
54
-
55
-
56
- def minor_allele_freq(value: float) -> float:
57
- """ The minor allele frequency is therefore the frequency at which the
58
- minor allele occurs within a population.
59
-
60
- :param value: Allele frequency
61
- :return: Return the minor alleles frequency
62
- """
63
-
64
- if value > 0.5:
65
- return round(1 - value, 3)
66
-
67
- return round(value, 3)
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ import pandas as pd
6
+
7
+
8
+ def allele_freq(
9
+ data: pd.DataFrame | str, id_col: str = None, seq_col: str = None
10
+ ) -> pd.DataFrame | float | None:
11
+ """ The allele frequency represents the incidence of a gene variant in a
12
+ population.
13
+
14
+ :param data: Data array.
15
+ :param id_col: Columns with snp names.
16
+ :param seq_col: Columns with value snp in format ucg - 0, 1, 2, 5.
17
+ :return: Return the alleles frequency.
18
+ """
19
+
20
+ if isinstance(data, pd.DataFrame):
21
+ try:
22
+ if data[seq_col].dtype.hasobject:
23
+ if not data[seq_col].str.isdigit().all():
24
+ return None
25
+
26
+ return data.\
27
+ loc[data[seq_col] != "5", [id_col, seq_col]]. \
28
+ groupby(by=id_col)[seq_col]. \
29
+ apply(lambda x: x.astype("int8").sum() / (2 * x.count())).\
30
+ reset_index().\
31
+ round(3)
32
+
33
+ return data.\
34
+ loc[data[seq_col] != 5, [id_col, seq_col]].\
35
+ groupby(by=id_col)[seq_col].\
36
+ apply(lambda x: x.sum() / (2 * x.count())).\
37
+ reset_index().\
38
+ round(3)
39
+
40
+ except Exception as e:
41
+ raise e
42
+
43
+ elif isinstance(data, str):
44
+ if not data.isdigit():
45
+ return None
46
+
47
+ sam_seq = tuple(
48
+ map(int, filter(lambda x: x if x != "5" else None, data))
49
+ )
50
+ return round(sum(sam_seq) / (2 * len(sam_seq)), 3)
51
+
52
+ else:
53
+ return None
54
+
55
+
56
def minor_allele_freq(value: float) -> float:
    """ Fold an allele frequency into the frequency of the minor allele.

    A frequency above 0.5 is replaced by its complement ``1 - value``; any
    other value is kept. The result is rounded to 3 decimals.

    :param value: Allele frequency
    :return: Return the minor alleles frequency
    """

    folded = 1 - value if value > 0.5 else value
    return round(folded, 3)