sai-pg 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sai/__init__.py +18 -0
- sai/__main__.py +73 -0
- sai/parsers/__init__.py +18 -0
- sai/parsers/argument_validation.py +169 -0
- sai/parsers/outlier_parser.py +76 -0
- sai/parsers/plot_parser.py +152 -0
- sai/parsers/score_parser.py +241 -0
- sai/sai.py +315 -0
- sai/stats/__init__.py +18 -0
- sai/stats/features.py +302 -0
- sai/utils/__init__.py +22 -0
- sai/utils/generators/__init__.py +23 -0
- sai/utils/generators/chunk_generator.py +148 -0
- sai/utils/generators/data_generator.py +49 -0
- sai/utils/generators/window_generator.py +250 -0
- sai/utils/genomic_dataclasses.py +46 -0
- sai/utils/multiprocessing/__init__.py +22 -0
- sai/utils/multiprocessing/mp_manager.py +251 -0
- sai/utils/multiprocessing/mp_pool.py +73 -0
- sai/utils/preprocessors/__init__.py +23 -0
- sai/utils/preprocessors/chunk_preprocessor.py +152 -0
- sai/utils/preprocessors/data_preprocessor.py +94 -0
- sai/utils/preprocessors/feature_preprocessor.py +211 -0
- sai/utils/utils.py +689 -0
- sai_pg-1.0.0.dist-info/METADATA +44 -0
- sai_pg-1.0.0.dist-info/RECORD +30 -0
- sai_pg-1.0.0.dist-info/WHEEL +5 -0
- sai_pg-1.0.0.dist-info/entry_points.txt +2 -0
- sai_pg-1.0.0.dist-info/licenses/LICENSE +674 -0
- sai_pg-1.0.0.dist-info/top_level.txt +1 -0
sai/__init__.py
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Copyright 2025 Xin Huang
|
2
|
+
#
|
3
|
+
# GNU General Public License v3.0
|
4
|
+
#
|
5
|
+
# This program is free software: you can redistribute it and/or modify
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
7
|
+
# the Free Software Foundation, either version 3 of the License, or
|
8
|
+
# (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This program is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
# GNU General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License
|
16
|
+
# along with this program. If not, please see
|
17
|
+
#
|
18
|
+
# https://www.gnu.org/licenses/gpl-3.0.en.html
|
sai/__main__.py
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
# Copyright 2025 Xin Huang
|
2
|
+
#
|
3
|
+
# GNU General Public License v3.0
|
4
|
+
#
|
5
|
+
# This program is free software: you can redistribute it and/or modify
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
7
|
+
# the Free Software Foundation, either version 3 of the License, or
|
8
|
+
# (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This program is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
# GNU General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License
|
16
|
+
# along with this program. If not, please see
|
17
|
+
#
|
18
|
+
# https://www.gnu.org/licenses/gpl-3.0.en.html
|
19
|
+
|
20
|
+
|
21
|
+
import argparse
|
22
|
+
from sai.parsers.score_parser import add_score_parser
|
23
|
+
from sai.parsers.outlier_parser import add_outlier_parser
|
24
|
+
from sai.parsers.plot_parser import add_plot_parser
|
25
|
+
|
26
|
+
|
27
|
+
def _set_sigpipe_handler() -> None:
    """
    Restores the default SIGPIPE behavior when running on a POSIX system.

    On any other platform this function does nothing.

    """
    import os
    import signal

    if os.name != "posix":
        return

    # With the default disposition the program is killed quietly on SIGPIPE.
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)
|
38
|
+
|
39
|
+
|
40
|
+
def _sai_cli_parser() -> argparse.ArgumentParser:
    """
    Builds the top-level command-line parser for sai.

    The chosen subcommand is mandatory and is stored under ``subcommand`` in
    the parsed namespace. The score, outlier, and plot subcommands are
    registered in that order.

    Returns
    -------
    argparse.ArgumentParser
        The configured top-level parser.
    """
    cli = argparse.ArgumentParser()
    commands = cli.add_subparsers(dest="subcommand")
    commands.required = True

    # Each helper attaches one subcommand to the shared subparsers object.
    for register in (add_score_parser, add_outlier_parser, add_plot_parser):
        register(commands)

    return cli
|
59
|
+
|
60
|
+
|
61
|
+
def main(arg_list: list = None) -> None:
    """
    Command-line entry point for sai.

    Parameters
    ----------
    arg_list : list, optional
        Arguments to parse. Default: None, in which case argparse falls
        back to the arguments of the running process.
    """
    _set_sigpipe_handler()
    namespace = _sai_cli_parser().parse_args(arg_list)
    # The selected subcommand stored its callback as ``runner`` through
    # ``set_defaults``; dispatch to it with the parsed arguments.
    namespace.runner(namespace)
|
sai/parsers/__init__.py
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Copyright 2025 Xin Huang
|
2
|
+
#
|
3
|
+
# GNU General Public License v3.0
|
4
|
+
#
|
5
|
+
# This program is free software: you can redistribute it and/or modify
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
7
|
+
# the Free Software Foundation, either version 3 of the License, or
|
8
|
+
# (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This program is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
# GNU General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License
|
16
|
+
# along with this program. If not, please see
|
17
|
+
#
|
18
|
+
# https://www.gnu.org/licenses/gpl-3.0.en.html
|
sai/parsers/argument_validation.py
ADDED
@@ -0,0 +1,169 @@
|
|
1
|
+
# Copyright 2025 Xin Huang
|
2
|
+
#
|
3
|
+
# GNU General Public License v3.0
|
4
|
+
#
|
5
|
+
# This program is free software: you can redistribute it and/or modify
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
7
|
+
# the Free Software Foundation, either version 3 of the License, or
|
8
|
+
# (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This program is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
# GNU General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License
|
16
|
+
# along with this program. If not, please see
|
17
|
+
#
|
18
|
+
# https://www.gnu.org/licenses/gpl-3.0.en.html
|
19
|
+
|
20
|
+
|
21
|
+
import argparse
|
22
|
+
import os
|
23
|
+
import re
|
24
|
+
|
25
|
+
|
26
|
+
def positive_int(value: str) -> int:
    """
    Converts a string to an integer and checks that it is greater than zero.

    Parameters
    ----------
    value : str
        The text to convert. None is passed through unchanged.

    Returns
    -------
    int
        The converted integer, or None when `value` is None.

    Raises
    ------
    argparse.ArgumentTypeError
        If `value` cannot be converted to an integer, or if the integer
        is zero or negative.
    """
    if value is None:
        return value

    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"{value} is not a valid integer")

    if number <= 0:
        raise argparse.ArgumentTypeError(f"{number} is not a positive integer")
    return number
|
53
|
+
|
54
|
+
|
55
|
+
def positive_number(value: str) -> float:
    """
    Validates if the provided string represents a positive number.

    Parameters
    ----------
    value : str
        The value to validate. None is passed through unchanged.

    Returns
    -------
    float
        The validated positive number, or None when `value` is None.

    Raises
    ------
    argparse.ArgumentTypeError
        If the value is not a valid number, is zero or negative, or is NaN.
    """
    if value is None:
        return value

    try:
        number = float(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"{value} is not a valid number")

    # `not number > 0` instead of `number <= 0`: every comparison with NaN
    # is False, so the latter would let "nan" through as a positive number.
    if not number > 0:
        raise argparse.ArgumentTypeError(f"{number} is not a positive number")
    return number
|
82
|
+
|
83
|
+
|
84
|
+
def between_zero_and_one(value: str) -> float:
    """
    Converts a string to a float and checks that it lies in [0, 1].

    Parameters
    ----------
    value : str
        The text to convert. None is passed through unchanged.

    Returns
    -------
    float
        The converted number, or None when `value` is None.

    Raises
    ------
    argparse.ArgumentTypeError
        If `value` cannot be converted to a number, or if the number lies
        outside the closed interval from 0 to 1.
    """
    if value is None:
        return value

    try:
        number = float(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"{value} is not a valid number")

    if 0 <= number <= 1:
        return number

    raise argparse.ArgumentTypeError(
        f"{number} is not between 0 and 1 (inclusive)"
    )
|
113
|
+
|
114
|
+
|
115
|
+
def existed_file(value: str) -> str:
    """
    Checks that the given path points to an existing regular file.

    Parameters
    ----------
    value : str
        The path to check. None is passed through unchanged.

    Returns
    -------
    str
        The same path that was passed in.

    Raises
    ------
    argparse.ArgumentTypeError
        If `value` is not None and `os.path.isfile` reports that it is
        not a file.
    """
    path_is_missing = value is not None and not os.path.isfile(value)
    if path_is_missing:
        raise argparse.ArgumentTypeError(f"{value} is not found")
    return value
|
138
|
+
|
139
|
+
|
140
|
+
def validate_stat_type(value: str) -> str:
    """
    Validate the input `stat_type`.

    Parameters
    ----------
    value : str
        The statistic type to validate. It must consist of the letter "U"
        or "Q" followed by exactly two digits, for example:
        - "U05" : described in the error message as "x > 0.05".
        - "Q95" : described in the error message as "quantile = 0.95".

    Returns
    -------
    str
        The unchanged `value` when it has the form "UXX" or "QXX".

    Raises
    ------
    argparse.ArgumentTypeError
        If the input does not match the expected format ("UXX" or "QXX").
    """
    # The whole string must be U or Q followed by exactly two digits.
    if re.fullmatch(r"[UQ]\d{2}", value) is None:
        raise argparse.ArgumentTypeError(
            f"Invalid --stat-type: {value}. Must be 'UXX' or 'QXX' (e.g., 'U05' for x > 0.05, 'Q95' for quantile = 0.95)."
        )
    return value
|
sai/parsers/outlier_parser.py
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
# Copyright 2025 Xin Huang
|
2
|
+
#
|
3
|
+
# GNU General Public License v3.0
|
4
|
+
#
|
5
|
+
# This program is free software: you can redistribute it and/or modify
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
7
|
+
# the Free Software Foundation, either version 3 of the License, or
|
8
|
+
# (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This program is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
# GNU General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License
|
16
|
+
# along with this program. If not, please see
|
17
|
+
#
|
18
|
+
# https://www.gnu.org/licenses/gpl-3.0.en.html
|
19
|
+
|
20
|
+
|
21
|
+
import argparse
|
22
|
+
from sai.parsers.argument_validation import existed_file
|
23
|
+
from sai.parsers.argument_validation import between_zero_and_one
|
24
|
+
from sai.sai import outlier
|
25
|
+
|
26
|
+
|
27
|
+
def _run_outlier(args: argparse.Namespace) -> None:
    """
    Invokes `outlier` with the values parsed for the outlier subcommand.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed command-line arguments. The attributes `score`, `output`,
        and `quantile` are read and forwarded as `score_file`, `output`,
        and `quantile`, respectively.
    """
    score_path = args.score
    output_path = args.output
    threshold = args.quantile

    outlier(score_file=score_path, output=output_path, quantile=threshold)
|
43
|
+
|
44
|
+
|
45
|
+
def add_outlier_parser(subparsers: argparse.ArgumentParser) -> None:
    """
    Registers the ``outlier`` subcommand and its options.

    Parameters
    ----------
    subparsers : argparse.ArgumentParser
        The object on which `add_parser` is called to create the
        subcommand parser.
    """
    outlier_cmd = subparsers.add_parser(
        "outlier", help="Detect and output outlier rows based on quantile thresholds."
    )

    # Option flag -> keyword arguments for add_argument, in display order.
    options = (
        (
            "--score",
            dict(
                type=existed_file,
                required=True,
                help="Path to the input score file.",
            ),
        ),
        (
            "--output",
            dict(
                type=str,
                required=True,
                help="Path to save the output file.",
            ),
        ),
        (
            "--quantile",
            dict(
                type=between_zero_and_one,
                default=0.99,
                help="Quantile threshold for outlier detection, between 0 and 1. Default: 0.99.",
            ),
        ),
    )
    for flag, settings in options:
        outlier_cmd.add_argument(flag, **settings)

    # The `runner` default is the callback stored on the parsed namespace.
    outlier_cmd.set_defaults(runner=_run_outlier)
|
sai/parsers/plot_parser.py
ADDED
@@ -0,0 +1,152 @@
|
|
1
|
+
# Copyright 2025 Xin Huang
|
2
|
+
#
|
3
|
+
# GNU General Public License v3.0
|
4
|
+
#
|
5
|
+
# This program is free software: you can redistribute it and/or modify
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
7
|
+
# the Free Software Foundation, either version 3 of the License, or
|
8
|
+
# (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This program is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
# GNU General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License
|
16
|
+
# along with this program. If not, please see
|
17
|
+
#
|
18
|
+
# https://www.gnu.org/licenses/gpl-3.0.en.html
|
19
|
+
|
20
|
+
|
21
|
+
import argparse
|
22
|
+
from sai.parsers.argument_validation import positive_int
|
23
|
+
from sai.parsers.argument_validation import positive_number
|
24
|
+
from sai.parsers.argument_validation import existed_file
|
25
|
+
from sai.sai import plot
|
26
|
+
|
27
|
+
|
28
|
+
def _run_plot(args: argparse.Namespace) -> None:
    """
    Invokes `plot` with the values parsed for the plot subcommand.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed command-line arguments. Each attribute listed below is
        forwarded to `plot` as a keyword argument of the same name:
        u_file, q_file, output, xlabel, ylabel, title, figsize_x,
        figsize_y, dpi, alpha, marker_size, marker_color, marker_style.
    """
    forwarded = (
        "u_file",
        "q_file",
        "output",
        "xlabel",
        "ylabel",
        "title",
        "figsize_x",
        "figsize_y",
        "dpi",
        "alpha",
        "marker_size",
        "marker_color",
        "marker_style",
    )
    plot(**{name: getattr(args, name) for name in forwarded})
|
54
|
+
|
55
|
+
|
56
|
+
def add_plot_parser(subparsers: argparse.ArgumentParser) -> None:
    """
    Initializes and configures the command-line interface parser
    for the plot subcommand.

    Parameters
    ----------
    subparsers : argparse.ArgumentParser
        The object on which `add_parser` is called to create the
        subcommand parser.
    """

    def _alpha(value: str) -> float:
        """
        Validates a transparency value in the interval (0, 1].

        Raises
        ------
        argparse.ArgumentTypeError
            If the value is not a valid number or lies outside (0, 1].
        """
        alpha = positive_number(value)
        # matplotlib rejects alpha values above 1, so fail at parse time
        # with a clear message. NOTE(review): assumes `plot` hands alpha to
        # matplotlib unchanged -- confirm against sai.sai.plot.
        # The negated range test also rejects NaN.
        if alpha is not None and not (0 < alpha <= 1):
            raise argparse.ArgumentTypeError(
                f"{alpha} is not greater than 0 and at most 1"
            )
        return alpha

    parser = subparsers.add_parser(
        "plot", help="Generate a scatter plot of U vs Q statistics."
    )
    parser.add_argument(
        "--u-file",
        dest="u_file",
        type=existed_file,
        required=True,
        help="Path to the U score/outlier file.",
    )
    parser.add_argument(
        "--q-file",
        dest="q_file",
        type=existed_file,
        required=True,
        help="Path to the Q score/outlier file.",
    )
    parser.add_argument(
        "--output",
        type=str,
        required=True,
        help="Path to save the output plot file. The format depends on the file extension (e.g., `.png`, `.pdf`).",
    )
    parser.add_argument(
        "--xlabel",
        type=str,
        default="Q Statistic",
        help="Label for the X-axis. Default: Q Statistic.",
    )
    parser.add_argument(
        "--ylabel",
        type=str,
        default="U Statistic",
        help="Label for the Y-axis. Default: U Statistic.",
    )
    parser.add_argument(
        "--title",
        type=str,
        default="Scatter Plot of U vs Q",
        help="Title of the plot. Default: Scatter Plot of U vs Q.",
    )
    parser.add_argument(
        "--figsize-x",
        type=positive_number,
        default=6,
        help="Width of the figure (in inches). Default: 6.",
    )
    parser.add_argument(
        "--figsize-y",
        type=positive_number,
        default=6,
        help="Height of the figure (in inches). Default: 6.",
    )
    parser.add_argument(
        "--dpi",
        type=positive_int,
        default=300,
        help="Resolution of the saved plot. Default: 300.",
    )
    parser.add_argument(
        "--alpha",
        type=_alpha,
        default=0.6,
        help="Transparency level of scatter points, greater than 0 and at most 1. Default: 0.6.",
    )
    parser.add_argument(
        "--marker-size",
        dest="marker_size",
        type=positive_number,
        default=20,
        help="Size of the scatter plot markers. See matplotlib.pyplot.scatter. Default: 20.",
    )
    parser.add_argument(
        "--marker-color",
        dest="marker_color",
        type=str,
        default="blue",
        help="Color of the markers. See matplotlib.pyplot.scatter. Default: blue.",
    )
    parser.add_argument(
        "--marker-style",
        dest="marker_style",
        type=str,
        default="o",
        help="Shape of the markers. See matplotlib.pyplot.scatter. Default: o.",
    )
    # The `runner` default is the callback stored on the parsed namespace.
    parser.set_defaults(runner=_run_plot)
|