smftools 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +6 -8
- smftools/_settings.py +4 -6
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +54 -0
- smftools/cli/hmm_adata.py +937 -256
- smftools/cli/load_adata.py +448 -268
- smftools/cli/preprocess_adata.py +469 -263
- smftools/cli/spatial_adata.py +536 -319
- smftools/cli_entry.py +97 -182
- smftools/config/__init__.py +1 -1
- smftools/config/conversion.yaml +17 -6
- smftools/config/deaminase.yaml +12 -10
- smftools/config/default.yaml +142 -33
- smftools/config/direct.yaml +11 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +594 -264
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +2 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2128 -1418
- smftools/hmm/__init__.py +2 -9
- smftools/hmm/archived/call_hmm_peaks.py +121 -0
- smftools/hmm/call_hmm_peaks.py +299 -91
- smftools/hmm/display_hmm.py +19 -6
- smftools/hmm/hmm_readwrite.py +13 -4
- smftools/hmm/nucleosome_hmm_refinement.py +102 -14
- smftools/informatics/__init__.py +30 -7
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
- smftools/informatics/archived/print_bam_query_seq.py +7 -1
- smftools/informatics/bam_functions.py +397 -175
- smftools/informatics/basecalling.py +51 -9
- smftools/informatics/bed_functions.py +90 -57
- smftools/informatics/binarize_converted_base_identities.py +18 -7
- smftools/informatics/complement_base_list.py +7 -6
- smftools/informatics/converted_BAM_to_adata.py +265 -122
- smftools/informatics/fasta_functions.py +161 -83
- smftools/informatics/h5ad_functions.py +196 -30
- smftools/informatics/modkit_extract_to_adata.py +609 -270
- smftools/informatics/modkit_functions.py +85 -44
- smftools/informatics/ohe.py +44 -21
- smftools/informatics/pod5_functions.py +112 -73
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +2 -7
- smftools/machine_learning/data/anndata_data_module.py +143 -50
- smftools/machine_learning/data/preprocessing.py +2 -1
- smftools/machine_learning/evaluation/__init__.py +1 -1
- smftools/machine_learning/evaluation/eval_utils.py +11 -14
- smftools/machine_learning/evaluation/evaluators.py +46 -33
- smftools/machine_learning/inference/__init__.py +1 -1
- smftools/machine_learning/inference/inference_utils.py +7 -4
- smftools/machine_learning/inference/lightning_inference.py +9 -13
- smftools/machine_learning/inference/sklearn_inference.py +6 -8
- smftools/machine_learning/inference/sliding_window_inference.py +35 -25
- smftools/machine_learning/models/__init__.py +10 -5
- smftools/machine_learning/models/base.py +28 -42
- smftools/machine_learning/models/cnn.py +15 -11
- smftools/machine_learning/models/lightning_base.py +71 -40
- smftools/machine_learning/models/mlp.py +13 -4
- smftools/machine_learning/models/positional.py +3 -2
- smftools/machine_learning/models/rnn.py +3 -2
- smftools/machine_learning/models/sklearn_models.py +39 -22
- smftools/machine_learning/models/transformer.py +68 -53
- smftools/machine_learning/models/wrappers.py +2 -1
- smftools/machine_learning/training/__init__.py +2 -2
- smftools/machine_learning/training/train_lightning_model.py +29 -20
- smftools/machine_learning/training/train_sklearn_model.py +9 -15
- smftools/machine_learning/utils/__init__.py +1 -1
- smftools/machine_learning/utils/device.py +7 -4
- smftools/machine_learning/utils/grl.py +3 -1
- smftools/metadata.py +443 -0
- smftools/plotting/__init__.py +19 -5
- smftools/plotting/autocorrelation_plotting.py +145 -44
- smftools/plotting/classifiers.py +162 -72
- smftools/plotting/general_plotting.py +422 -197
- smftools/plotting/hmm_plotting.py +42 -13
- smftools/plotting/position_stats.py +147 -87
- smftools/plotting/qc_plotting.py +20 -12
- smftools/preprocessing/__init__.py +10 -12
- smftools/preprocessing/append_base_context.py +115 -80
- smftools/preprocessing/append_binary_layer_by_base_context.py +77 -39
- smftools/preprocessing/{calculate_complexity.py → archived/calculate_complexity.py} +3 -1
- smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +129 -31
- smftools/preprocessing/binary_layers_to_ohe.py +17 -11
- smftools/preprocessing/calculate_complexity_II.py +86 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +50 -25
- smftools/preprocessing/calculate_pairwise_differences.py +2 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
- smftools/preprocessing/calculate_position_Youden.py +118 -54
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +71 -38
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
- smftools/preprocessing/flag_duplicate_reads.py +689 -272
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +8 -3
- smftools/preprocessing/min_non_diagonal.py +2 -1
- smftools/preprocessing/recipes.py +56 -23
- smftools/preprocessing/reindex_references_adata.py +103 -0
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +331 -82
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +3 -4
- smftools/tools/archived/classifiers.py +163 -0
- smftools/tools/archived/subset_adata_v1.py +10 -1
- smftools/tools/archived/subset_adata_v2.py +12 -1
- smftools/tools/calculate_umap.py +54 -15
- smftools/tools/cluster_adata_on_methylation.py +115 -46
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +229 -98
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/METADATA +17 -39
- smftools-0.2.5.dist-info/RECORD +181 -0
- smftools-0.2.3.dist-info/RECORD +0 -173
- /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archived/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
- /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,4 +1,31 @@
|
|
|
1
|
-
|
|
1
|
+
from smftools.logging_utils import get_logger
|
|
2
|
+
|
|
3
|
+
logger = get_logger(__name__)
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def refine_nucleosome_calls(
|
|
7
|
+
adata,
|
|
8
|
+
layer_name,
|
|
9
|
+
nan_mask_layer,
|
|
10
|
+
hexamer_size=120,
|
|
11
|
+
octamer_size=147,
|
|
12
|
+
max_wiggle=40,
|
|
13
|
+
device="cpu",
|
|
14
|
+
):
|
|
15
|
+
"""Refine nucleosome calls into hexamer/octamer layers.
|
|
16
|
+
|
|
17
|
+
Args:
|
|
18
|
+
adata: AnnData with nucleosome calls.
|
|
19
|
+
layer_name: Layer containing initial nucleosome calls.
|
|
20
|
+
nan_mask_layer: Layer indicating NaN regions.
|
|
21
|
+
hexamer_size: Size for hexamer placement.
|
|
22
|
+
octamer_size: Size for octamer placement.
|
|
23
|
+
max_wiggle: Max boundary expansion into NaNs.
|
|
24
|
+
device: Device specifier (unused; kept for API parity).
|
|
25
|
+
|
|
26
|
+
Returns:
|
|
27
|
+
Updated AnnData with hexamer/octamer layers.
|
|
28
|
+
"""
|
|
2
29
|
import numpy as np
|
|
3
30
|
|
|
4
31
|
nucleosome_layer = adata.layers[layer_name]
|
|
@@ -31,7 +58,10 @@ def refine_nucleosome_calls(adata, layer_name, nan_mask_layer, hexamer_size=120,
|
|
|
31
58
|
break
|
|
32
59
|
# Right
|
|
33
60
|
for i in range(1, max_wiggle + 1):
|
|
34
|
-
if
|
|
61
|
+
if (
|
|
62
|
+
end_idx + i < nucleosome_layer.shape[1]
|
|
63
|
+
and nan_mask[read_idx, end_idx + i] == 1
|
|
64
|
+
):
|
|
35
65
|
right_expand += 1
|
|
36
66
|
else:
|
|
37
67
|
break
|
|
@@ -40,26 +70,55 @@ def refine_nucleosome_calls(adata, layer_name, nan_mask_layer, hexamer_size=120,
|
|
|
40
70
|
expanded_end = end_idx + right_expand
|
|
41
71
|
|
|
42
72
|
available_size = expanded_end - expanded_start
|
|
43
|
-
|
|
73
|
+
|
|
44
74
|
# Octamer placement
|
|
45
75
|
if available_size >= octamer_size:
|
|
46
76
|
center = (expanded_start + expanded_end) // 2
|
|
47
77
|
half_oct = octamer_size // 2
|
|
48
|
-
octamer_layer[
|
|
78
|
+
octamer_layer[
|
|
79
|
+
read_idx, center - half_oct : center - half_oct + octamer_size
|
|
80
|
+
] = 1
|
|
49
81
|
|
|
50
82
|
# Hexamer placement
|
|
51
83
|
elif available_size >= hexamer_size:
|
|
52
84
|
center = (expanded_start + expanded_end) // 2
|
|
53
85
|
half_hex = hexamer_size // 2
|
|
54
|
-
hexamer_layer[
|
|
86
|
+
hexamer_layer[
|
|
87
|
+
read_idx, center - half_hex : center - half_hex + hexamer_size
|
|
88
|
+
] = 1
|
|
55
89
|
|
|
56
90
|
adata.layers[f"{layer_name}_hexamers"] = hexamer_layer
|
|
57
91
|
adata.layers[f"{layer_name}_octamers"] = octamer_layer
|
|
58
92
|
|
|
59
|
-
|
|
93
|
+
logger.info("Added layers: %s_hexamers and %s_octamers", layer_name, layer_name)
|
|
60
94
|
return adata
|
|
61
95
|
|
|
62
|
-
|
|
96
|
+
|
|
97
|
+
def infer_nucleosomes_in_large_bound(
|
|
98
|
+
adata,
|
|
99
|
+
large_bound_layer,
|
|
100
|
+
combined_nuc_layer,
|
|
101
|
+
nan_mask_layer,
|
|
102
|
+
nuc_size=147,
|
|
103
|
+
linker_size=50,
|
|
104
|
+
exclusion_buffer=30,
|
|
105
|
+
device="cpu",
|
|
106
|
+
):
|
|
107
|
+
"""Infer nucleosomes in large-bound regions while respecting exclusions.
|
|
108
|
+
|
|
109
|
+
Args:
|
|
110
|
+
adata: AnnData with bound regions and existing nucleosomes.
|
|
111
|
+
large_bound_layer: Layer marking large-bound segments.
|
|
112
|
+
combined_nuc_layer: Layer with existing nucleosome calls.
|
|
113
|
+
nan_mask_layer: Layer indicating NaN regions.
|
|
114
|
+
nuc_size: Nucleosome size in bp.
|
|
115
|
+
linker_size: Minimum linker spacing.
|
|
116
|
+
exclusion_buffer: Buffer to avoid nearby existing nucleosomes.
|
|
117
|
+
device: Device specifier (unused; kept for API parity).
|
|
118
|
+
|
|
119
|
+
Returns:
|
|
120
|
+
Updated AnnData with inferred nucleosome layer.
|
|
121
|
+
"""
|
|
63
122
|
import numpy as np
|
|
64
123
|
|
|
65
124
|
large_bound = adata.layers[large_bound_layer]
|
|
@@ -82,23 +141,52 @@ def infer_nucleosomes_in_large_bound(adata, large_bound_layer, combined_nuc_laye
|
|
|
82
141
|
|
|
83
142
|
# Adjust boundaries into flanking NaN regions without getting too close to existing nucleosomes
|
|
84
143
|
left_expand = start_idx
|
|
85
|
-
while
|
|
144
|
+
while (
|
|
145
|
+
left_expand > 0
|
|
146
|
+
and nan_mask[read_idx, left_expand - 1] == 1
|
|
147
|
+
and np.sum(
|
|
148
|
+
existing_nucs[
|
|
149
|
+
read_idx, max(0, left_expand - exclusion_buffer) : left_expand
|
|
150
|
+
]
|
|
151
|
+
)
|
|
152
|
+
== 0
|
|
153
|
+
):
|
|
86
154
|
left_expand -= 1
|
|
87
155
|
|
|
88
156
|
right_expand = end_idx
|
|
89
|
-
while
|
|
157
|
+
while (
|
|
158
|
+
right_expand < row.shape[0]
|
|
159
|
+
and nan_mask[read_idx, right_expand] == 1
|
|
160
|
+
and np.sum(
|
|
161
|
+
existing_nucs[
|
|
162
|
+
read_idx,
|
|
163
|
+
right_expand : min(row.shape[0], right_expand + exclusion_buffer),
|
|
164
|
+
]
|
|
165
|
+
)
|
|
166
|
+
== 0
|
|
167
|
+
):
|
|
90
168
|
right_expand += 1
|
|
91
169
|
|
|
92
170
|
# Phase nucleosomes with linker spacing only
|
|
93
171
|
region = (left_expand, right_expand)
|
|
94
172
|
pos_cursor = region[0]
|
|
95
173
|
while pos_cursor + nuc_size <= region[1]:
|
|
96
|
-
if np.all(
|
|
97
|
-
|
|
98
|
-
|
|
174
|
+
if np.all(
|
|
175
|
+
(
|
|
176
|
+
existing_nucs[
|
|
177
|
+
read_idx,
|
|
178
|
+
pos_cursor - exclusion_buffer : pos_cursor
|
|
179
|
+
+ nuc_size
|
|
180
|
+
+ exclusion_buffer,
|
|
181
|
+
]
|
|
182
|
+
== 0
|
|
183
|
+
)
|
|
184
|
+
):
|
|
185
|
+
inferred_layer[read_idx, pos_cursor : pos_cursor + nuc_size] = 1
|
|
186
|
+
pos_cursor += nuc_size + linker_size
|
|
99
187
|
else:
|
|
100
188
|
pos_cursor += 1
|
|
101
189
|
|
|
102
190
|
adata.layers[f"{large_bound_layer}_phased_nucleosomes"] = inferred_layer
|
|
103
|
-
|
|
104
|
-
return adata
|
|
191
|
+
logger.info("Added layer: %s_phased_nucleosomes", large_bound_layer)
|
|
192
|
+
return adata
|
smftools/informatics/__init__.py
CHANGED
|
@@ -1,12 +1,35 @@
|
|
|
1
|
-
from .bam_functions import
|
|
1
|
+
from .bam_functions import (
|
|
2
|
+
align_and_sort_BAM,
|
|
3
|
+
bam_qc,
|
|
4
|
+
concatenate_fastqs_to_bam,
|
|
5
|
+
count_aligned_reads,
|
|
6
|
+
demux_and_index_BAM,
|
|
7
|
+
extract_base_identities,
|
|
8
|
+
extract_read_features_from_bam,
|
|
9
|
+
extract_readnames_from_bam,
|
|
10
|
+
separate_bam_by_bc,
|
|
11
|
+
split_and_index_BAM,
|
|
12
|
+
)
|
|
2
13
|
from .basecalling import canoncall, modcall
|
|
3
|
-
from .bed_functions import
|
|
14
|
+
from .bed_functions import (
|
|
15
|
+
_bed_to_bigwig,
|
|
16
|
+
_plot_bed_histograms,
|
|
17
|
+
aligned_BAM_to_bed,
|
|
18
|
+
extract_read_lengths_from_bed,
|
|
19
|
+
)
|
|
4
20
|
from .converted_BAM_to_adata import converted_BAM_to_adata
|
|
5
|
-
from .fasta_functions import
|
|
21
|
+
from .fasta_functions import (
|
|
22
|
+
find_conversion_sites,
|
|
23
|
+
generate_converted_FASTA,
|
|
24
|
+
get_chromosome_lengths,
|
|
25
|
+
get_native_references,
|
|
26
|
+
index_fasta,
|
|
27
|
+
subsample_fasta_from_bed,
|
|
28
|
+
)
|
|
6
29
|
from .h5ad_functions import add_demux_type_annotation, add_read_length_and_mapping_qc
|
|
7
|
-
from .modkit_functions import extract_mods, make_modbed, modQC
|
|
8
30
|
from .modkit_extract_to_adata import modkit_extract_to_adata
|
|
9
|
-
from .
|
|
31
|
+
from .modkit_functions import extract_mods, make_modbed, modQC
|
|
32
|
+
from .ohe import ohe_batching, ohe_layers_decode, one_hot_decode, one_hot_encode
|
|
10
33
|
from .pod5_functions import basecall_pod5s, fast5_to_pod5, subsample_pod5
|
|
11
34
|
from .run_multiqc import run_multiqc
|
|
12
35
|
|
|
@@ -16,5 +39,5 @@ __all__ = [
|
|
|
16
39
|
"subsample_fasta_from_bed",
|
|
17
40
|
"subsample_pod5",
|
|
18
41
|
"fast5_to_pod5",
|
|
19
|
-
"run_multiqc"
|
|
20
|
-
]
|
|
42
|
+
"run_multiqc",
|
|
43
|
+
]
|
|
@@ -20,6 +20,13 @@ def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str,
|
|
|
20
20
|
fq.write(f"@{name}\n{seq}\n+\n{qual}\n")
|
|
21
21
|
|
|
22
22
|
def _sort_bam_with_pysam(in_bam: Union[str, Path], out_bam: Union[str, Path], threads: Optional[int] = None) -> None:
|
|
23
|
+
"""Sort a BAM file using pysam.
|
|
24
|
+
|
|
25
|
+
Args:
|
|
26
|
+
in_bam: Input BAM path.
|
|
27
|
+
out_bam: Output BAM path.
|
|
28
|
+
threads: Optional thread count.
|
|
29
|
+
"""
|
|
23
30
|
in_bam, out_bam = str(in_bam), str(out_bam)
|
|
24
31
|
args = []
|
|
25
32
|
if threads:
|
|
@@ -28,6 +35,12 @@ def _sort_bam_with_pysam(in_bam: Union[str, Path], out_bam: Union[str, Path], th
|
|
|
28
35
|
pysam.sort(*args)
|
|
29
36
|
|
|
30
37
|
def _index_bam_with_pysam(bam_path: Union[str, Path], threads: Optional[int] = None) -> None:
|
|
38
|
+
"""Index a BAM file using pysam.
|
|
39
|
+
|
|
40
|
+
Args:
|
|
41
|
+
bam_path: BAM path to index.
|
|
42
|
+
threads: Optional thread count.
|
|
43
|
+
"""
|
|
31
44
|
bam_path = str(bam_path)
|
|
32
45
|
# pysam.index supports samtools-style args
|
|
33
46
|
if threads:
|
|
@@ -123,4 +136,4 @@ def align_and_sort_BAM(fasta,
|
|
|
123
136
|
# index_command = ["samtools", "index", "-@", threads, aligned_sorted_output]
|
|
124
137
|
# else:
|
|
125
138
|
# index_command = ["samtools", "index", aligned_sorted_output]
|
|
126
|
-
# subprocess.run(index_command)
|
|
139
|
+
# subprocess.run(index_command)
|
|
@@ -35,6 +35,7 @@ def bam_qc(
|
|
|
35
35
|
bam_files = [Path(b) for b in bam_files]
|
|
36
36
|
|
|
37
37
|
def _has_index(p: Path) -> bool:
|
|
38
|
+
"""Return True if a BAM/CRAM index exists for the path."""
|
|
38
39
|
if p.suffix.lower() == ".bam":
|
|
39
40
|
bai = p.with_suffix(p.suffix + ".bai")
|
|
40
41
|
bai_alt = Path(str(p) + ".bai")
|
|
@@ -45,6 +46,7 @@ def bam_qc(
|
|
|
45
46
|
return False
|
|
46
47
|
|
|
47
48
|
def _ensure_index(p: Path) -> None:
|
|
49
|
+
"""Ensure a BAM/CRAM index exists, creating one if needed."""
|
|
48
50
|
if _has_index(p):
|
|
49
51
|
return
|
|
50
52
|
if HAVE_PYSAM:
|
|
@@ -55,6 +57,14 @@ def bam_qc(
|
|
|
55
57
|
subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
|
56
58
|
|
|
57
59
|
def _run_one(bam: Path) -> Tuple[Path, List[Tuple[str, int]]]:
|
|
60
|
+
"""Run QC tasks for a single BAM file.
|
|
61
|
+
|
|
62
|
+
Args:
|
|
63
|
+
bam: Path to the BAM file.
|
|
64
|
+
|
|
65
|
+
Returns:
|
|
66
|
+
Tuple of (bam_path, list of (task_name, return_code)).
|
|
67
|
+
"""
|
|
58
68
|
# outputs + return (file, [(task_name, returncode)])
|
|
59
69
|
results: List[Tuple[str, int]] = []
|
|
60
70
|
base = bam.stem # filename without .bam
|
|
@@ -71,6 +81,7 @@ def bam_qc(
|
|
|
71
81
|
|
|
72
82
|
# Choose runner per task
|
|
73
83
|
def run_stats():
|
|
84
|
+
"""Run stats collection for a BAM file."""
|
|
74
85
|
if not stats:
|
|
75
86
|
return
|
|
76
87
|
if HAVE_PYSAM and hasattr(pysam, "stats"):
|
|
@@ -86,6 +97,7 @@ def bam_qc(
|
|
|
86
97
|
raise RuntimeError(cp.stderr.decode(errors="replace"))
|
|
87
98
|
|
|
88
99
|
def run_flagstat():
|
|
100
|
+
"""Run flagstat collection for a BAM file."""
|
|
89
101
|
if not flagstats:
|
|
90
102
|
return
|
|
91
103
|
if HAVE_PYSAM and hasattr(pysam, "flagstat"):
|
|
@@ -101,6 +113,7 @@ def bam_qc(
|
|
|
101
113
|
raise RuntimeError(cp.stderr.decode(errors="replace"))
|
|
102
114
|
|
|
103
115
|
def run_idxstats():
|
|
116
|
+
"""Run idxstats collection for a BAM file."""
|
|
104
117
|
if not idxstats:
|
|
105
118
|
return
|
|
106
119
|
if HAVE_PYSAM and hasattr(pysam, "idxstats"):
|
|
@@ -210,4 +223,4 @@ def bam_qc(
|
|
|
210
223
|
# elif modality == 'direct':
|
|
211
224
|
# pass
|
|
212
225
|
|
|
213
|
-
# print("QC processing completed.")
|
|
226
|
+
# print("QC processing completed.")
|
|
@@ -60,6 +60,7 @@ def concatenate_fastqs_to_bam(
|
|
|
60
60
|
return p.stem # fallback: remove last suffix only
|
|
61
61
|
|
|
62
62
|
def _extract_barcode_from_filename(p: Path) -> str:
|
|
63
|
+
"""Extract a barcode token from a FASTQ filename."""
|
|
63
64
|
stem = _strip_fastq_ext(p)
|
|
64
65
|
if "_" in stem:
|
|
65
66
|
token = stem.split("_")[-1]
|
|
@@ -68,6 +69,7 @@ def concatenate_fastqs_to_bam(
|
|
|
68
69
|
return stem
|
|
69
70
|
|
|
70
71
|
def _classify_read_token(stem: str) -> Tuple[Optional[str], Optional[int]]:
|
|
72
|
+
"""Classify a FASTQ filename stem into (prefix, read_number)."""
|
|
71
73
|
# return (prefix, readnum) if matches; else (None, None)
|
|
72
74
|
patterns = [
|
|
73
75
|
r"(?i)(.*?)[._-]r?([12])$", # prefix_R1 / prefix.r2 / prefix-1
|
|
@@ -80,6 +82,7 @@ def concatenate_fastqs_to_bam(
|
|
|
80
82
|
return None, None
|
|
81
83
|
|
|
82
84
|
def _pair_by_filename(paths: List[Path]) -> Tuple[List[Tuple[Path, Path]], List[Path]]:
|
|
85
|
+
"""Pair FASTQ files based on filename conventions."""
|
|
83
86
|
pref_map: Dict[str, Dict[int, Path]] = {}
|
|
84
87
|
unpaired: List[Path] = []
|
|
85
88
|
for pth in paths:
|
|
@@ -101,6 +104,7 @@ def concatenate_fastqs_to_bam(
|
|
|
101
104
|
return pairs, leftovers
|
|
102
105
|
|
|
103
106
|
def _fastq_iter(p: Path):
|
|
107
|
+
"""Yield FASTQ records using pysam.FastxFile."""
|
|
104
108
|
# pysam.FastxFile handles compressed extensions transparently
|
|
105
109
|
with pysam.FastxFile(str(p)) as fx:
|
|
106
110
|
for rec in fx:
|
|
@@ -114,6 +118,7 @@ def concatenate_fastqs_to_bam(
|
|
|
114
118
|
read1: bool,
|
|
115
119
|
read2: bool,
|
|
116
120
|
) -> pysam.AlignedSegment:
|
|
121
|
+
"""Construct an unaligned pysam.AlignedSegment."""
|
|
117
122
|
a = pysam.AlignedSegment()
|
|
118
123
|
a.query_name = name
|
|
119
124
|
a.query_sequence = seq
|
|
@@ -136,6 +141,7 @@ def concatenate_fastqs_to_bam(
|
|
|
136
141
|
|
|
137
142
|
# ---------- normalize inputs to Path ----------
|
|
138
143
|
def _to_path_pair(x) -> Tuple[Path, Path]:
|
|
144
|
+
"""Convert a tuple of path-like objects to Path instances."""
|
|
139
145
|
a, b = x
|
|
140
146
|
return Path(a), Path(b)
|
|
141
147
|
|
|
@@ -205,6 +211,7 @@ def concatenate_fastqs_to_bam(
|
|
|
205
211
|
|
|
206
212
|
for rec1, rec2 in zip_longest(it1, it2, fillvalue=None):
|
|
207
213
|
def _clean(n: Optional[str]) -> Optional[str]:
|
|
214
|
+
"""Normalize FASTQ read names by trimming read suffixes."""
|
|
208
215
|
if n is None:
|
|
209
216
|
return None
|
|
210
217
|
return re.sub(r"(?:/1$|/2$|\s[12]$)", "", n)
|
|
@@ -256,4 +263,4 @@ def concatenate_fastqs_to_bam(
|
|
|
256
263
|
"paired_pairs_written": paired_pairs_written,
|
|
257
264
|
"singletons_written": singletons_written,
|
|
258
265
|
"barcodes": barcodes_in_order,
|
|
259
|
-
}
|
|
266
|
+
}
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
# load_adata
|
|
2
2
|
######################################################################################################
|
|
3
|
-
|
|
3
|
+
# Archived helper; legacy imports removed for syntax compatibility.
|
|
4
4
|
# File I/O
|
|
5
5
|
import subprocess
|
|
6
6
|
import gc
|
|
7
7
|
|
|
8
8
|
# bioinformatic operations
|
|
9
|
-
import .informatics_module
|
|
9
|
+
# import .informatics_module
|
|
10
10
|
|
|
11
11
|
# User interface
|
|
12
12
|
from tqdm import tqdm
|
|
@@ -513,4 +513,4 @@ def modkit_extract_to_adata(fasta, bam, mapping_threshold, experiment_name, mods
|
|
|
513
513
|
print(f"Deleted file: {hdf}")
|
|
514
514
|
except OSError as e:
|
|
515
515
|
print(f"Error deleting file {hdf}: {e}")
|
|
516
|
-
######################################################################################################
|
|
516
|
+
######################################################################################################
|
|
@@ -86,6 +86,7 @@ def plot_bed_histograms(
|
|
|
86
86
|
|
|
87
87
|
# Clip helper for hist tails
|
|
88
88
|
def _clip_series(s, q=(0.0, 0.995)):
|
|
89
|
+
"""Clip a Series to quantile bounds for plotting."""
|
|
89
90
|
if q is None:
|
|
90
91
|
return s.to_numpy()
|
|
91
92
|
lo = s.quantile(q[0]) if q[0] is not None else s.min()
|
|
@@ -109,6 +110,7 @@ def plot_bed_histograms(
|
|
|
109
110
|
|
|
110
111
|
# Pagination
|
|
111
112
|
def _sanitize(name: str) -> str:
|
|
113
|
+
"""Sanitize a string for use in filenames."""
|
|
112
114
|
return "".join(ch if ch.isalnum() or ch in "-._" else "_" for ch in name)
|
|
113
115
|
|
|
114
116
|
cols_per_fig = 4 if include_mapq_quality else 2
|
|
@@ -247,4 +249,4 @@ def plot_bed_histograms(
|
|
|
247
249
|
# plt.grid(True)
|
|
248
250
|
# save_name = os.path.join(plotting_directory, f'{bed_basename}_{chrom}_coverage_histogram.png')
|
|
249
251
|
# plt.savefig(save_name)
|
|
250
|
-
# plt.close()
|
|
252
|
+
# plt.close()
|
|
@@ -2,6 +2,12 @@ import pysam
|
|
|
2
2
|
import sys
|
|
3
3
|
|
|
4
4
|
def extract_reads(bam_file_path, num_reads=10):
|
|
5
|
+
"""Print sequences for the first N reads in a BAM file.
|
|
6
|
+
|
|
7
|
+
Args:
|
|
8
|
+
bam_file_path: Path to BAM file.
|
|
9
|
+
num_reads: Number of reads to print.
|
|
10
|
+
"""
|
|
5
11
|
# Open the BAM file
|
|
6
12
|
bam_file = pysam.AlignmentFile(bam_file_path, "rb")
|
|
7
13
|
|
|
@@ -26,4 +32,4 @@ if __name__ == "__main__":
|
|
|
26
32
|
bam_file_path = sys.argv[1]
|
|
27
33
|
|
|
28
34
|
# Call the function to extract the first 10 reads
|
|
29
|
-
extract_reads(bam_file_path)
|
|
35
|
+
extract_reads(bam_file_path)
|