smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181)
  1. smftools/__init__.py +43 -13
  2. smftools/_settings.py +6 -6
  3. smftools/_version.py +3 -1
  4. smftools/cli/__init__.py +1 -0
  5. smftools/cli/archived/cli_flows.py +2 -0
  6. smftools/cli/helpers.py +9 -1
  7. smftools/cli/hmm_adata.py +905 -242
  8. smftools/cli/load_adata.py +432 -280
  9. smftools/cli/preprocess_adata.py +287 -171
  10. smftools/cli/spatial_adata.py +141 -53
  11. smftools/cli_entry.py +119 -178
  12. smftools/config/__init__.py +3 -1
  13. smftools/config/conversion.yaml +5 -1
  14. smftools/config/deaminase.yaml +1 -1
  15. smftools/config/default.yaml +26 -18
  16. smftools/config/direct.yaml +8 -3
  17. smftools/config/discover_input_files.py +19 -5
  18. smftools/config/experiment_config.py +511 -276
  19. smftools/constants.py +37 -0
  20. smftools/datasets/__init__.py +4 -8
  21. smftools/datasets/datasets.py +32 -18
  22. smftools/hmm/HMM.py +2133 -1428
  23. smftools/hmm/__init__.py +24 -14
  24. smftools/hmm/archived/apply_hmm_batched.py +2 -0
  25. smftools/hmm/archived/calculate_distances.py +2 -0
  26. smftools/hmm/archived/call_hmm_peaks.py +18 -1
  27. smftools/hmm/archived/train_hmm.py +2 -0
  28. smftools/hmm/call_hmm_peaks.py +176 -193
  29. smftools/hmm/display_hmm.py +23 -7
  30. smftools/hmm/hmm_readwrite.py +20 -6
  31. smftools/hmm/nucleosome_hmm_refinement.py +104 -14
  32. smftools/informatics/__init__.py +55 -13
  33. smftools/informatics/archived/bam_conversion.py +2 -0
  34. smftools/informatics/archived/bam_direct.py +2 -0
  35. smftools/informatics/archived/basecall_pod5s.py +2 -0
  36. smftools/informatics/archived/basecalls_to_adata.py +2 -0
  37. smftools/informatics/archived/conversion_smf.py +2 -0
  38. smftools/informatics/archived/deaminase_smf.py +1 -0
  39. smftools/informatics/archived/direct_smf.py +2 -0
  40. smftools/informatics/archived/fast5_to_pod5.py +2 -0
  41. smftools/informatics/archived/helpers/archived/__init__.py +2 -0
  42. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
  43. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
  44. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  45. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
  46. smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
  47. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  48. smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
  49. smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
  50. smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
  51. smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
  52. smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
  53. smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
  54. smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
  55. smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
  56. smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
  57. smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
  58. smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
  59. smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
  60. smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
  61. smftools/informatics/archived/helpers/archived/informatics.py +2 -0
  62. smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
  63. smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
  64. smftools/informatics/archived/helpers/archived/modQC.py +2 -0
  65. smftools/informatics/archived/helpers/archived/modcall.py +2 -0
  66. smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
  67. smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
  68. smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
  69. smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
  70. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
  71. smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
  72. smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
  73. smftools/informatics/archived/print_bam_query_seq.py +9 -1
  74. smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
  75. smftools/informatics/archived/subsample_pod5.py +2 -0
  76. smftools/informatics/bam_functions.py +1059 -269
  77. smftools/informatics/basecalling.py +53 -9
  78. smftools/informatics/bed_functions.py +357 -114
  79. smftools/informatics/binarize_converted_base_identities.py +21 -7
  80. smftools/informatics/complement_base_list.py +9 -6
  81. smftools/informatics/converted_BAM_to_adata.py +324 -137
  82. smftools/informatics/fasta_functions.py +251 -89
  83. smftools/informatics/h5ad_functions.py +202 -30
  84. smftools/informatics/modkit_extract_to_adata.py +623 -274
  85. smftools/informatics/modkit_functions.py +87 -44
  86. smftools/informatics/ohe.py +46 -21
  87. smftools/informatics/pod5_functions.py +114 -74
  88. smftools/informatics/run_multiqc.py +20 -14
  89. smftools/logging_utils.py +51 -0
  90. smftools/machine_learning/__init__.py +23 -12
  91. smftools/machine_learning/data/__init__.py +2 -0
  92. smftools/machine_learning/data/anndata_data_module.py +157 -50
  93. smftools/machine_learning/data/preprocessing.py +4 -1
  94. smftools/machine_learning/evaluation/__init__.py +3 -1
  95. smftools/machine_learning/evaluation/eval_utils.py +13 -14
  96. smftools/machine_learning/evaluation/evaluators.py +52 -34
  97. smftools/machine_learning/inference/__init__.py +3 -1
  98. smftools/machine_learning/inference/inference_utils.py +9 -4
  99. smftools/machine_learning/inference/lightning_inference.py +14 -13
  100. smftools/machine_learning/inference/sklearn_inference.py +8 -8
  101. smftools/machine_learning/inference/sliding_window_inference.py +37 -25
  102. smftools/machine_learning/models/__init__.py +12 -5
  103. smftools/machine_learning/models/base.py +34 -43
  104. smftools/machine_learning/models/cnn.py +22 -13
  105. smftools/machine_learning/models/lightning_base.py +78 -42
  106. smftools/machine_learning/models/mlp.py +18 -5
  107. smftools/machine_learning/models/positional.py +10 -4
  108. smftools/machine_learning/models/rnn.py +8 -3
  109. smftools/machine_learning/models/sklearn_models.py +46 -24
  110. smftools/machine_learning/models/transformer.py +75 -55
  111. smftools/machine_learning/models/wrappers.py +8 -3
  112. smftools/machine_learning/training/__init__.py +4 -2
  113. smftools/machine_learning/training/train_lightning_model.py +42 -23
  114. smftools/machine_learning/training/train_sklearn_model.py +11 -15
  115. smftools/machine_learning/utils/__init__.py +3 -1
  116. smftools/machine_learning/utils/device.py +12 -5
  117. smftools/machine_learning/utils/grl.py +8 -2
  118. smftools/metadata.py +443 -0
  119. smftools/optional_imports.py +31 -0
  120. smftools/plotting/__init__.py +32 -17
  121. smftools/plotting/autocorrelation_plotting.py +153 -48
  122. smftools/plotting/classifiers.py +175 -73
  123. smftools/plotting/general_plotting.py +350 -168
  124. smftools/plotting/hmm_plotting.py +53 -14
  125. smftools/plotting/position_stats.py +155 -87
  126. smftools/plotting/qc_plotting.py +25 -12
  127. smftools/preprocessing/__init__.py +35 -37
  128. smftools/preprocessing/append_base_context.py +105 -79
  129. smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
  130. smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
  131. smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
  132. smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
  133. smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
  134. smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
  135. smftools/preprocessing/binarize.py +21 -4
  136. smftools/preprocessing/binarize_on_Youden.py +127 -31
  137. smftools/preprocessing/binary_layers_to_ohe.py +18 -11
  138. smftools/preprocessing/calculate_complexity_II.py +89 -59
  139. smftools/preprocessing/calculate_consensus.py +28 -19
  140. smftools/preprocessing/calculate_coverage.py +44 -22
  141. smftools/preprocessing/calculate_pairwise_differences.py +4 -1
  142. smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
  143. smftools/preprocessing/calculate_position_Youden.py +110 -55
  144. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  145. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  146. smftools/preprocessing/clean_NaN.py +38 -28
  147. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  148. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
  149. smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
  150. smftools/preprocessing/flag_duplicate_reads.py +708 -303
  151. smftools/preprocessing/invert_adata.py +26 -11
  152. smftools/preprocessing/load_sample_sheet.py +40 -22
  153. smftools/preprocessing/make_dirs.py +9 -3
  154. smftools/preprocessing/min_non_diagonal.py +4 -1
  155. smftools/preprocessing/recipes.py +58 -23
  156. smftools/preprocessing/reindex_references_adata.py +93 -27
  157. smftools/preprocessing/subsample_adata.py +33 -16
  158. smftools/readwrite.py +264 -109
  159. smftools/schema/__init__.py +11 -0
  160. smftools/schema/anndata_schema_v1.yaml +227 -0
  161. smftools/tools/__init__.py +25 -18
  162. smftools/tools/archived/apply_hmm.py +2 -0
  163. smftools/tools/archived/classifiers.py +165 -0
  164. smftools/tools/archived/classify_methylated_features.py +2 -0
  165. smftools/tools/archived/classify_non_methylated_features.py +2 -0
  166. smftools/tools/archived/subset_adata_v1.py +12 -1
  167. smftools/tools/archived/subset_adata_v2.py +14 -1
  168. smftools/tools/calculate_umap.py +56 -15
  169. smftools/tools/cluster_adata_on_methylation.py +122 -47
  170. smftools/tools/general_tools.py +70 -25
  171. smftools/tools/position_stats.py +220 -99
  172. smftools/tools/read_stats.py +50 -29
  173. smftools/tools/spatial_autocorrelation.py +365 -192
  174. smftools/tools/subset_adata.py +23 -21
  175. smftools-0.3.0.dist-info/METADATA +147 -0
  176. smftools-0.3.0.dist-info/RECORD +182 -0
  177. smftools-0.2.4.dist-info/METADATA +0 -141
  178. smftools-0.2.4.dist-info/RECORD +0 -176
  179. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
  180. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
  181. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,16 +1,30 @@
1
- def load_hmm(model_path, device='cpu'):
1
+ from __future__ import annotations
2
+
3
+ from smftools.optional_imports import require
4
+
5
+
6
+ def load_hmm(model_path, device="cpu"):
2
7
  """
3
8
  Reads in a pretrained HMM.
4
-
9
+
5
10
  Parameters:
6
11
  model_path (str): Path to a pretrained HMM
7
12
  """
8
- import torch
13
+ torch = require("torch", extra="torch", purpose="HMM read/write")
14
+
9
15
  # Load model using PyTorch
10
16
  hmm = torch.load(model_path)
11
- hmm.to(device)
17
+ hmm.to(device)
12
18
  return hmm
13
19
 
20
+
14
21
  def save_hmm(model, model_path):
15
- import torch
16
- torch.save(model, model_path)
22
+ """Save a pretrained HMM to disk.
23
+
24
+ Args:
25
+ model: HMM model instance.
26
+ model_path: Output path for the model.
27
+ """
28
+ torch = require("torch", extra="torch", purpose="HMM read/write")
29
+
30
+ torch.save(model, model_path)
@@ -1,4 +1,33 @@
1
- def refine_nucleosome_calls(adata, layer_name, nan_mask_layer, hexamer_size=120, octamer_size=147, max_wiggle=40, device="cpu"):
1
+ from __future__ import annotations
2
+
3
+ from smftools.logging_utils import get_logger
4
+
5
+ logger = get_logger(__name__)
6
+
7
+
8
+ def refine_nucleosome_calls(
9
+ adata,
10
+ layer_name,
11
+ nan_mask_layer,
12
+ hexamer_size=120,
13
+ octamer_size=147,
14
+ max_wiggle=40,
15
+ device="cpu",
16
+ ):
17
+ """Refine nucleosome calls into hexamer/octamer layers.
18
+
19
+ Args:
20
+ adata: AnnData with nucleosome calls.
21
+ layer_name: Layer containing initial nucleosome calls.
22
+ nan_mask_layer: Layer indicating NaN regions.
23
+ hexamer_size: Size for hexamer placement.
24
+ octamer_size: Size for octamer placement.
25
+ max_wiggle: Max boundary expansion into NaNs.
26
+ device: Device specifier (unused; kept for API parity).
27
+
28
+ Returns:
29
+ Updated AnnData with hexamer/octamer layers.
30
+ """
2
31
  import numpy as np
3
32
 
4
33
  nucleosome_layer = adata.layers[layer_name]
@@ -31,7 +60,10 @@ def refine_nucleosome_calls(adata, layer_name, nan_mask_layer, hexamer_size=120,
31
60
  break
32
61
  # Right
33
62
  for i in range(1, max_wiggle + 1):
34
- if end_idx + i < nucleosome_layer.shape[1] and nan_mask[read_idx, end_idx + i] == 1:
63
+ if (
64
+ end_idx + i < nucleosome_layer.shape[1]
65
+ and nan_mask[read_idx, end_idx + i] == 1
66
+ ):
35
67
  right_expand += 1
36
68
  else:
37
69
  break
@@ -40,26 +72,55 @@ def refine_nucleosome_calls(adata, layer_name, nan_mask_layer, hexamer_size=120,
40
72
  expanded_end = end_idx + right_expand
41
73
 
42
74
  available_size = expanded_end - expanded_start
43
-
75
+
44
76
  # Octamer placement
45
77
  if available_size >= octamer_size:
46
78
  center = (expanded_start + expanded_end) // 2
47
79
  half_oct = octamer_size // 2
48
- octamer_layer[read_idx, center - half_oct: center - half_oct + octamer_size] = 1
80
+ octamer_layer[
81
+ read_idx, center - half_oct : center - half_oct + octamer_size
82
+ ] = 1
49
83
 
50
84
  # Hexamer placement
51
85
  elif available_size >= hexamer_size:
52
86
  center = (expanded_start + expanded_end) // 2
53
87
  half_hex = hexamer_size // 2
54
- hexamer_layer[read_idx, center - half_hex: center - half_hex + hexamer_size] = 1
88
+ hexamer_layer[
89
+ read_idx, center - half_hex : center - half_hex + hexamer_size
90
+ ] = 1
55
91
 
56
92
  adata.layers[f"{layer_name}_hexamers"] = hexamer_layer
57
93
  adata.layers[f"{layer_name}_octamers"] = octamer_layer
58
94
 
59
- print(f"Added layers: {layer_name}_hexamers and {layer_name}_octamers")
95
+ logger.info("Added layers: %s_hexamers and %s_octamers", layer_name, layer_name)
60
96
  return adata
61
97
 
62
- def infer_nucleosomes_in_large_bound(adata, large_bound_layer, combined_nuc_layer, nan_mask_layer, nuc_size=147, linker_size=50, exclusion_buffer=30, device="cpu"):
98
+
99
+ def infer_nucleosomes_in_large_bound(
100
+ adata,
101
+ large_bound_layer,
102
+ combined_nuc_layer,
103
+ nan_mask_layer,
104
+ nuc_size=147,
105
+ linker_size=50,
106
+ exclusion_buffer=30,
107
+ device="cpu",
108
+ ):
109
+ """Infer nucleosomes in large-bound regions while respecting exclusions.
110
+
111
+ Args:
112
+ adata: AnnData with bound regions and existing nucleosomes.
113
+ large_bound_layer: Layer marking large-bound segments.
114
+ combined_nuc_layer: Layer with existing nucleosome calls.
115
+ nan_mask_layer: Layer indicating NaN regions.
116
+ nuc_size: Nucleosome size in bp.
117
+ linker_size: Minimum linker spacing.
118
+ exclusion_buffer: Buffer to avoid nearby existing nucleosomes.
119
+ device: Device specifier (unused; kept for API parity).
120
+
121
+ Returns:
122
+ Updated AnnData with inferred nucleosome layer.
123
+ """
63
124
  import numpy as np
64
125
 
65
126
  large_bound = adata.layers[large_bound_layer]
@@ -82,23 +143,52 @@ def infer_nucleosomes_in_large_bound(adata, large_bound_layer, combined_nuc_laye
82
143
 
83
144
  # Adjust boundaries into flanking NaN regions without getting too close to existing nucleosomes
84
145
  left_expand = start_idx
85
- while left_expand > 0 and nan_mask[read_idx, left_expand - 1] == 1 and np.sum(existing_nucs[read_idx, max(0, left_expand - exclusion_buffer):left_expand]) == 0:
146
+ while (
147
+ left_expand > 0
148
+ and nan_mask[read_idx, left_expand - 1] == 1
149
+ and np.sum(
150
+ existing_nucs[
151
+ read_idx, max(0, left_expand - exclusion_buffer) : left_expand
152
+ ]
153
+ )
154
+ == 0
155
+ ):
86
156
  left_expand -= 1
87
157
 
88
158
  right_expand = end_idx
89
- while right_expand < row.shape[0] and nan_mask[read_idx, right_expand] == 1 and np.sum(existing_nucs[read_idx, right_expand:min(row.shape[0], right_expand + exclusion_buffer)]) == 0:
159
+ while (
160
+ right_expand < row.shape[0]
161
+ and nan_mask[read_idx, right_expand] == 1
162
+ and np.sum(
163
+ existing_nucs[
164
+ read_idx,
165
+ right_expand : min(row.shape[0], right_expand + exclusion_buffer),
166
+ ]
167
+ )
168
+ == 0
169
+ ):
90
170
  right_expand += 1
91
171
 
92
172
  # Phase nucleosomes with linker spacing only
93
173
  region = (left_expand, right_expand)
94
174
  pos_cursor = region[0]
95
175
  while pos_cursor + nuc_size <= region[1]:
96
- if np.all((existing_nucs[read_idx, pos_cursor - exclusion_buffer:pos_cursor + nuc_size + exclusion_buffer] == 0)):
97
- inferred_layer[read_idx, pos_cursor:pos_cursor + nuc_size] = 1
98
- pos_cursor += nuc_size + linker_size
176
+ if np.all(
177
+ (
178
+ existing_nucs[
179
+ read_idx,
180
+ pos_cursor - exclusion_buffer : pos_cursor
181
+ + nuc_size
182
+ + exclusion_buffer,
183
+ ]
184
+ == 0
185
+ )
186
+ ):
187
+ inferred_layer[read_idx, pos_cursor : pos_cursor + nuc_size] = 1
188
+ pos_cursor += nuc_size + linker_size
99
189
  else:
100
190
  pos_cursor += 1
101
191
 
102
192
  adata.layers[f"{large_bound_layer}_phased_nucleosomes"] = inferred_layer
103
- print(f"Added layer: {large_bound_layer}_phased_nucleosomes")
104
- return adata
193
+ logger.info("Added layer: %s_phased_nucleosomes", large_bound_layer)
194
+ return adata
@@ -1,14 +1,56 @@
1
- from .bam_functions import align_and_sort_BAM, bam_qc, concatenate_fastqs_to_bam, count_aligned_reads, demux_and_index_BAM, extract_base_identities, extract_read_features_from_bam, extract_readnames_from_bam, separate_bam_by_bc, split_and_index_BAM
2
- from .basecalling import canoncall, modcall
3
- from .bed_functions import aligned_BAM_to_bed, _bed_to_bigwig, extract_read_lengths_from_bed, _plot_bed_histograms
4
- from .converted_BAM_to_adata import converted_BAM_to_adata
5
- from .fasta_functions import find_conversion_sites, generate_converted_FASTA, get_chromosome_lengths, get_native_references, index_fasta, subsample_fasta_from_bed
6
- from .h5ad_functions import add_demux_type_annotation, add_read_length_and_mapping_qc
7
- from .modkit_functions import extract_mods, make_modbed, modQC
8
- from .modkit_extract_to_adata import modkit_extract_to_adata
9
- from .ohe import one_hot_encode, one_hot_decode, ohe_layers_decode, ohe_batching
10
- from .pod5_functions import basecall_pod5s, fast5_to_pod5, subsample_pod5
11
- from .run_multiqc import run_multiqc
1
+ from __future__ import annotations
2
+
3
+ from importlib import import_module
4
+
5
+ _LAZY_ATTRS = {
6
+ "_bed_to_bigwig": "smftools.informatics.bed_functions",
7
+ "_plot_bed_histograms": "smftools.informatics.bed_functions",
8
+ "add_demux_type_annotation": "smftools.informatics.h5ad_functions",
9
+ "add_read_length_and_mapping_qc": "smftools.informatics.h5ad_functions",
10
+ "align_and_sort_BAM": "smftools.informatics.bam_functions",
11
+ "bam_qc": "smftools.informatics.bam_functions",
12
+ "basecall_pod5s": "smftools.informatics.pod5_functions",
13
+ "canoncall": "smftools.informatics.basecalling",
14
+ "concatenate_fastqs_to_bam": "smftools.informatics.bam_functions",
15
+ "converted_BAM_to_adata": "smftools.informatics.converted_BAM_to_adata",
16
+ "count_aligned_reads": "smftools.informatics.bam_functions",
17
+ "demux_and_index_BAM": "smftools.informatics.bam_functions",
18
+ "extract_base_identities": "smftools.informatics.bam_functions",
19
+ "extract_mods": "smftools.informatics.modkit_functions",
20
+ "extract_read_features_from_bam": "smftools.informatics.bam_functions",
21
+ "extract_read_lengths_from_bed": "smftools.informatics.bed_functions",
22
+ "extract_readnames_from_bam": "smftools.informatics.bam_functions",
23
+ "fast5_to_pod5": "smftools.informatics.pod5_functions",
24
+ "find_conversion_sites": "smftools.informatics.fasta_functions",
25
+ "generate_converted_FASTA": "smftools.informatics.fasta_functions",
26
+ "get_chromosome_lengths": "smftools.informatics.fasta_functions",
27
+ "get_native_references": "smftools.informatics.fasta_functions",
28
+ "index_fasta": "smftools.informatics.fasta_functions",
29
+ "make_modbed": "smftools.informatics.modkit_functions",
30
+ "modQC": "smftools.informatics.modkit_functions",
31
+ "modcall": "smftools.informatics.basecalling",
32
+ "modkit_extract_to_adata": "smftools.informatics.modkit_extract_to_adata",
33
+ "ohe_batching": "smftools.informatics.ohe",
34
+ "ohe_layers_decode": "smftools.informatics.ohe",
35
+ "one_hot_decode": "smftools.informatics.ohe",
36
+ "one_hot_encode": "smftools.informatics.ohe",
37
+ "run_multiqc": "smftools.informatics.run_multiqc",
38
+ "separate_bam_by_bc": "smftools.informatics.bam_functions",
39
+ "split_and_index_BAM": "smftools.informatics.bam_functions",
40
+ "subsample_fasta_from_bed": "smftools.informatics.fasta_functions",
41
+ "subsample_pod5": "smftools.informatics.pod5_functions",
42
+ "aligned_BAM_to_bed": "smftools.informatics.bed_functions",
43
+ }
44
+
45
+
46
+ def __getattr__(name: str):
47
+ if name in _LAZY_ATTRS:
48
+ module = import_module(_LAZY_ATTRS[name])
49
+ attr = getattr(module, name)
50
+ globals()[name] = attr
51
+ return attr
52
+ raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
53
+
12
54
 
13
55
  __all__ = [
14
56
  "basecall_pod5s",
@@ -16,5 +58,5 @@ __all__ = [
16
58
  "subsample_fasta_from_bed",
17
59
  "subsample_pod5",
18
60
  "fast5_to_pod5",
19
- "run_multiqc"
20
- ]
61
+ "run_multiqc",
62
+ ]
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  ## bam_conversion
2
4
 
3
5
  def bam_conversion(fasta, output_directory, conversion_types, strands, basecalled_path, split_dir, mapping_threshold, experiment_name, bam_suffix):
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  ## bam_direct
2
4
 
3
5
  def bam_direct(fasta, output_directory, mod_list, thresholds, bam_path, split_dir, mapping_threshold, experiment_name, bam_suffix, batch_size):
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  # basecall_pod5s
2
4
 
3
5
  def basecall_pod5s(config_path):
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  ## basecalls_to_adata
2
4
 
3
5
  def basecalls_to_adata(config_path):
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  ## conversion_smf
2
4
 
3
5
  def conversion_smf(fasta, output_directory, conversion_types, strands, model_dir, model, input_data_path, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, basecall, barcode_both_ends, trim, device, make_bigwigs, threads, input_already_demuxed):
@@ -1,3 +1,4 @@
1
+ from __future__ import annotations
1
2
 
2
3
  def deaminase_smf(fasta, output_directory, conversion_types, strands, model_dir, model, input_data_path, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, basecall, barcode_both_ends, trim, device, make_bigwigs, threads, input_already_demuxed):
3
4
  """
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  ## direct_smf
2
4
 
3
5
  def direct_smf(fasta, output_directory, mod_list, model_dir, model, thresholds, input_data_path, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, batch_size, basecall, barcode_both_ends, trim, device, make_bigwigs, skip_unclassified, delete_batch_hdfs, threads):
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  from pathlib import Path
2
4
  import subprocess
3
5
  from typing import Union, List
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  # from .align_and_sort_BAM import align_and_sort_BAM
2
4
  # from .aligned_BAM_to_bed import aligned_BAM_to_bed
3
5
  # from .bam_qc import bam_qc
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  from pathlib import Path
2
4
  import os
3
5
  import subprocess
@@ -20,6 +22,13 @@ def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str,
20
22
  fq.write(f"@{name}\n{seq}\n+\n{qual}\n")
21
23
 
22
24
  def _sort_bam_with_pysam(in_bam: Union[str, Path], out_bam: Union[str, Path], threads: Optional[int] = None) -> None:
25
+ """Sort a BAM file using pysam.
26
+
27
+ Args:
28
+ in_bam: Input BAM path.
29
+ out_bam: Output BAM path.
30
+ threads: Optional thread count.
31
+ """
23
32
  in_bam, out_bam = str(in_bam), str(out_bam)
24
33
  args = []
25
34
  if threads:
@@ -28,6 +37,12 @@ def _sort_bam_with_pysam(in_bam: Union[str, Path], out_bam: Union[str, Path], th
28
37
  pysam.sort(*args)
29
38
 
30
39
  def _index_bam_with_pysam(bam_path: Union[str, Path], threads: Optional[int] = None) -> None:
40
+ """Index a BAM file using pysam.
41
+
42
+ Args:
43
+ bam_path: BAM path to index.
44
+ threads: Optional thread count.
45
+ """
31
46
  bam_path = str(bam_path)
32
47
  # pysam.index supports samtools-style args
33
48
  if threads:
@@ -123,4 +138,4 @@ def align_and_sort_BAM(fasta,
123
138
  # index_command = ["samtools", "index", "-@", threads, aligned_sorted_output]
124
139
  # else:
125
140
  # index_command = ["samtools", "index", aligned_sorted_output]
126
- # subprocess.run(index_command)
141
+ # subprocess.run(index_command)
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
2
4
  """
3
5
  Takes an aligned BAM as input and writes a BED file of reads as output.
@@ -35,6 +35,7 @@ def bam_qc(
35
35
  bam_files = [Path(b) for b in bam_files]
36
36
 
37
37
  def _has_index(p: Path) -> bool:
38
+ """Return True if a BAM/CRAM index exists for the path."""
38
39
  if p.suffix.lower() == ".bam":
39
40
  bai = p.with_suffix(p.suffix + ".bai")
40
41
  bai_alt = Path(str(p) + ".bai")
@@ -45,6 +46,7 @@ def bam_qc(
45
46
  return False
46
47
 
47
48
  def _ensure_index(p: Path) -> None:
49
+ """Ensure a BAM/CRAM index exists, creating one if needed."""
48
50
  if _has_index(p):
49
51
  return
50
52
  if HAVE_PYSAM:
@@ -55,6 +57,14 @@ def bam_qc(
55
57
  subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
56
58
 
57
59
  def _run_one(bam: Path) -> Tuple[Path, List[Tuple[str, int]]]:
60
+ """Run QC tasks for a single BAM file.
61
+
62
+ Args:
63
+ bam: Path to the BAM file.
64
+
65
+ Returns:
66
+ Tuple of (bam_path, list of (task_name, return_code)).
67
+ """
58
68
  # outputs + return (file, [(task_name, returncode)])
59
69
  results: List[Tuple[str, int]] = []
60
70
  base = bam.stem # filename without .bam
@@ -71,6 +81,7 @@ def bam_qc(
71
81
 
72
82
  # Choose runner per task
73
83
  def run_stats():
84
+ """Run stats collection for a BAM file."""
74
85
  if not stats:
75
86
  return
76
87
  if HAVE_PYSAM and hasattr(pysam, "stats"):
@@ -86,6 +97,7 @@ def bam_qc(
86
97
  raise RuntimeError(cp.stderr.decode(errors="replace"))
87
98
 
88
99
  def run_flagstat():
100
+ """Run flagstat collection for a BAM file."""
89
101
  if not flagstats:
90
102
  return
91
103
  if HAVE_PYSAM and hasattr(pysam, "flagstat"):
@@ -101,6 +113,7 @@ def bam_qc(
101
113
  raise RuntimeError(cp.stderr.decode(errors="replace"))
102
114
 
103
115
  def run_idxstats():
116
+ """Run idxstats collection for a BAM file."""
104
117
  if not idxstats:
105
118
  return
106
119
  if HAVE_PYSAM and hasattr(pysam, "idxstats"):
@@ -210,4 +223,4 @@ def bam_qc(
210
223
  # elif modality == 'direct':
211
224
  # pass
212
225
 
213
- # print("QC processing completed.")
226
+ # print("QC processing completed.")
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  from pathlib import Path
2
4
  import pybedtools
3
5
  import pyBigWig
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  ## canoncall
2
4
 
3
5
  # Conversion SMF specific
@@ -60,6 +60,7 @@ def concatenate_fastqs_to_bam(
60
60
  return p.stem # fallback: remove last suffix only
61
61
 
62
62
  def _extract_barcode_from_filename(p: Path) -> str:
63
+ """Extract a barcode token from a FASTQ filename."""
63
64
  stem = _strip_fastq_ext(p)
64
65
  if "_" in stem:
65
66
  token = stem.split("_")[-1]
@@ -68,6 +69,7 @@ def concatenate_fastqs_to_bam(
68
69
  return stem
69
70
 
70
71
  def _classify_read_token(stem: str) -> Tuple[Optional[str], Optional[int]]:
72
+ """Classify a FASTQ filename stem into (prefix, read_number)."""
71
73
  # return (prefix, readnum) if matches; else (None, None)
72
74
  patterns = [
73
75
  r"(?i)(.*?)[._-]r?([12])$", # prefix_R1 / prefix.r2 / prefix-1
@@ -80,6 +82,7 @@ def concatenate_fastqs_to_bam(
80
82
  return None, None
81
83
 
82
84
  def _pair_by_filename(paths: List[Path]) -> Tuple[List[Tuple[Path, Path]], List[Path]]:
85
+ """Pair FASTQ files based on filename conventions."""
83
86
  pref_map: Dict[str, Dict[int, Path]] = {}
84
87
  unpaired: List[Path] = []
85
88
  for pth in paths:
@@ -101,6 +104,7 @@ def concatenate_fastqs_to_bam(
101
104
  return pairs, leftovers
102
105
 
103
106
  def _fastq_iter(p: Path):
107
+ """Yield FASTQ records using pysam.FastxFile."""
104
108
  # pysam.FastxFile handles compressed extensions transparently
105
109
  with pysam.FastxFile(str(p)) as fx:
106
110
  for rec in fx:
@@ -114,6 +118,7 @@ def concatenate_fastqs_to_bam(
114
118
  read1: bool,
115
119
  read2: bool,
116
120
  ) -> pysam.AlignedSegment:
121
+ """Construct an unaligned pysam.AlignedSegment."""
117
122
  a = pysam.AlignedSegment()
118
123
  a.query_name = name
119
124
  a.query_sequence = seq
@@ -136,6 +141,7 @@ def concatenate_fastqs_to_bam(
136
141
 
137
142
  # ---------- normalize inputs to Path ----------
138
143
  def _to_path_pair(x) -> Tuple[Path, Path]:
144
+ """Convert a tuple of path-like objects to Path instances."""
139
145
  a, b = x
140
146
  return Path(a), Path(b)
141
147
 
@@ -205,6 +211,7 @@ def concatenate_fastqs_to_bam(
205
211
 
206
212
  for rec1, rec2 in zip_longest(it1, it2, fillvalue=None):
207
213
  def _clean(n: Optional[str]) -> Optional[str]:
214
+ """Normalize FASTQ read names by trimming read suffixes."""
208
215
  if n is None:
209
216
  return None
210
217
  return re.sub(r"(?:/1$|/2$|\s[12]$)", "", n)
@@ -256,4 +263,4 @@ def concatenate_fastqs_to_bam(
256
263
  "paired_pairs_written": paired_pairs_written,
257
264
  "singletons_written": singletons_written,
258
265
  "barcodes": barcodes_in_order,
259
- }
266
+ }
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  ## converted_BAM_to_adata
2
4
 
3
5
  def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experiment_name, conversion_types, bam_suffix):
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  ## count_aligned_reads
2
4
 
3
5
  # General
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  ## demux_and_index_BAM
2
4
 
3
5
  def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit, barcode_both_ends, trim, fasta, make_bigwigs, threads):
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  def extract_base_identities(bam_file, chromosome, positions, max_reference_length, sequence):
2
4
  """
3
5
  Efficiently extracts base identities from mapped reads with reference coordinates.
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  ## extract_mods
2
4
 
3
5
  def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassified=True, modkit_summary=False, threads=None):
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  # extract_read_features_from_bam
2
4
 
3
5
  def extract_read_features_from_bam(bam_file_path):
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  # extract_read_lengths_from_bed
2
4
 
3
5
  def extract_read_lengths_from_bed(file_path):
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  # extract_readnames_from_BAM
2
4
 
3
5
  def extract_readnames_from_BAM(aligned_BAM):
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  def find_conversion_sites(fasta_file, modification_type, conversions, deaminase_footprinting=False):
2
4
  """
3
5
  Finds genomic coordinates of modified bases (5mC or 6mA) in a reference FASTA file.
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  import numpy as np
2
4
  import gzip
3
5
  import os
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  # get_chromosome_lengths
2
4
 
3
5
  def get_chromosome_lengths(fasta):
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  ## get_native_references
2
4
 
3
5
  # Direct methylation specific
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  import pysam
2
4
  from pathlib import Path
3
5
 
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  ## fasta_module
2
4
  from .. import readwrite
3
5
  # bioinformatic operations
@@ -1,12 +1,14 @@
1
+ from __future__ import annotations
2
+
1
3
  # load_adata
2
4
  ######################################################################################################
3
- import .utils
5
+ # Archived helper; legacy imports removed for syntax compatibility.
4
6
  # File I/O
5
7
  import subprocess
6
8
  import gc
7
9
 
8
10
  # bioinformatic operations
9
- import .informatics_module
11
+ # import .informatics_module
10
12
 
11
13
  # User interface
12
14
  from tqdm import tqdm
@@ -513,4 +515,4 @@ def modkit_extract_to_adata(fasta, bam, mapping_threshold, experiment_name, mods
513
515
  print(f"Deleted file: {hdf}")
514
516
  except OSError as e:
515
517
  print(f"Error deleting file {hdf}: {e}")
516
- ######################################################################################################
518
+ ######################################################################################################
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  ## make_modbed
2
4
 
3
5
  # Direct SMF