smftools 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. smftools/__init__.py +6 -8
  2. smftools/_settings.py +4 -6
  3. smftools/_version.py +1 -1
  4. smftools/cli/helpers.py +54 -0
  5. smftools/cli/hmm_adata.py +937 -256
  6. smftools/cli/load_adata.py +448 -268
  7. smftools/cli/preprocess_adata.py +469 -263
  8. smftools/cli/spatial_adata.py +536 -319
  9. smftools/cli_entry.py +97 -182
  10. smftools/config/__init__.py +1 -1
  11. smftools/config/conversion.yaml +17 -6
  12. smftools/config/deaminase.yaml +12 -10
  13. smftools/config/default.yaml +142 -33
  14. smftools/config/direct.yaml +11 -3
  15. smftools/config/discover_input_files.py +19 -5
  16. smftools/config/experiment_config.py +594 -264
  17. smftools/constants.py +37 -0
  18. smftools/datasets/__init__.py +2 -8
  19. smftools/datasets/datasets.py +32 -18
  20. smftools/hmm/HMM.py +2128 -1418
  21. smftools/hmm/__init__.py +2 -9
  22. smftools/hmm/archived/call_hmm_peaks.py +121 -0
  23. smftools/hmm/call_hmm_peaks.py +299 -91
  24. smftools/hmm/display_hmm.py +19 -6
  25. smftools/hmm/hmm_readwrite.py +13 -4
  26. smftools/hmm/nucleosome_hmm_refinement.py +102 -14
  27. smftools/informatics/__init__.py +30 -7
  28. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  30. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  31. smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
  32. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
  33. smftools/informatics/archived/print_bam_query_seq.py +7 -1
  34. smftools/informatics/bam_functions.py +397 -175
  35. smftools/informatics/basecalling.py +51 -9
  36. smftools/informatics/bed_functions.py +90 -57
  37. smftools/informatics/binarize_converted_base_identities.py +18 -7
  38. smftools/informatics/complement_base_list.py +7 -6
  39. smftools/informatics/converted_BAM_to_adata.py +265 -122
  40. smftools/informatics/fasta_functions.py +161 -83
  41. smftools/informatics/h5ad_functions.py +196 -30
  42. smftools/informatics/modkit_extract_to_adata.py +609 -270
  43. smftools/informatics/modkit_functions.py +85 -44
  44. smftools/informatics/ohe.py +44 -21
  45. smftools/informatics/pod5_functions.py +112 -73
  46. smftools/informatics/run_multiqc.py +20 -14
  47. smftools/logging_utils.py +51 -0
  48. smftools/machine_learning/__init__.py +2 -7
  49. smftools/machine_learning/data/anndata_data_module.py +143 -50
  50. smftools/machine_learning/data/preprocessing.py +2 -1
  51. smftools/machine_learning/evaluation/__init__.py +1 -1
  52. smftools/machine_learning/evaluation/eval_utils.py +11 -14
  53. smftools/machine_learning/evaluation/evaluators.py +46 -33
  54. smftools/machine_learning/inference/__init__.py +1 -1
  55. smftools/machine_learning/inference/inference_utils.py +7 -4
  56. smftools/machine_learning/inference/lightning_inference.py +9 -13
  57. smftools/machine_learning/inference/sklearn_inference.py +6 -8
  58. smftools/machine_learning/inference/sliding_window_inference.py +35 -25
  59. smftools/machine_learning/models/__init__.py +10 -5
  60. smftools/machine_learning/models/base.py +28 -42
  61. smftools/machine_learning/models/cnn.py +15 -11
  62. smftools/machine_learning/models/lightning_base.py +71 -40
  63. smftools/machine_learning/models/mlp.py +13 -4
  64. smftools/machine_learning/models/positional.py +3 -2
  65. smftools/machine_learning/models/rnn.py +3 -2
  66. smftools/machine_learning/models/sklearn_models.py +39 -22
  67. smftools/machine_learning/models/transformer.py +68 -53
  68. smftools/machine_learning/models/wrappers.py +2 -1
  69. smftools/machine_learning/training/__init__.py +2 -2
  70. smftools/machine_learning/training/train_lightning_model.py +29 -20
  71. smftools/machine_learning/training/train_sklearn_model.py +9 -15
  72. smftools/machine_learning/utils/__init__.py +1 -1
  73. smftools/machine_learning/utils/device.py +7 -4
  74. smftools/machine_learning/utils/grl.py +3 -1
  75. smftools/metadata.py +443 -0
  76. smftools/plotting/__init__.py +19 -5
  77. smftools/plotting/autocorrelation_plotting.py +145 -44
  78. smftools/plotting/classifiers.py +162 -72
  79. smftools/plotting/general_plotting.py +422 -197
  80. smftools/plotting/hmm_plotting.py +42 -13
  81. smftools/plotting/position_stats.py +147 -87
  82. smftools/plotting/qc_plotting.py +20 -12
  83. smftools/preprocessing/__init__.py +10 -12
  84. smftools/preprocessing/append_base_context.py +115 -80
  85. smftools/preprocessing/append_binary_layer_by_base_context.py +77 -39
  86. smftools/preprocessing/{calculate_complexity.py → archived/calculate_complexity.py} +3 -1
  87. smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
  88. smftools/preprocessing/binarize.py +21 -4
  89. smftools/preprocessing/binarize_on_Youden.py +129 -31
  90. smftools/preprocessing/binary_layers_to_ohe.py +17 -11
  91. smftools/preprocessing/calculate_complexity_II.py +86 -59
  92. smftools/preprocessing/calculate_consensus.py +28 -19
  93. smftools/preprocessing/calculate_coverage.py +50 -25
  94. smftools/preprocessing/calculate_pairwise_differences.py +2 -1
  95. smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
  96. smftools/preprocessing/calculate_position_Youden.py +118 -54
  97. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  98. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  99. smftools/preprocessing/clean_NaN.py +38 -28
  100. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  101. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +71 -38
  102. smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
  103. smftools/preprocessing/flag_duplicate_reads.py +689 -272
  104. smftools/preprocessing/invert_adata.py +26 -11
  105. smftools/preprocessing/load_sample_sheet.py +40 -22
  106. smftools/preprocessing/make_dirs.py +8 -3
  107. smftools/preprocessing/min_non_diagonal.py +2 -1
  108. smftools/preprocessing/recipes.py +56 -23
  109. smftools/preprocessing/reindex_references_adata.py +103 -0
  110. smftools/preprocessing/subsample_adata.py +33 -16
  111. smftools/readwrite.py +331 -82
  112. smftools/schema/__init__.py +11 -0
  113. smftools/schema/anndata_schema_v1.yaml +227 -0
  114. smftools/tools/__init__.py +3 -4
  115. smftools/tools/archived/classifiers.py +163 -0
  116. smftools/tools/archived/subset_adata_v1.py +10 -1
  117. smftools/tools/archived/subset_adata_v2.py +12 -1
  118. smftools/tools/calculate_umap.py +54 -15
  119. smftools/tools/cluster_adata_on_methylation.py +115 -46
  120. smftools/tools/general_tools.py +70 -25
  121. smftools/tools/position_stats.py +229 -98
  122. smftools/tools/read_stats.py +50 -29
  123. smftools/tools/spatial_autocorrelation.py +365 -192
  124. smftools/tools/subset_adata.py +23 -21
  125. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/METADATA +17 -39
  126. smftools-0.2.5.dist-info/RECORD +181 -0
  127. smftools-0.2.3.dist-info/RECORD +0 -173
  128. /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
  129. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  130. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  131. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  132. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archived/add_read_length_and_mapping_qc.py} +0 -0
  133. /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
  134. /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
  135. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
  136. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
  137. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,4 +1,31 @@
1
- def refine_nucleosome_calls(adata, layer_name, nan_mask_layer, hexamer_size=120, octamer_size=147, max_wiggle=40, device="cpu"):
1
+ from smftools.logging_utils import get_logger
2
+
3
+ logger = get_logger(__name__)
4
+
5
+
6
+ def refine_nucleosome_calls(
7
+ adata,
8
+ layer_name,
9
+ nan_mask_layer,
10
+ hexamer_size=120,
11
+ octamer_size=147,
12
+ max_wiggle=40,
13
+ device="cpu",
14
+ ):
15
+ """Refine nucleosome calls into hexamer/octamer layers.
16
+
17
+ Args:
18
+ adata: AnnData with nucleosome calls.
19
+ layer_name: Layer containing initial nucleosome calls.
20
+ nan_mask_layer: Layer indicating NaN regions.
21
+ hexamer_size: Size for hexamer placement.
22
+ octamer_size: Size for octamer placement.
23
+ max_wiggle: Max boundary expansion into NaNs.
24
+ device: Device specifier (unused; kept for API parity).
25
+
26
+ Returns:
27
+ Updated AnnData with hexamer/octamer layers.
28
+ """
2
29
  import numpy as np
3
30
 
4
31
  nucleosome_layer = adata.layers[layer_name]
@@ -31,7 +58,10 @@ def refine_nucleosome_calls(adata, layer_name, nan_mask_layer, hexamer_size=120,
31
58
  break
32
59
  # Right
33
60
  for i in range(1, max_wiggle + 1):
34
- if end_idx + i < nucleosome_layer.shape[1] and nan_mask[read_idx, end_idx + i] == 1:
61
+ if (
62
+ end_idx + i < nucleosome_layer.shape[1]
63
+ and nan_mask[read_idx, end_idx + i] == 1
64
+ ):
35
65
  right_expand += 1
36
66
  else:
37
67
  break
@@ -40,26 +70,55 @@ def refine_nucleosome_calls(adata, layer_name, nan_mask_layer, hexamer_size=120,
40
70
  expanded_end = end_idx + right_expand
41
71
 
42
72
  available_size = expanded_end - expanded_start
43
-
73
+
44
74
  # Octamer placement
45
75
  if available_size >= octamer_size:
46
76
  center = (expanded_start + expanded_end) // 2
47
77
  half_oct = octamer_size // 2
48
- octamer_layer[read_idx, center - half_oct: center - half_oct + octamer_size] = 1
78
+ octamer_layer[
79
+ read_idx, center - half_oct : center - half_oct + octamer_size
80
+ ] = 1
49
81
 
50
82
  # Hexamer placement
51
83
  elif available_size >= hexamer_size:
52
84
  center = (expanded_start + expanded_end) // 2
53
85
  half_hex = hexamer_size // 2
54
- hexamer_layer[read_idx, center - half_hex: center - half_hex + hexamer_size] = 1
86
+ hexamer_layer[
87
+ read_idx, center - half_hex : center - half_hex + hexamer_size
88
+ ] = 1
55
89
 
56
90
  adata.layers[f"{layer_name}_hexamers"] = hexamer_layer
57
91
  adata.layers[f"{layer_name}_octamers"] = octamer_layer
58
92
 
59
- print(f"Added layers: {layer_name}_hexamers and {layer_name}_octamers")
93
+ logger.info("Added layers: %s_hexamers and %s_octamers", layer_name, layer_name)
60
94
  return adata
61
95
 
62
- def infer_nucleosomes_in_large_bound(adata, large_bound_layer, combined_nuc_layer, nan_mask_layer, nuc_size=147, linker_size=50, exclusion_buffer=30, device="cpu"):
96
+
97
+ def infer_nucleosomes_in_large_bound(
98
+ adata,
99
+ large_bound_layer,
100
+ combined_nuc_layer,
101
+ nan_mask_layer,
102
+ nuc_size=147,
103
+ linker_size=50,
104
+ exclusion_buffer=30,
105
+ device="cpu",
106
+ ):
107
+ """Infer nucleosomes in large-bound regions while respecting exclusions.
108
+
109
+ Args:
110
+ adata: AnnData with bound regions and existing nucleosomes.
111
+ large_bound_layer: Layer marking large-bound segments.
112
+ combined_nuc_layer: Layer with existing nucleosome calls.
113
+ nan_mask_layer: Layer indicating NaN regions.
114
+ nuc_size: Nucleosome size in bp.
115
+ linker_size: Minimum linker spacing.
116
+ exclusion_buffer: Buffer to avoid nearby existing nucleosomes.
117
+ device: Device specifier (unused; kept for API parity).
118
+
119
+ Returns:
120
+ Updated AnnData with inferred nucleosome layer.
121
+ """
63
122
  import numpy as np
64
123
 
65
124
  large_bound = adata.layers[large_bound_layer]
@@ -82,23 +141,52 @@ def infer_nucleosomes_in_large_bound(adata, large_bound_layer, combined_nuc_laye
82
141
 
83
142
  # Adjust boundaries into flanking NaN regions without getting too close to existing nucleosomes
84
143
  left_expand = start_idx
85
- while left_expand > 0 and nan_mask[read_idx, left_expand - 1] == 1 and np.sum(existing_nucs[read_idx, max(0, left_expand - exclusion_buffer):left_expand]) == 0:
144
+ while (
145
+ left_expand > 0
146
+ and nan_mask[read_idx, left_expand - 1] == 1
147
+ and np.sum(
148
+ existing_nucs[
149
+ read_idx, max(0, left_expand - exclusion_buffer) : left_expand
150
+ ]
151
+ )
152
+ == 0
153
+ ):
86
154
  left_expand -= 1
87
155
 
88
156
  right_expand = end_idx
89
- while right_expand < row.shape[0] and nan_mask[read_idx, right_expand] == 1 and np.sum(existing_nucs[read_idx, right_expand:min(row.shape[0], right_expand + exclusion_buffer)]) == 0:
157
+ while (
158
+ right_expand < row.shape[0]
159
+ and nan_mask[read_idx, right_expand] == 1
160
+ and np.sum(
161
+ existing_nucs[
162
+ read_idx,
163
+ right_expand : min(row.shape[0], right_expand + exclusion_buffer),
164
+ ]
165
+ )
166
+ == 0
167
+ ):
90
168
  right_expand += 1
91
169
 
92
170
  # Phase nucleosomes with linker spacing only
93
171
  region = (left_expand, right_expand)
94
172
  pos_cursor = region[0]
95
173
  while pos_cursor + nuc_size <= region[1]:
96
- if np.all((existing_nucs[read_idx, pos_cursor - exclusion_buffer:pos_cursor + nuc_size + exclusion_buffer] == 0)):
97
- inferred_layer[read_idx, pos_cursor:pos_cursor + nuc_size] = 1
98
- pos_cursor += nuc_size + linker_size
174
+ if np.all(
175
+ (
176
+ existing_nucs[
177
+ read_idx,
178
+ pos_cursor - exclusion_buffer : pos_cursor
179
+ + nuc_size
180
+ + exclusion_buffer,
181
+ ]
182
+ == 0
183
+ )
184
+ ):
185
+ inferred_layer[read_idx, pos_cursor : pos_cursor + nuc_size] = 1
186
+ pos_cursor += nuc_size + linker_size
99
187
  else:
100
188
  pos_cursor += 1
101
189
 
102
190
  adata.layers[f"{large_bound_layer}_phased_nucleosomes"] = inferred_layer
103
- print(f"Added layer: {large_bound_layer}_phased_nucleosomes")
104
- return adata
191
+ logger.info("Added layer: %s_phased_nucleosomes", large_bound_layer)
192
+ return adata
@@ -1,12 +1,35 @@
1
- from .bam_functions import align_and_sort_BAM, bam_qc, concatenate_fastqs_to_bam, count_aligned_reads, demux_and_index_BAM, extract_base_identities, extract_read_features_from_bam, extract_readnames_from_bam, separate_bam_by_bc, split_and_index_BAM
1
+ from .bam_functions import (
2
+ align_and_sort_BAM,
3
+ bam_qc,
4
+ concatenate_fastqs_to_bam,
5
+ count_aligned_reads,
6
+ demux_and_index_BAM,
7
+ extract_base_identities,
8
+ extract_read_features_from_bam,
9
+ extract_readnames_from_bam,
10
+ separate_bam_by_bc,
11
+ split_and_index_BAM,
12
+ )
2
13
  from .basecalling import canoncall, modcall
3
- from .bed_functions import aligned_BAM_to_bed, _bed_to_bigwig, extract_read_lengths_from_bed, _plot_bed_histograms
14
+ from .bed_functions import (
15
+ _bed_to_bigwig,
16
+ _plot_bed_histograms,
17
+ aligned_BAM_to_bed,
18
+ extract_read_lengths_from_bed,
19
+ )
4
20
  from .converted_BAM_to_adata import converted_BAM_to_adata
5
- from .fasta_functions import find_conversion_sites, generate_converted_FASTA, get_chromosome_lengths, get_native_references, index_fasta, subsample_fasta_from_bed
21
+ from .fasta_functions import (
22
+ find_conversion_sites,
23
+ generate_converted_FASTA,
24
+ get_chromosome_lengths,
25
+ get_native_references,
26
+ index_fasta,
27
+ subsample_fasta_from_bed,
28
+ )
6
29
  from .h5ad_functions import add_demux_type_annotation, add_read_length_and_mapping_qc
7
- from .modkit_functions import extract_mods, make_modbed, modQC
8
30
  from .modkit_extract_to_adata import modkit_extract_to_adata
9
- from .ohe import one_hot_encode, one_hot_decode, ohe_layers_decode, ohe_batching
31
+ from .modkit_functions import extract_mods, make_modbed, modQC
32
+ from .ohe import ohe_batching, ohe_layers_decode, one_hot_decode, one_hot_encode
10
33
  from .pod5_functions import basecall_pod5s, fast5_to_pod5, subsample_pod5
11
34
  from .run_multiqc import run_multiqc
12
35
 
@@ -16,5 +39,5 @@ __all__ = [
16
39
  "subsample_fasta_from_bed",
17
40
  "subsample_pod5",
18
41
  "fast5_to_pod5",
19
- "run_multiqc"
20
- ]
42
+ "run_multiqc",
43
+ ]
@@ -20,6 +20,13 @@ def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str,
20
20
  fq.write(f"@{name}\n{seq}\n+\n{qual}\n")
21
21
 
22
22
  def _sort_bam_with_pysam(in_bam: Union[str, Path], out_bam: Union[str, Path], threads: Optional[int] = None) -> None:
23
+ """Sort a BAM file using pysam.
24
+
25
+ Args:
26
+ in_bam: Input BAM path.
27
+ out_bam: Output BAM path.
28
+ threads: Optional thread count.
29
+ """
23
30
  in_bam, out_bam = str(in_bam), str(out_bam)
24
31
  args = []
25
32
  if threads:
@@ -28,6 +35,12 @@ def _sort_bam_with_pysam(in_bam: Union[str, Path], out_bam: Union[str, Path], th
28
35
  pysam.sort(*args)
29
36
 
30
37
  def _index_bam_with_pysam(bam_path: Union[str, Path], threads: Optional[int] = None) -> None:
38
+ """Index a BAM file using pysam.
39
+
40
+ Args:
41
+ bam_path: BAM path to index.
42
+ threads: Optional thread count.
43
+ """
31
44
  bam_path = str(bam_path)
32
45
  # pysam.index supports samtools-style args
33
46
  if threads:
@@ -123,4 +136,4 @@ def align_and_sort_BAM(fasta,
123
136
  # index_command = ["samtools", "index", "-@", threads, aligned_sorted_output]
124
137
  # else:
125
138
  # index_command = ["samtools", "index", aligned_sorted_output]
126
- # subprocess.run(index_command)
139
+ # subprocess.run(index_command)
@@ -35,6 +35,7 @@ def bam_qc(
35
35
  bam_files = [Path(b) for b in bam_files]
36
36
 
37
37
  def _has_index(p: Path) -> bool:
38
+ """Return True if a BAM/CRAM index exists for the path."""
38
39
  if p.suffix.lower() == ".bam":
39
40
  bai = p.with_suffix(p.suffix + ".bai")
40
41
  bai_alt = Path(str(p) + ".bai")
@@ -45,6 +46,7 @@ def bam_qc(
45
46
  return False
46
47
 
47
48
  def _ensure_index(p: Path) -> None:
49
+ """Ensure a BAM/CRAM index exists, creating one if needed."""
48
50
  if _has_index(p):
49
51
  return
50
52
  if HAVE_PYSAM:
@@ -55,6 +57,14 @@ def bam_qc(
55
57
  subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
56
58
 
57
59
  def _run_one(bam: Path) -> Tuple[Path, List[Tuple[str, int]]]:
60
+ """Run QC tasks for a single BAM file.
61
+
62
+ Args:
63
+ bam: Path to the BAM file.
64
+
65
+ Returns:
66
+ Tuple of (bam_path, list of (task_name, return_code)).
67
+ """
58
68
  # outputs + return (file, [(task_name, returncode)])
59
69
  results: List[Tuple[str, int]] = []
60
70
  base = bam.stem # filename without .bam
@@ -71,6 +81,7 @@ def bam_qc(
71
81
 
72
82
  # Choose runner per task
73
83
  def run_stats():
84
+ """Run stats collection for a BAM file."""
74
85
  if not stats:
75
86
  return
76
87
  if HAVE_PYSAM and hasattr(pysam, "stats"):
@@ -86,6 +97,7 @@ def bam_qc(
86
97
  raise RuntimeError(cp.stderr.decode(errors="replace"))
87
98
 
88
99
  def run_flagstat():
100
+ """Run flagstat collection for a BAM file."""
89
101
  if not flagstats:
90
102
  return
91
103
  if HAVE_PYSAM and hasattr(pysam, "flagstat"):
@@ -101,6 +113,7 @@ def bam_qc(
101
113
  raise RuntimeError(cp.stderr.decode(errors="replace"))
102
114
 
103
115
  def run_idxstats():
116
+ """Run idxstats collection for a BAM file."""
104
117
  if not idxstats:
105
118
  return
106
119
  if HAVE_PYSAM and hasattr(pysam, "idxstats"):
@@ -210,4 +223,4 @@ def bam_qc(
210
223
  # elif modality == 'direct':
211
224
  # pass
212
225
 
213
- # print("QC processing completed.")
226
+ # print("QC processing completed.")
@@ -60,6 +60,7 @@ def concatenate_fastqs_to_bam(
60
60
  return p.stem # fallback: remove last suffix only
61
61
 
62
62
  def _extract_barcode_from_filename(p: Path) -> str:
63
+ """Extract a barcode token from a FASTQ filename."""
63
64
  stem = _strip_fastq_ext(p)
64
65
  if "_" in stem:
65
66
  token = stem.split("_")[-1]
@@ -68,6 +69,7 @@ def concatenate_fastqs_to_bam(
68
69
  return stem
69
70
 
70
71
  def _classify_read_token(stem: str) -> Tuple[Optional[str], Optional[int]]:
72
+ """Classify a FASTQ filename stem into (prefix, read_number)."""
71
73
  # return (prefix, readnum) if matches; else (None, None)
72
74
  patterns = [
73
75
  r"(?i)(.*?)[._-]r?([12])$", # prefix_R1 / prefix.r2 / prefix-1
@@ -80,6 +82,7 @@ def concatenate_fastqs_to_bam(
80
82
  return None, None
81
83
 
82
84
  def _pair_by_filename(paths: List[Path]) -> Tuple[List[Tuple[Path, Path]], List[Path]]:
85
+ """Pair FASTQ files based on filename conventions."""
83
86
  pref_map: Dict[str, Dict[int, Path]] = {}
84
87
  unpaired: List[Path] = []
85
88
  for pth in paths:
@@ -101,6 +104,7 @@ def concatenate_fastqs_to_bam(
101
104
  return pairs, leftovers
102
105
 
103
106
  def _fastq_iter(p: Path):
107
+ """Yield FASTQ records using pysam.FastxFile."""
104
108
  # pysam.FastxFile handles compressed extensions transparently
105
109
  with pysam.FastxFile(str(p)) as fx:
106
110
  for rec in fx:
@@ -114,6 +118,7 @@ def concatenate_fastqs_to_bam(
114
118
  read1: bool,
115
119
  read2: bool,
116
120
  ) -> pysam.AlignedSegment:
121
+ """Construct an unaligned pysam.AlignedSegment."""
117
122
  a = pysam.AlignedSegment()
118
123
  a.query_name = name
119
124
  a.query_sequence = seq
@@ -136,6 +141,7 @@ def concatenate_fastqs_to_bam(
136
141
 
137
142
  # ---------- normalize inputs to Path ----------
138
143
  def _to_path_pair(x) -> Tuple[Path, Path]:
144
+ """Convert a tuple of path-like objects to Path instances."""
139
145
  a, b = x
140
146
  return Path(a), Path(b)
141
147
 
@@ -205,6 +211,7 @@ def concatenate_fastqs_to_bam(
205
211
 
206
212
  for rec1, rec2 in zip_longest(it1, it2, fillvalue=None):
207
213
  def _clean(n: Optional[str]) -> Optional[str]:
214
+ """Normalize FASTQ read names by trimming read suffixes."""
208
215
  if n is None:
209
216
  return None
210
217
  return re.sub(r"(?:/1$|/2$|\s[12]$)", "", n)
@@ -256,4 +263,4 @@ def concatenate_fastqs_to_bam(
256
263
  "paired_pairs_written": paired_pairs_written,
257
264
  "singletons_written": singletons_written,
258
265
  "barcodes": barcodes_in_order,
259
- }
266
+ }
@@ -1,12 +1,12 @@
1
1
  # load_adata
2
2
  ######################################################################################################
3
- import .utils
3
+ # Archived helper; legacy imports removed for syntax compatibility.
4
4
  # File I/O
5
5
  import subprocess
6
6
  import gc
7
7
 
8
8
  # bioinformatic operations
9
- import .informatics_module
9
+ # import .informatics_module
10
10
 
11
11
  # User interface
12
12
  from tqdm import tqdm
@@ -513,4 +513,4 @@ def modkit_extract_to_adata(fasta, bam, mapping_threshold, experiment_name, mods
513
513
  print(f"Deleted file: {hdf}")
514
514
  except OSError as e:
515
515
  print(f"Error deleting file {hdf}: {e}")
516
- ######################################################################################################
516
+ ######################################################################################################
@@ -86,6 +86,7 @@ def plot_bed_histograms(
86
86
 
87
87
  # Clip helper for hist tails
88
88
  def _clip_series(s, q=(0.0, 0.995)):
89
+ """Clip a Series to quantile bounds for plotting."""
89
90
  if q is None:
90
91
  return s.to_numpy()
91
92
  lo = s.quantile(q[0]) if q[0] is not None else s.min()
@@ -109,6 +110,7 @@ def plot_bed_histograms(
109
110
 
110
111
  # Pagination
111
112
  def _sanitize(name: str) -> str:
113
+ """Sanitize a string for use in filenames."""
112
114
  return "".join(ch if ch.isalnum() or ch in "-._" else "_" for ch in name)
113
115
 
114
116
  cols_per_fig = 4 if include_mapq_quality else 2
@@ -247,4 +249,4 @@ def plot_bed_histograms(
247
249
  # plt.grid(True)
248
250
  # save_name = os.path.join(plotting_directory, f'{bed_basename}_{chrom}_coverage_histogram.png')
249
251
  # plt.savefig(save_name)
250
- # plt.close()
252
+ # plt.close()
@@ -2,6 +2,12 @@ import pysam
2
2
  import sys
3
3
 
4
4
  def extract_reads(bam_file_path, num_reads=10):
5
+ """Print sequences for the first N reads in a BAM file.
6
+
7
+ Args:
8
+ bam_file_path: Path to BAM file.
9
+ num_reads: Number of reads to print.
10
+ """
5
11
  # Open the BAM file
6
12
  bam_file = pysam.AlignmentFile(bam_file_path, "rb")
7
13
 
@@ -26,4 +32,4 @@ if __name__ == "__main__":
26
32
  bam_file_path = sys.argv[1]
27
33
 
28
34
  # Call the function to extract the first 10 reads
29
- extract_reads(bam_file_path)
35
+ extract_reads(bam_file_path)