smftools 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. smftools/__init__.py +2 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/__init__.py +0 -0
  4. smftools/cli/cli_flows.py +94 -0
  5. smftools/cli/hmm_adata.py +338 -0
  6. smftools/cli/load_adata.py +577 -0
  7. smftools/cli/preprocess_adata.py +363 -0
  8. smftools/cli/spatial_adata.py +564 -0
  9. smftools/cli_entry.py +435 -0
  10. smftools/config/conversion.yaml +11 -6
  11. smftools/config/deaminase.yaml +12 -7
  12. smftools/config/default.yaml +36 -25
  13. smftools/config/direct.yaml +25 -1
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +109 -12
  16. smftools/informatics/__init__.py +13 -7
  17. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  18. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  19. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  20. smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
  21. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  22. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  23. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  24. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  25. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  26. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
  27. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  28. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  29. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  30. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  31. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  32. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
  34. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
  35. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
  36. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  37. smftools/informatics/bam_functions.py +812 -0
  38. smftools/informatics/basecalling.py +67 -0
  39. smftools/informatics/bed_functions.py +366 -0
  40. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
  41. smftools/informatics/fasta_functions.py +255 -0
  42. smftools/informatics/h5ad_functions.py +197 -0
  43. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
  44. smftools/informatics/modkit_functions.py +129 -0
  45. smftools/informatics/ohe.py +160 -0
  46. smftools/informatics/pod5_functions.py +224 -0
  47. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  48. smftools/plotting/autocorrelation_plotting.py +1 -3
  49. smftools/plotting/general_plotting.py +1037 -362
  50. smftools/preprocessing/__init__.py +2 -0
  51. smftools/preprocessing/append_base_context.py +3 -3
  52. smftools/preprocessing/append_binary_layer_by_base_context.py +4 -4
  53. smftools/preprocessing/binarize.py +17 -0
  54. smftools/preprocessing/binarize_on_Youden.py +2 -2
  55. smftools/preprocessing/calculate_position_Youden.py +1 -1
  56. smftools/preprocessing/calculate_read_modification_stats.py +1 -1
  57. smftools/preprocessing/filter_reads_on_modification_thresholds.py +19 -19
  58. smftools/preprocessing/flag_duplicate_reads.py +1 -1
  59. smftools/readwrite.py +266 -140
  60. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/METADATA +10 -9
  61. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/RECORD +82 -70
  62. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  63. smftools/cli.py +0 -184
  64. smftools/informatics/fast5_to_pod5.py +0 -24
  65. smftools/informatics/helpers/__init__.py +0 -73
  66. smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
  67. smftools/informatics/helpers/bam_qc.py +0 -66
  68. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  69. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
  70. smftools/informatics/helpers/discover_input_files.py +0 -100
  71. smftools/informatics/helpers/index_fasta.py +0 -12
  72. smftools/informatics/helpers/make_dirs.py +0 -21
  73. smftools/informatics/readwrite.py +0 -106
  74. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  75. smftools/load_adata.py +0 -1346
  76. smftools-0.2.1.dist-info/entry_points.txt +0 -2
  77. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  78. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  79. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  80. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
  81. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  82. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  83. /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
  84. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  85. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  86. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  87. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  88. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  89. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  90. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  91. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  92. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  93. /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
  94. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  95. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  96. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
smftools/readwrite.py CHANGED
@@ -1,4 +1,15 @@
  ## readwrite ##
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Union, Iterable
+
+ from pathlib import Path
+ from typing import Iterable, Sequence, Optional
+
+ import warnings
+ import pandas as pd
+ import anndata as ad
 
  ######################################################################################################
  ## Datetime functionality
@@ -21,6 +32,101 @@ def time_string():
  return current_time.strftime("%H:%M:%S")
  ######################################################################################################
 
+ ######################################################################################################
+ ## General file and directory handling
+ def make_dirs(directories: Union[str, Path, Iterable[Union[str, Path]]]) -> None:
+ """
+ Create one or multiple directories.
+
+ Parameters
+ ----------
+ directories : str | Path | list/iterable of str | Path
+ Paths of directories to create. If a file path is passed,
+ the parent directory is created.
+
+ Returns
+ -------
+ None
+ """
+
+ # allow user to pass a single string/Path
+ if isinstance(directories, (str, Path)):
+ directories = [directories]
+
+ for d in directories:
+ p = Path(d)
+
+ # If someone passes in a file path, make its parent
+ if p.suffix: # p.suffix != "" means it's a file
+ p = p.parent
+
+ p.mkdir(parents=True, exist_ok=True)
+
+ def add_or_update_column_in_csv(
+ csv_path: str | Path,
+ column_name: str,
+ values,
+ index: bool = False,
+ ):
+ """
+ Add (or overwrite) a column in a CSV file.
+ If the CSV does not exist, create it containing only that column.
+
+ Parameters
+ ----------
+ csv_path : str | Path
+ Path to the CSV file.
+ column_name : str
+ Name of the column to add or update.
+ values : list | scalar | callable
+ - If list/Series: must match the number of rows.
+ - If scalar: broadcast to all rows (or single-row CSV if new file).
+ - If callable(df): function should return the column values based on df.
+ index : bool
+ Whether to write the pandas index into the CSV. Default False.
+
+ Returns
+ -------
+ pd.DataFrame : the updated DataFrame.
+ """
+ csv_path = Path(csv_path)
+ csv_path.parent.mkdir(parents=True, exist_ok=True)
+
+ # Case 1 — CSV does not exist → create it
+ if not csv_path.exists():
+ if hasattr(values, "__len__") and not isinstance(values, str):
+ df = pd.DataFrame({column_name: list(values)})
+ else:
+ df = pd.DataFrame({column_name: [values]})
+ df.to_csv(csv_path, index=index)
+ return df
+
+ # Case 2 — CSV exists → load + modify
+ df = pd.read_csv(csv_path)
+
+ # If values is callable, call it with df
+ if callable(values):
+ values = values(df)
+
+ # Broadcast scalar
+ if not hasattr(values, "__len__") or isinstance(values, str):
+ df[column_name] = values
+ df.to_csv(csv_path, index=index)
+ return df
+
+ # Sequence case: lengths must match
+ if len(values) != len(df):
+ raise ValueError(
+ f"Length mismatch: CSV has {len(df)} rows "
+ f"but values has {len(values)} entries."
+ )
+
+ df[column_name] = list(values)
+ df.to_csv(csv_path, index=index)
+ return df
+
+ ######################################################################################################
+
 
  ######################################################################################################
  ## Numpy, Pandas, Anndata functionality
@@ -62,7 +168,6 @@ def adata_to_df(adata, layer=None):
 
  return df
 
-
  def save_matrix(matrix, save_name):
  """
  Input: A numpy matrix and a save_name
@@ -71,70 +176,173 @@ def save_matrix(matrix, save_name):
  import numpy as np
  np.savetxt(f'{save_name}.txt', matrix)
 
- def concatenate_h5ads(output_file, file_suffix='h5ad.gz', delete_inputs=True):
+ def concatenate_h5ads(
+ output_path: str | Path,
+ *,
+ input_dir: str | Path | None = None,
+ csv_path: str | Path | None = None,
+ csv_column: str = "h5ad_path",
+ file_suffixes: Sequence[str] = (".h5ad", ".h5ad.gz"),
+ delete_inputs: bool = False,
+ restore_backups: bool = True,
+ ) -> Path:
  """
- Concatenate all h5ad files in a directory and delete them after the final adata is written out.
- Input: an output file path relative to the directory in which the function is called
+ Concatenate multiple .h5ad files into one AnnData and write it safely.
+
+ Two input modes (choose ONE):
+ 1) Directory mode: use all *.h5ad / *.h5ad.gz in `input_dir`.
+ 2) CSV mode: use file paths from column `csv_column` in `csv_path`.
+
+ Parameters
+ ----------
+ output_path
+ Path to the final concatenated .h5ad (can be .h5ad or .h5ad.gz).
+ input_dir
+ Directory containing .h5ad files to concatenate. If None and csv_path
+ is also None, defaults to the current working directory.
+ csv_path
+ Path to a CSV containing file paths to concatenate (in column `csv_column`).
+ csv_column
+ Name of the column in the CSV containing .h5ad paths.
+ file_suffixes
+ Tuple of allowed suffixes (default: (".h5ad", ".h5ad.gz")).
+ delete_inputs
+ If True, delete the input .h5ad files after successful write of output.
+ restore_backups
+ Passed through to `safe_read_h5ad(restore_backups=...)`.
+
+ Returns
+ -------
+ Path
+ The path to the written concatenated .h5ad file.
+
+ Raises
+ ------
+ ValueError
+ If both `input_dir` and `csv_path` are provided, or none contain files.
+ FileNotFoundError
+ If specified CSV or directory does not exist.
  """
- import os
- import anndata as ad
- # Runtime warnings
- import warnings
- warnings.filterwarnings('ignore', category=UserWarning, module='anndata')
- warnings.filterwarnings('ignore', category=FutureWarning, module='anndata')
-
- # List all files in the directory
- files = os.listdir(os.getcwd())
- # get current working directory
- cwd = os.getcwd()
- suffix = file_suffix
- # Filter file names that contain the search string in their filename and keep them in a list
- hdfs = [hdf for hdf in files if suffix in hdf]
- # Sort file list by names and print the list of file names
- hdfs.sort()
- print('{0} sample files found: {1}'.format(len(hdfs), hdfs))
- # Iterate over all of the hdf5 files and concatenate them.
- final_adata = None
- for hdf in hdfs:
- print('{0}: Reading in {1} hdf5 file'.format(time_string(), hdf))
- temp_adata = ad.read_h5ad(hdf)
- if final_adata:
- print('{0}: Concatenating final adata object with {1} hdf5 file'.format(time_string(), hdf))
- final_adata = ad.concat([final_adata, temp_adata], join='outer', index_unique=None)
- else:
- print('{0}: Initializing final adata object with {1} hdf5 file'.format(time_string(), hdf))
- final_adata = temp_adata
- print('{0}: Writing final concatenated hdf5 file'.format(time_string()))
- final_adata.write_h5ad(output_file, compression='gzip')
 
- # Delete the individual h5ad files and only keep the final concatenated file
+ # ------------------------------------------------------------------
+ # Setup and input resolution
+ # ------------------------------------------------------------------
+ output_path = Path(output_path)
+
+ if input_dir is not None and csv_path is not None:
+ raise ValueError("Provide either `input_dir` OR `csv_path`, not both.")
+
+ if csv_path is None:
+ # Directory mode
+ input_dir = Path(input_dir) if input_dir is not None else Path.cwd()
+ if not input_dir.exists():
+ raise FileNotFoundError(f"Input directory does not exist: {input_dir}")
+ if not input_dir.is_dir():
+ raise ValueError(f"input_dir is not a directory: {input_dir}")
+
+ # collect all *.h5ad / *.h5ad.gz (or whatever file_suffixes specify)
+ suffixes_lower = tuple(s.lower() for s in file_suffixes)
+ h5_paths = sorted(
+ p for p in input_dir.iterdir()
+ if p.is_file() and p.suffix.lower() in suffixes_lower
+ )
+
+ else:
+ # CSV mode
+ csv_path = Path(csv_path)
+ if not csv_path.exists():
+ raise FileNotFoundError(f"CSV path does not exist: {csv_path}")
+
+ df = pd.read_csv(csv_path, dtype=str)
+ if csv_column not in df.columns:
+ raise ValueError(
+ f"CSV {csv_path} must contain column '{csv_column}' with .h5ad paths."
+ )
+ paths = df[csv_column].dropna().astype(str).tolist()
+ if not paths:
+ raise ValueError(f"No non-empty paths in column '{csv_column}' of {csv_path}.")
+
+ h5_paths = [Path(p).expanduser() for p in paths]
+
+ if not h5_paths:
+ raise ValueError("No input .h5ad files found to concatenate.")
+
+ # Ensure directory for output exists
+ output_path.parent.mkdir(parents=True, exist_ok=True)
+
+ # ------------------------------------------------------------------
+ # Concatenate
+ # ------------------------------------------------------------------
+ warnings.filterwarnings("ignore", category=UserWarning, module="anndata")
+ warnings.filterwarnings("ignore", category=FutureWarning, module="anndata")
+
+ print(f"{time_string()}: Found {len(h5_paths)} input h5ad files:")
+ for p in h5_paths:
+ print(f" - {p}")
+
+ final_adata: Optional[ad.AnnData] = None
+
+ for p in h5_paths:
+ print(f"{time_string()}: Reading {p}")
+ temp_adata, read_report = safe_read_h5ad(p, restore_backups=restore_backups)
+
+ if final_adata is None:
+ print(f"{time_string()}: Initializing final AnnData with {p}")
+ final_adata = temp_adata
+ else:
+ print(f"{time_string()}: Concatenating {p} into final AnnData")
+ final_adata = ad.concat(
+ [final_adata, temp_adata],
+ join="outer",
+ merge='unique',
+ uns_merge='unique',
+ index_unique=None,
+ )
+
+ if final_adata is None:
+ raise RuntimeError("Unexpected: no AnnData objects loaded.")
+
+ print(f"{time_string()}: Writing concatenated AnnData to {output_path}")
+ safe_write_h5ad(final_adata, output_path, backup=restore_backups)
+
+ # ------------------------------------------------------------------
+ # Optional cleanup (delete inputs)
+ # ------------------------------------------------------------------
  if delete_inputs:
- files = os.listdir(os.getcwd())
- hdfs = [hdf for hdf in files if suffix in hdf]
- if output_file in hdfs:
- hdfs.remove(output_file)
- # Iterate over the files and delete them
- for hdf in hdfs:
- try:
- os.remove(hdf)
- print(f"Deleted file: {hdf}")
- except OSError as e:
- print(f"Error deleting file {hdf}: {e}")
+ out_resolved = output_path.resolve()
+ for p in h5_paths:
+ try:
+ # Don't delete the output file if it happens to be in the list
+ if p.resolve() == out_resolved:
+ continue
+ if p.exists():
+ p.unlink()
+ print(f"Deleted input file: {p}")
+ except OSError as e:
+ print(f"Error deleting file {p}: {e}")
  else:
- print('Keeping input files')
+ print("Keeping input files.")
 
- def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir="./uns_backups", verbose=True):
+ return output_path
+
+ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=None, verbose=True):
  """
  Save an AnnData safely by sanitizing .obs, .var, .uns, .layers, and .obsm.
 
  Returns a report dict and prints a summary of what was converted/backed up/skipped.
  """
  import os, json, pickle
+ from pathlib import Path
  import numpy as np
  import pandas as pd
  import warnings
  import anndata as _ad
 
+ path = Path(path)
+
+ if not backup_dir:
+ backup_dir = path.parent / str(path.name).split(".")[0]
+
  os.makedirs(backup_dir, exist_ok=True)
 
  # report structure
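
The hunk above replaces the old cwd-based concatenate_h5ads with a keyword-only signature that supports directory and CSV input modes. A minimal calling sketch follows; it is not part of the diff, the paths are hypothetical, and it assumes the 0.2.3 signature shown above is importable from smftools.readwrite.

# Usage sketch (not part of the diff); paths are hypothetical.
from smftools.readwrite import concatenate_h5ads

# Directory mode: concatenate the .h5ad files found in batch_h5ads/.
out = concatenate_h5ads("merged/experiment.h5ad.gz", input_dir="batch_h5ads")

# CSV mode: read input paths from the h5ad_path column of a manifest CSV.
out = concatenate_h5ads(
    "merged/experiment.h5ad.gz",
    csv_path="manifest.csv",
    csv_column="h5ad_path",
    delete_inputs=False,
)
print(out)  # Path to the written file
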
@@ -155,7 +363,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=".
155
363
 
156
364
  def _backup(obj, name):
157
365
  """Pickle obj to backup_dir/name.pkl and return filename (or None)."""
158
- fname = os.path.join(backup_dir, f"{name}.pkl")
366
+ fname = backup_dir / f"{name}.pkl"
159
367
  try:
160
368
  with open(fname, "wb") as fh:
161
369
  pickle.dump(obj, fh, protocol=pickle.HIGHEST_PROTOCOL)
@@ -516,7 +724,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=".
516
724
  print("=== end report ===\n")
517
725
  return report
518
726
 
519
- def safe_read_h5ad(path, backup_dir="./uns_backups", restore_backups=True, re_categorize=True, categorical_threshold=100, verbose=True):
727
+ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=True, categorical_threshold=100, verbose=True):
520
728
  """
521
729
  Safely load an AnnData saved by safe_write_h5ad and attempt to restore complex objects
522
730
  from the backup_dir produced during save.
@@ -545,12 +753,18 @@ def safe_read_h5ad(path, backup_dir="./uns_backups", restore_backups=True, re_ca
545
753
  A report describing restored items, parsed JSON keys, and any failures.
546
754
  """
547
755
  import os
756
+ from pathlib import Path
548
757
  import json
549
758
  import pickle
550
759
  import numpy as np
551
760
  import pandas as pd
552
761
  import anndata as _ad
553
762
 
763
+ path = Path(path)
764
+
765
+ if not backup_dir:
766
+ backup_dir = path.parent / str(path.name).split(".")[0]
767
+
554
768
  report = {
555
769
  "restored_obs_columns": [],
556
770
  "restored_var_columns": [],
@@ -574,7 +788,6 @@ def safe_read_h5ad(path, backup_dir="./uns_backups", restore_backups=True, re_ca
574
788
  raise RuntimeError(f"Failed to read h5ad at {path}: {e}")
575
789
 
576
790
  # Ensure backup_dir exists (may be relative to cwd)
577
- backup_dir = os.path.abspath(backup_dir)
578
791
  if verbose:
579
792
  print(f"[safe_read_h5ad] looking for backups in {backup_dir}")
580
793
 
@@ -594,8 +807,8 @@ def safe_read_h5ad(path, backup_dir="./uns_backups", restore_backups=True, re_ca
594
807
  # 2) Restore obs columns
595
808
  for col in list(adata.obs.columns):
596
809
  # Look for backup with exact naming from safe_write_h5ad: "obs.<col>_backup.pkl" or "obs.<col>_categorical_backup.pkl"
597
- bname1 = os.path.join(backup_dir, f"obs.{col}_backup.pkl")
598
- bname2 = os.path.join(backup_dir, f"obs.{col}_categorical_backup.pkl")
810
+ bname1 = backup_dir / f"obs.{col}_backup.pkl"
811
+ bname2 = backup_dir / f"obs.{col}_categorical_backup.pkl"
599
812
  restored = False
600
813
 
601
814
  if restore_backups:
@@ -869,93 +1082,6 @@ def safe_read_h5ad(path, backup_dir="./uns_backups", restore_backups=True, re_ca
869
1082
 
870
1083
  return adata, report
871
1084
 
872
-
873
- # def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir="./", verbose=True):
874
- # """
875
- # Saves an AnnData object safely by omitting problematic columns from .obs and .var.
876
-
877
- # Parameters:
878
- # adata (AnnData): The AnnData object to save.
879
- # path (str): Output .h5ad file path.
880
- # compression (str): Compression method for h5ad file.
881
- # backup (bool): If True, saves problematic columns to CSV files.
882
- # backup_dir (str): Directory to store backups if backup=True.
883
- # """
884
- # import anndata as ad
885
- # import pandas as pd
886
- # import os
887
- # import numpy as np
888
- # import json
889
-
890
- # os.makedirs(backup_dir, exist_ok=True)
891
-
892
- # def filter_df(df, df_name):
893
- # bad_cols = []
894
- # for col in df.columns:
895
- # if df[col].dtype == 'object':
896
- # if not df[col].apply(lambda x: isinstance(x, (str, type(None)))).all():
897
- # bad_cols.append(col)
898
- # elif pd.api.types.is_categorical_dtype(df[col]):
899
- # if not all(isinstance(x, (str, type(None))) for x in df[col].cat.categories):
900
- # bad_cols.append(col)
901
- # if bad_cols and verbose:
902
- # print(f"Skipping columns from {df_name}: {bad_cols}")
903
- # if backup and bad_cols:
904
- # df[bad_cols].to_csv(os.path.join(backup_dir, f"{df_name}_skipped_columns.csv"))
905
- # if verbose:
906
- # print(f"Backed up skipped columns to {backup_dir}/{df_name}_skipped_columns.csv")
907
- # return df.drop(columns=bad_cols)
908
-
909
- # def is_serializable(val):
910
- # try:
911
- # json.dumps(val)
912
- # return True
913
- # except (TypeError, OverflowError):
914
- # return False
915
-
916
- # def clean_uns(uns_dict):
917
- # clean_uns = {}
918
- # bad_keys = []
919
- # for k, v in uns_dict.items():
920
- # if isinstance(v, (str, int, float, type(None), list, np.ndarray, pd.DataFrame, dict)):
921
- # clean_uns[k] = v
922
- # elif is_serializable(v):
923
- # clean_uns[k] = v
924
- # else:
925
- # bad_keys.append(k)
926
- # if backup:
927
- # try:
928
- # with open(os.path.join(backup_dir, f"uns_{k}_backup.txt"), "w") as f:
929
- # f.write(str(v))
930
- # except Exception:
931
- # pass
932
- # if bad_keys and verbose:
933
- # print(f"Skipping entries from .uns: {bad_keys}")
934
- # return clean_uns
935
-
936
- # # Clean obs and var and uns
937
- # obs_clean = filter_df(adata.obs, "obs")
938
- # var_clean = filter_df(adata.var, "var")
939
- # uns_clean = clean_uns(adata.uns)
940
-
941
- # # Save clean version
942
- # adata_copy = ad.AnnData(
943
- # X=adata.X,
944
- # obs=obs_clean,
945
- # var=var_clean,
946
- # layers=adata.layers,
947
- # uns=uns_clean,
948
- # obsm=adata.obsm,
949
- # varm=adata.varm
950
- # )
951
-
952
- # adata_copy.obs_names = adata_copy.obs_names.astype(str)
953
- # adata_copy.var_names = adata_copy.var_names.astype(str)
954
-
955
- # adata_copy.write_h5ad(path, compression=compression)
956
-
957
- # print(f"Saved safely to {path}")
958
-
959
1085
  def merge_barcoded_anndatas_core(adata_single, adata_double):
960
1086
  import numpy as np
961
1087
  import anndata as ad
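
The safe_write_h5ad / safe_read_h5ad changes above switch backup_dir from a fixed "./uns_backups" default to None, in which case the backup folder is derived from the h5ad file name next to the output. A sketch of that behavior, not part of the diff, using a hypothetical AnnData and hypothetical paths:

# Usage sketch (not part of the diff); object and paths are hypothetical.
import anndata as ad
import numpy as np
from smftools.readwrite import safe_write_h5ad, safe_read_h5ad

adata = ad.AnnData(X=np.zeros((3, 4)))

# With backup_dir=None (the new default), backups for results/demo.h5ad.gz
# are written under results/demo/ (the file name up to its first dot).
safe_write_h5ad(adata, "results/demo.h5ad.gz", backup=True)

# safe_read_h5ad derives the same folder and restores backed-up objects from it.
adata2, report = safe_read_h5ad("results/demo.h5ad.gz", restore_backups=True)
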
{smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: smftools
- Version: 0.2.1
+ Version: 0.2.3
  Summary: Single Molecule Footprinting Analysis in Python.
  Project-URL: Source, https://github.com/jkmckenna/smftools
  Project-URL: Documentation, https://smftools.readthedocs.io/
@@ -43,7 +43,7 @@ Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
  Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
  Classifier: Topic :: Scientific/Engineering :: Visualization
- Requires-Python: >=3.9
+ Requires-Python: <3.13,>=3.9
  Requires-Dist: anndata>=0.10.0
  Requires-Dist: biopython>=1.79
  Requires-Dist: captum
@@ -59,7 +59,8 @@ Requires-Dist: numpy<2,>=1.22.0
59
59
  Requires-Dist: omegaconf
60
60
  Requires-Dist: pandas>=1.4.2
61
61
  Requires-Dist: pod5>=0.1.21
62
- Requires-Dist: pomegranate>=1.0.0
62
+ Requires-Dist: pybedtools>=0.12.0
63
+ Requires-Dist: pybigwig>=0.3.24
63
64
  Requires-Dist: pyfaidx>=0.8.0
64
65
  Requires-Dist: pysam>=0.19.1
65
66
  Requires-Dist: scanpy>=1.9
@@ -102,12 +103,9 @@ While most genomic data structures handle low-coverage data (<100X) along large
102
103
 
103
104
  ## Dependencies
104
105
  The following CLI tools need to be installed and configured before using the informatics (smftools.inform) module of smftools:
105
- 1) [Dorado](https://github.com/nanoporetech/dorado) -> For standard/modified basecalling and alignment. Can be attained by downloading and configuring nanopore MinKnow software.
106
- 2) [Samtools](https://github.com/samtools/samtools) -> For working with SAM/BAM files
107
- 3) [Minimap2](https://github.com/lh3/minimap2) -> The aligner used by Dorado
108
- 4) [Modkit](https://github.com/nanoporetech/modkit) -> Extracting summary statistics and read level methylation calls from modified BAM files
109
- 5) [Bedtools](https://github.com/arq5x/bedtools2) -> For generating Bedgraphs from BAM alignment files.
110
- 6) [BedGraphToBigWig](https://genome.ucsc.edu/goldenPath/help/bigWig.html) -> For converting BedGraphs to BigWig files for IGV sessions.
106
+ 1) [Dorado](https://github.com/nanoporetech/dorado) -> Basecalling, alignment, demultiplexing.
107
+ 2) [Minimap2](https://github.com/lh3/minimap2) -> Alignment if not using dorado.
108
+ 3) [Modkit](https://github.com/nanoporetech/modkit) -> Extracting read level methylation metrics from modified BAM files.
111
109
 
112
110
  ## Modules
113
111
  ### Informatics: Processes raw Nanopore/Illumina data from SMF experiments into an AnnData object.
@@ -122,6 +120,9 @@ The following CLI tools need to be installed and configured before using the inf
 
  ## Announcements
 
+ ### 11/05/25 - Version 0.2.1 is available through PyPI
+ Version 0.2.1 makes the core workflow (smftools load) a command line tool that takes in an experiment_config.csv file for input/output and parameter management.
+
  ### 05/29/25 - Version 0.1.6 is available through PyPI.
  Informatics, preprocessing, tools, plotting modules have core functionality that is approaching stability on MacOS(Intel/Silicon) and Linux(Ubuntu). I will work on improving documentation/tutorials shortly. The base PyTorch/Scikit-Learn ML-infrastructure is going through some organizational changes to work with PyTorch Lightning, Hydra, and WanDB to facilitate organizational scaling, multi-device usage, and logging.