smftools 0.1.6__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes and reflects the changes between the two versions as they appear in their public registry.
Files changed (162)
  1. smftools/__init__.py +34 -0
  2. smftools/_settings.py +20 -0
  3. smftools/_version.py +1 -0
  4. smftools/cli.py +184 -0
  5. smftools/config/__init__.py +1 -0
  6. smftools/config/conversion.yaml +33 -0
  7. smftools/config/deaminase.yaml +56 -0
  8. smftools/config/default.yaml +253 -0
  9. smftools/config/direct.yaml +17 -0
  10. smftools/config/experiment_config.py +1191 -0
  11. smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
  12. smftools/datasets/F1_sample_sheet.csv +5 -0
  13. smftools/datasets/__init__.py +9 -0
  14. smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
  15. smftools/datasets/datasets.py +28 -0
  16. smftools/hmm/HMM.py +1576 -0
  17. smftools/hmm/__init__.py +20 -0
  18. smftools/hmm/apply_hmm_batched.py +242 -0
  19. smftools/hmm/calculate_distances.py +18 -0
  20. smftools/hmm/call_hmm_peaks.py +106 -0
  21. smftools/hmm/display_hmm.py +18 -0
  22. smftools/hmm/hmm_readwrite.py +16 -0
  23. smftools/hmm/nucleosome_hmm_refinement.py +104 -0
  24. smftools/hmm/train_hmm.py +78 -0
  25. smftools/informatics/__init__.py +14 -0
  26. smftools/informatics/archived/bam_conversion.py +59 -0
  27. smftools/informatics/archived/bam_direct.py +63 -0
  28. smftools/informatics/archived/basecalls_to_adata.py +71 -0
  29. smftools/informatics/archived/conversion_smf.py +132 -0
  30. smftools/informatics/archived/deaminase_smf.py +132 -0
  31. smftools/informatics/archived/direct_smf.py +137 -0
  32. smftools/informatics/archived/print_bam_query_seq.py +29 -0
  33. smftools/informatics/basecall_pod5s.py +80 -0
  34. smftools/informatics/fast5_to_pod5.py +24 -0
  35. smftools/informatics/helpers/__init__.py +73 -0
  36. smftools/informatics/helpers/align_and_sort_BAM.py +86 -0
  37. smftools/informatics/helpers/aligned_BAM_to_bed.py +85 -0
  38. smftools/informatics/helpers/archived/informatics.py +260 -0
  39. smftools/informatics/helpers/archived/load_adata.py +516 -0
  40. smftools/informatics/helpers/bam_qc.py +66 -0
  41. smftools/informatics/helpers/bed_to_bigwig.py +39 -0
  42. smftools/informatics/helpers/binarize_converted_base_identities.py +172 -0
  43. smftools/informatics/helpers/canoncall.py +34 -0
  44. smftools/informatics/helpers/complement_base_list.py +21 -0
  45. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +378 -0
  46. smftools/informatics/helpers/converted_BAM_to_adata.py +245 -0
  47. smftools/informatics/helpers/converted_BAM_to_adata_II.py +505 -0
  48. smftools/informatics/helpers/count_aligned_reads.py +43 -0
  49. smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
  50. smftools/informatics/helpers/discover_input_files.py +100 -0
  51. smftools/informatics/helpers/extract_base_identities.py +70 -0
  52. smftools/informatics/helpers/extract_mods.py +83 -0
  53. smftools/informatics/helpers/extract_read_features_from_bam.py +33 -0
  54. smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
  55. smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
  56. smftools/informatics/helpers/find_conversion_sites.py +51 -0
  57. smftools/informatics/helpers/generate_converted_FASTA.py +99 -0
  58. smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
  59. smftools/informatics/helpers/get_native_references.py +28 -0
  60. smftools/informatics/helpers/index_fasta.py +12 -0
  61. smftools/informatics/helpers/make_dirs.py +21 -0
  62. smftools/informatics/helpers/make_modbed.py +27 -0
  63. smftools/informatics/helpers/modQC.py +27 -0
  64. smftools/informatics/helpers/modcall.py +36 -0
  65. smftools/informatics/helpers/modkit_extract_to_adata.py +887 -0
  66. smftools/informatics/helpers/ohe_batching.py +76 -0
  67. smftools/informatics/helpers/ohe_layers_decode.py +32 -0
  68. smftools/informatics/helpers/one_hot_decode.py +27 -0
  69. smftools/informatics/helpers/one_hot_encode.py +57 -0
  70. smftools/informatics/helpers/plot_bed_histograms.py +269 -0
  71. smftools/informatics/helpers/run_multiqc.py +28 -0
  72. smftools/informatics/helpers/separate_bam_by_bc.py +43 -0
  73. smftools/informatics/helpers/split_and_index_BAM.py +32 -0
  74. smftools/informatics/readwrite.py +106 -0
  75. smftools/informatics/subsample_fasta_from_bed.py +47 -0
  76. smftools/informatics/subsample_pod5.py +104 -0
  77. smftools/load_adata.py +1346 -0
  78. smftools/machine_learning/__init__.py +12 -0
  79. smftools/machine_learning/data/__init__.py +2 -0
  80. smftools/machine_learning/data/anndata_data_module.py +234 -0
  81. smftools/machine_learning/data/preprocessing.py +6 -0
  82. smftools/machine_learning/evaluation/__init__.py +2 -0
  83. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  84. smftools/machine_learning/evaluation/evaluators.py +223 -0
  85. smftools/machine_learning/inference/__init__.py +3 -0
  86. smftools/machine_learning/inference/inference_utils.py +27 -0
  87. smftools/machine_learning/inference/lightning_inference.py +68 -0
  88. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  89. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  90. smftools/machine_learning/models/__init__.py +9 -0
  91. smftools/machine_learning/models/base.py +295 -0
  92. smftools/machine_learning/models/cnn.py +138 -0
  93. smftools/machine_learning/models/lightning_base.py +345 -0
  94. smftools/machine_learning/models/mlp.py +26 -0
  95. smftools/machine_learning/models/positional.py +18 -0
  96. smftools/machine_learning/models/rnn.py +17 -0
  97. smftools/machine_learning/models/sklearn_models.py +273 -0
  98. smftools/machine_learning/models/transformer.py +303 -0
  99. smftools/machine_learning/models/wrappers.py +20 -0
  100. smftools/machine_learning/training/__init__.py +2 -0
  101. smftools/machine_learning/training/train_lightning_model.py +135 -0
  102. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  103. smftools/machine_learning/utils/__init__.py +2 -0
  104. smftools/machine_learning/utils/device.py +10 -0
  105. smftools/machine_learning/utils/grl.py +14 -0
  106. smftools/plotting/__init__.py +18 -0
  107. smftools/plotting/autocorrelation_plotting.py +611 -0
  108. smftools/plotting/classifiers.py +355 -0
  109. smftools/plotting/general_plotting.py +682 -0
  110. smftools/plotting/hmm_plotting.py +260 -0
  111. smftools/plotting/position_stats.py +462 -0
  112. smftools/plotting/qc_plotting.py +270 -0
  113. smftools/preprocessing/__init__.py +38 -0
  114. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  115. smftools/preprocessing/append_base_context.py +122 -0
  116. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  117. smftools/preprocessing/archives/mark_duplicates.py +146 -0
  118. smftools/preprocessing/archives/preprocessing.py +614 -0
  119. smftools/preprocessing/archives/remove_duplicates.py +21 -0
  120. smftools/preprocessing/binarize_on_Youden.py +45 -0
  121. smftools/preprocessing/binary_layers_to_ohe.py +40 -0
  122. smftools/preprocessing/calculate_complexity.py +72 -0
  123. smftools/preprocessing/calculate_complexity_II.py +248 -0
  124. smftools/preprocessing/calculate_consensus.py +47 -0
  125. smftools/preprocessing/calculate_coverage.py +51 -0
  126. smftools/preprocessing/calculate_pairwise_differences.py +49 -0
  127. smftools/preprocessing/calculate_pairwise_hamming_distances.py +27 -0
  128. smftools/preprocessing/calculate_position_Youden.py +115 -0
  129. smftools/preprocessing/calculate_read_length_stats.py +79 -0
  130. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  131. smftools/preprocessing/clean_NaN.py +62 -0
  132. smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
  133. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  134. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  135. smftools/preprocessing/flag_duplicate_reads.py +1351 -0
  136. smftools/preprocessing/invert_adata.py +37 -0
  137. smftools/preprocessing/load_sample_sheet.py +53 -0
  138. smftools/preprocessing/make_dirs.py +21 -0
  139. smftools/preprocessing/min_non_diagonal.py +25 -0
  140. smftools/preprocessing/recipes.py +127 -0
  141. smftools/preprocessing/subsample_adata.py +58 -0
  142. smftools/readwrite.py +1004 -0
  143. smftools/tools/__init__.py +20 -0
  144. smftools/tools/archived/apply_hmm.py +202 -0
  145. smftools/tools/archived/classifiers.py +787 -0
  146. smftools/tools/archived/classify_methylated_features.py +66 -0
  147. smftools/tools/archived/classify_non_methylated_features.py +75 -0
  148. smftools/tools/archived/subset_adata_v1.py +32 -0
  149. smftools/tools/archived/subset_adata_v2.py +46 -0
  150. smftools/tools/calculate_umap.py +62 -0
  151. smftools/tools/cluster_adata_on_methylation.py +105 -0
  152. smftools/tools/general_tools.py +69 -0
  153. smftools/tools/position_stats.py +601 -0
  154. smftools/tools/read_stats.py +184 -0
  155. smftools/tools/spatial_autocorrelation.py +562 -0
  156. smftools/tools/subset_adata.py +28 -0
  157. {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/METADATA +9 -2
  158. smftools-0.2.1.dist-info/RECORD +161 -0
  159. smftools-0.2.1.dist-info/entry_points.txt +2 -0
  160. smftools-0.1.6.dist-info/RECORD +0 -4
  161. {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/WHEEL +0 -0
  162. {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/licenses/LICENSE +0 -0
smftools/preprocessing/append_binary_layer_by_base_context.py
@@ -0,0 +1,143 @@
+ import numpy as np
+ import scipy.sparse as sp
+
+ def append_binary_layer_by_base_context(
+     adata,
+     reference_column: str,
+     smf_modality: str = "conversion",
+     verbose: bool = True,
+     uns_flag: str = "binary_layers_by_base_context_added",
+     bypass: bool = False,
+     force_redo: bool = False
+ ):
+     """
+     Build per-reference C/G-site masked layers:
+       - GpC_site_binary
+       - CpG_site_binary
+       - GpC_CpG_combined_site_binary (numeric sum where present; NaN where neither is present)
+       - any_C_site_binary
+       - other_C_site_binary
+
+     Behavior:
+       - If X is sparse, it is converted to dense for these layers (the original adata.X is left untouched).
+       - Missing var columns produce a warning but do not crash.
+       - Masked positions are filled with np.nan to make masked vs. unmasked explicit.
+       - Requires append_base_context to be run first.
+     """
+
+     # Only run if not already performed
+     already = bool(adata.uns.get(uns_flag, False))
+     if (already and not force_redo) or bypass or ("base_context_added" not in adata.uns):
+         # Layers already built, bypass requested, or base context missing; nothing to do
+         return adata
+
+     # check inputs
+     if reference_column not in adata.obs.columns:
+         raise KeyError(f"reference_column '{reference_column}' not found in adata.obs")
+
+     # modality flag (kept for potential downstream use)
+     if smf_modality != "direct":
+         if smf_modality == "conversion":
+             deaminase = False
+         else:
+             deaminase = True
+     else:
+         deaminase = None  # unused but preserved
+
+     # expected per-reference var column names
+     references = adata.obs[reference_column].astype("category").cat.categories
+     reference_to_gpc_column = {ref: f"{ref}_GpC_site" for ref in references}
+     reference_to_cpg_column = {ref: f"{ref}_CpG_site" for ref in references}
+     reference_to_c_column = {ref: f"{ref}_any_C_site" for ref in references}
+     reference_to_other_c_column = {ref: f"{ref}_other_C_site" for ref in references}
+
+     # verify var columns exist and build boolean masks per ref (len = n_vars)
+     n_obs, n_vars = adata.shape
+     def _col_mask_or_warn(colname):
+         if colname not in adata.var.columns:
+             if verbose:
+                 print(f"Warning: var column '{colname}' not found; treating as all-False mask.")
+             return np.zeros(n_vars, dtype=bool)
+         vals = adata.var[colname].values
+         # coerce truthiness
+         try:
+             return vals.astype(bool)
+         except Exception:
+             return np.array([bool(v) for v in vals], dtype=bool)
+
+     gpc_var_masks = {ref: _col_mask_or_warn(col) for ref, col in reference_to_gpc_column.items()}
+     cpg_var_masks = {ref: _col_mask_or_warn(col) for ref, col in reference_to_cpg_column.items()}
+     c_var_masks = {ref: _col_mask_or_warn(col) for ref, col in reference_to_c_column.items()}
+     other_c_var_masks = {ref: _col_mask_or_warn(col) for ref, col in reference_to_other_c_column.items()}
+
+     # prepare X as dense float32 for layer filling (adata.X is left untouched)
+     X = adata.X
+     if sp.issparse(X):
+         if verbose:
+             print("Converting sparse X to dense array for layer construction (temporary).")
+         X = X.toarray()
+     X = np.asarray(X, dtype=np.float32)
+
+     # initialize masked arrays filled with NaN
+     masked_gpc = np.full((n_obs, n_vars), np.nan, dtype=np.float32)
+     masked_cpg = np.full((n_obs, n_vars), np.nan, dtype=np.float32)
+     masked_any_c = np.full((n_obs, n_vars), np.nan, dtype=np.float32)
+     masked_other_c = np.full((n_obs, n_vars), np.nan, dtype=np.float32)
+
+     # fill row-blocks per reference (this avoids creating a full row×var boolean mask)
+     obs_ref_series = adata.obs[reference_column]
+     for ref in references:
+         rows_mask = (obs_ref_series.values == ref)
+         if not rows_mask.any():
+             continue
+         row_idx = np.nonzero(rows_mask)[0]  # integer indices of rows for this ref
+
+         # column masks for this ref
+         gpc_cols = gpc_var_masks.get(ref, np.zeros(n_vars, dtype=bool))
+         cpg_cols = cpg_var_masks.get(ref, np.zeros(n_vars, dtype=bool))
+         c_cols = c_var_masks.get(ref, np.zeros(n_vars, dtype=bool))
+         other_c_cols = other_c_var_masks.get(ref, np.zeros(n_vars, dtype=bool))
+
+         if gpc_cols.any():
+             # assign only the submatrix (rows x selected cols)
+             masked_gpc[np.ix_(row_idx, gpc_cols)] = X[np.ix_(row_idx, gpc_cols)]
+         if cpg_cols.any():
+             masked_cpg[np.ix_(row_idx, cpg_cols)] = X[np.ix_(row_idx, cpg_cols)]
+         if c_cols.any():
+             masked_any_c[np.ix_(row_idx, c_cols)] = X[np.ix_(row_idx, c_cols)]
+         if other_c_cols.any():
+             masked_other_c[np.ix_(row_idx, other_c_cols)] = X[np.ix_(row_idx, other_c_cols)]
+
+     # Build combined layer:
+     # - numeric sum where either site type is present;
+     #   NaN is preserved where both the GpC and CpG layers are NaN
+     gpc_nan = np.isnan(masked_gpc)
+     cpg_nan = np.isnan(masked_cpg)
+     combined_sum = np.nan_to_num(masked_gpc, nan=0.0) + np.nan_to_num(masked_cpg, nan=0.0)
+     both_nan = gpc_nan & cpg_nan
+     combined_sum[both_nan] = np.nan
+
+     # Alternative: if a boolean OR combined layer is preferred, uncomment:
+     # combined_bool = (~gpc_nan & (masked_gpc != 0)) | (~cpg_nan & (masked_cpg != 0))
+     # combined_layer = combined_bool.astype(np.float32)
+
+     adata.layers['GpC_site_binary'] = masked_gpc
+     adata.layers['CpG_site_binary'] = masked_cpg
+     adata.layers['GpC_CpG_combined_site_binary'] = combined_sum
+     adata.layers['any_C_site_binary'] = masked_any_c
+     adata.layers['other_C_site_binary'] = masked_other_c
+
+     if verbose:
+         def _filled_positions(arr):
+             return int(np.sum(~np.isnan(arr)))
+         print("Layer build summary (non-NaN cell counts):")
+         print(f"  GpC: {_filled_positions(masked_gpc)}")
+         print(f"  CpG: {_filled_positions(masked_cpg)}")
+         print(f"  GpC+CpG combined: {_filled_positions(combined_sum)}")
+         print(f"  any_C: {_filled_positions(masked_any_c)}")
+         print(f"  other_C: {_filled_positions(masked_other_c)}")
+
+     # mark as done
+     adata.uns[uns_flag] = True
+
+     return adata
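
For orientation, here is a minimal usage sketch (not part of the diff). It assumes the function is importable from smftools.preprocessing, and it fakes by hand the prerequisites that append_base_context would normally create: the "base_context_added" uns flag and the per-reference '{ref}_*_site' var columns. The reference name "refA" and all values below are hypothetical.

# Illustrative only: toy AnnData wired with the var columns and uns flag
# that append_binary_layer_by_base_context expects.
import anndata as ad
import numpy as np
import pandas as pd
from smftools.preprocessing import append_binary_layer_by_base_context  # assumed import path

X = np.random.default_rng(0).integers(0, 2, size=(4, 6)).astype(np.float32)
obs = pd.DataFrame({"Reference": pd.Categorical(["refA"] * 4)},
                   index=[f"read{i}" for i in range(4)])
var = pd.DataFrame({
    "refA_GpC_site":     [True, False, True, False, False, False],
    "refA_CpG_site":     [False, True, False, False, True, False],
    "refA_any_C_site":   [True, True, True, False, True, False],
    "refA_other_C_site": [False, False, False, False, False, False],
}, index=[f"pos{i}" for i in range(6)])

adata = ad.AnnData(X=X, obs=obs, var=var)
adata.uns["base_context_added"] = True  # prerequisite flag normally set by append_base_context

adata = append_binary_layer_by_base_context(adata, reference_column="Reference")
print(adata.layers["GpC_site_binary"])  # values at GpC sites, NaN everywhere else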
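
One detail worth isolating is the combined layer's NaN rule. A tiny demo of the same np.nan_to_num pattern used above: positions where both inputs are NaN stay NaN, and otherwise NaN contributes 0 to the sum.

import numpy as np

gpc = np.array([1.0, np.nan, np.nan, 0.0])
cpg = np.array([np.nan, 1.0, np.nan, 1.0])
combined = np.nan_to_num(gpc, nan=0.0) + np.nan_to_num(cpg, nan=0.0)
combined[np.isnan(gpc) & np.isnan(cpg)] = np.nan  # NaN only where neither site type is present
print(combined)  # [ 1.  1. nan  1.]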
smftools/preprocessing/archives/mark_duplicates.py
@@ -0,0 +1,146 @@
+ ## mark_duplicates
+
+ def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_names', method='N_masked_distances', distance_thresholds={}):
+     """
+     Marks duplicate reads in the adata object.
+
+     Parameters:
+         adata (AnnData): An adata object.
+         layers (list): A list of strings representing the layers to use.
+         obs_column (str): The obs column name to first subset on. Default is 'Reference'.
+         sample_col (str): The obs column name to second subset on. Default is 'Sample_names'.
+         method (str): The method to use for calculating the distance metric.
+         distance_thresholds (dict): A dictionary keyed by obs_column categories that maps to the float distance threshold to apply. Default is an empty dict.
+
+     Returns:
+         None
+     """
+
+     import numpy as np
+     import pandas as pd
+     import matplotlib.pyplot as plt
+     from scipy.signal import find_peaks
+     import networkx as nx
+     from .binary_layers_to_ohe import binary_layers_to_ohe
+     from .calculate_pairwise_differences import calculate_pairwise_differences
+     from .min_non_diagonal import min_non_diagonal
+
+     categories = adata.obs[obs_column].cat.categories
+     sample_names = adata.obs[sample_col].cat.categories
+
+     # Calculate the pairwise Hamming distances within each reference/sample set and determine a distance threshold for each reference/sample pair
+     adata.obs['Nearest_neighbor_Hamming_distance'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
+     cat_sample_dict = {}
+     for cat in categories:
+         cat_subset = adata[adata.obs[obs_column] == cat].copy()
+         for sample in sample_names:
+             sample_subset = cat_subset[cat_subset.obs[sample_col] == sample].copy()
+             sample_subset = sample_subset[:, sample_subset.var[f'{cat}_any_C_site'] == True].copy()  # only uses C sites from the converted strand
+             # Encode sequencing reads as one-hot encodings
+             print(f'One-hot encoding reads from {sample} on {cat}')
+             cat_sample_dict[f'{cat}_{sample}_read_OHE_dict'] = binary_layers_to_ohe(sample_subset, layers, stack='hstack')
+             # Unpack the read names and one-hot encodings into lists
+             read_names = []
+             ohe_list = []
+             for read_name, ohe in cat_sample_dict[f'{cat}_{sample}_read_OHE_dict'].items():
+                 read_names.append(read_name)
+                 ohe_list.append(ohe)
+             # Calculate the pairwise Hamming distances
+             if method == 'N_masked_distances':
+                 print(f'Calculating N_masked_distances for {sample} on {cat} allele')
+                 distance_matrix = calculate_pairwise_differences(ohe_list)
+             else:
+                 raise ValueError(f'{method} for calculating differences is not available')
+             n_reads = distance_matrix.shape[0]
+             # Load the distance matrix into a dataframe indexed by read names on both axes
+             distance_df = pd.DataFrame(distance_matrix, index=read_names, columns=read_names)
+             cat_sample_dict[f'Pairwise_Hamming_distance_within_{cat}_{sample}'] = distance_df
+
+             if n_reads > 1:
+                 # Calculate the minimum non-self distance for every read in the reference and sample
+                 min_distance_values = min_non_diagonal(distance_matrix)
+                 min_distance_df = pd.DataFrame({'Nearest_neighbor_Hamming_distance': min_distance_values}, index=read_names)
+                 adata.obs.update(min_distance_df)
+
+                 if cat in distance_thresholds:
+                     adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = distance_thresholds[cat]
+                 else:  # eventually this should be rewritten to use known PCR duplicate controls for thresholding
+                     # Generate a histogram of minimum non-self distances for each read
+                     if n_reads > 3:
+                         n_bins = n_reads // 4
+                     else:
+                         n_bins = 1
+                     min_distance_bins = plt.hist(min_distance_values, bins=n_bins)
+                     # Normalize the max value in any histogram bin to 1
+                     normalized_min_distance_counts = min_distance_bins[0] / np.max(min_distance_bins[0])
+                     # Extract the bin indices of peak centers in the histogram
+                     try:
+                         peak_centers, _ = find_peaks(normalized_min_distance_counts, prominence=0.2, distance=5)
+                         first_peak_index = peak_centers[0]
+                         offset_index = first_peak_index - 1
+                         # Use the distance one bin below the first peak as the threshold distance in graph construction
+                         first_peak_distance = min_distance_bins[1][first_peak_index]
+                         offset_distance = min_distance_bins[1][offset_index]
+                     except IndexError:  # no peak detected; fall back to the first normalized bin count
+                         offset_distance = normalized_min_distance_counts[0]
+                     adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = offset_distance
+             else:
+                 adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = 0
+
+     ## Detect likely duplicate reads and mark them in the adata object.
+     adata.obs['Marked_duplicate'] = pd.Series(False, index=adata.obs_names, dtype=bool)
+     adata.obs['Unique_in_final_read_set'] = pd.Series(False, index=adata.obs_names, dtype=bool)
+     adata.obs[f'Hamming_distance_cluster_within_{obs_column}_and_sample'] = pd.Series(-1, index=adata.obs_names, dtype=int)
+
+     for cat in categories:
+         for sample in sample_names:
+             distance_df = cat_sample_dict[f'Pairwise_Hamming_distance_within_{cat}_{sample}']
+             read_names = distance_df.index
+             distance_matrix = distance_df.values
+             n_reads = distance_matrix.shape[0]
+             distance_threshold = adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}']
+             # Initialize the read distance graph
+             G = nx.Graph()
+             # Add each read as a node to the graph
+             G.add_nodes_from(range(n_reads))
+             # Add edges between reads whose distance is at or below the threshold
+             for i in range(n_reads):
+                 for j in range(i + 1, n_reads):
+                     if distance_matrix[i, j] <= distance_threshold:
+                         G.add_edge(i, j)
+             # Determine distinct clusters using connected components
+             clusters = list(nx.connected_components(G))
+             clusters = [list(cluster) for cluster in clusters]
+             # Get the number of clusters
+             cluster_count = len(clusters)
+             if n_reads > 0:
+                 fraction_unique = cluster_count / n_reads
+             else:
+                 fraction_unique = 0
+             adata.uns[f'Hamming_distance_cluster_count_within_{cat}_{sample}'] = cluster_count
+             adata.uns[f'total_reads_within_{cat}_{sample}'] = n_reads
+             # Update the adata object
+             read_cluster_map = {}
+             read_duplicate_map = {}
+             read_keep_map = {}
+             for i, cluster in enumerate(clusters):
+                 for j, read_index in enumerate(cluster):
+                     read_name = read_names[read_index]
+                     read_cluster_map[read_name] = i
+                     if len(cluster) > 1:
+                         read_duplicate_map[read_name] = True
+                         if j == 0:  # keep only the first read of each multi-read cluster
+                             read_keep_map[read_name] = True
+                         else:
+                             read_keep_map[read_name] = False
+                     elif len(cluster) == 1:
+                         read_duplicate_map[read_name] = False
+                         read_keep_map[read_name] = True
+             cluster_df = pd.DataFrame.from_dict(read_cluster_map, orient='index', columns=[f'Hamming_distance_cluster_within_{obs_column}_and_sample'], dtype=int)
+             duplicate_df = pd.DataFrame.from_dict(read_duplicate_map, orient='index', columns=['Marked_duplicate'], dtype=bool)
+             keep_df = pd.DataFrame.from_dict(read_keep_map, orient='index', columns=['Unique_in_final_read_set'], dtype=bool)
+             df_combined = pd.concat([cluster_df, duplicate_df, keep_df], axis=1)
+             adata.obs.update(df_combined)
+             adata.obs['Marked_duplicate'] = adata.obs['Marked_duplicate'].astype(bool)
+             adata.obs['Unique_in_final_read_set'] = adata.obs['Unique_in_final_read_set'].astype(bool)
+             print(f'Hamming clusters for {sample} on {cat}\nThreshold: {distance_threshold}\nNumber clusters: {cluster_count}\nNumber reads: {n_reads}\nFraction unique: {fraction_unique}')
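
When no explicit threshold is supplied, the function above infers one from the first prominent peak of the nearest-neighbor distance histogram. A standalone sketch of that heuristic on synthetic data (illustrative only; np.histogram stands in for the plt.hist call used in the function, and the distributions below are made up):

import numpy as np
from scipy.signal import find_peaks

rng = np.random.default_rng(0)
# Synthetic bimodal nearest-neighbor distances: PCR duplicates sit near 0,
# genuinely unique reads sit far away
min_distance_values = np.concatenate([rng.normal(2, 0.5, 50), rng.normal(30, 3, 200)])
counts, bin_edges = np.histogram(min_distance_values, bins=40)
normalized = counts / counts.max()  # max bin scaled to 1, as in mark_duplicates
peaks, _ = find_peaks(normalized, prominence=0.2, distance=5)
threshold = bin_edges[peaks[0] - 1]  # one bin below the first peak
print(threshold)  # lands below the duplicate mode, separating the two populations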
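
The clustering step in miniature: reads whose pairwise distance falls at or below the threshold are joined by an edge, and each connected component of the resulting graph becomes one duplicate cluster, from which only the first member is kept. A self-contained sketch with a toy distance matrix:

import networkx as nx
import numpy as np

# Toy pairwise distance matrix for three reads; reads 0 and 1 are near-identical
distance_matrix = np.array([[0, 1, 9],
                            [1, 0, 9],
                            [9, 9, 0]])
threshold = 2

G = nx.Graph()
G.add_nodes_from(range(3))
for i in range(3):
    for j in range(i + 1, 3):
        if distance_matrix[i, j] <= threshold:
            G.add_edge(i, j)

clusters = [sorted(c) for c in nx.connected_components(G)]
print(clusters)  # [[0, 1], [2]] -> reads 0 and 1 form a duplicate cluster; read 2 is unique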