smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
- smftools/__init__.py +7 -6
- smftools/_version.py +1 -1
- smftools/cli/cli_flows.py +94 -0
- smftools/cli/hmm_adata.py +338 -0
- smftools/cli/load_adata.py +577 -0
- smftools/cli/preprocess_adata.py +363 -0
- smftools/cli/spatial_adata.py +564 -0
- smftools/cli_entry.py +435 -0
- smftools/config/__init__.py +1 -0
- smftools/config/conversion.yaml +38 -0
- smftools/config/deaminase.yaml +61 -0
- smftools/config/default.yaml +264 -0
- smftools/config/direct.yaml +41 -0
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +1288 -0
- smftools/hmm/HMM.py +1576 -0
- smftools/hmm/__init__.py +20 -0
- smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
- smftools/hmm/call_hmm_peaks.py +106 -0
- smftools/{tools → hmm}/display_hmm.py +3 -3
- smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
- smftools/{tools → hmm}/train_hmm.py +1 -1
- smftools/informatics/__init__.py +13 -9
- smftools/informatics/archived/deaminase_smf.py +132 -0
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
- smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +812 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/binarize_converted_base_identities.py +172 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/machine_learning/__init__.py +12 -0
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +234 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +31 -0
- smftools/machine_learning/evaluation/evaluators.py +223 -0
- smftools/machine_learning/inference/__init__.py +3 -0
- smftools/machine_learning/inference/inference_utils.py +27 -0
- smftools/machine_learning/inference/lightning_inference.py +68 -0
- smftools/machine_learning/inference/sklearn_inference.py +55 -0
- smftools/machine_learning/inference/sliding_window_inference.py +114 -0
- smftools/machine_learning/models/base.py +295 -0
- smftools/machine_learning/models/cnn.py +138 -0
- smftools/machine_learning/models/lightning_base.py +345 -0
- smftools/machine_learning/models/mlp.py +26 -0
- smftools/{tools → machine_learning}/models/positional.py +3 -2
- smftools/{tools → machine_learning}/models/rnn.py +2 -1
- smftools/machine_learning/models/sklearn_models.py +273 -0
- smftools/machine_learning/models/transformer.py +303 -0
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +135 -0
- smftools/machine_learning/training/train_sklearn_model.py +114 -0
- smftools/plotting/__init__.py +4 -1
- smftools/plotting/autocorrelation_plotting.py +609 -0
- smftools/plotting/general_plotting.py +1292 -140
- smftools/plotting/hmm_plotting.py +260 -0
- smftools/plotting/qc_plotting.py +270 -0
- smftools/preprocessing/__init__.py +15 -8
- smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
- smftools/preprocessing/append_base_context.py +122 -0
- smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +2 -2
- smftools/preprocessing/calculate_complexity_II.py +248 -0
- smftools/preprocessing/calculate_coverage.py +10 -1
- smftools/preprocessing/calculate_position_Youden.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +101 -0
- smftools/preprocessing/clean_NaN.py +17 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
- smftools/preprocessing/flag_duplicate_reads.py +1326 -124
- smftools/preprocessing/invert_adata.py +12 -5
- smftools/preprocessing/load_sample_sheet.py +19 -4
- smftools/readwrite.py +1021 -89
- smftools/tools/__init__.py +3 -32
- smftools/tools/calculate_umap.py +5 -5
- smftools/tools/general_tools.py +3 -3
- smftools/tools/position_stats.py +468 -106
- smftools/tools/read_stats.py +115 -1
- smftools/tools/spatial_autocorrelation.py +562 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
- smftools-0.2.3.dist-info/RECORD +173 -0
- smftools-0.2.3.dist-info/entry_points.txt +2 -0
- smftools/informatics/fast5_to_pod5.py +0 -21
- smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
- smftools/informatics/helpers/__init__.py +0 -74
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
- smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
- smftools/informatics/load_adata.py +0 -182
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/preprocessing/append_C_context.py +0 -82
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
- smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
- smftools/preprocessing/filter_reads_on_length.py +0 -51
- smftools/tools/call_hmm_peaks.py +0 -105
- smftools/tools/data/__init__.py +0 -2
- smftools/tools/data/anndata_data_module.py +0 -90
- smftools/tools/inference/__init__.py +0 -1
- smftools/tools/inference/lightning_inference.py +0 -41
- smftools/tools/models/base.py +0 -14
- smftools/tools/models/cnn.py +0 -34
- smftools/tools/models/lightning_base.py +0 -41
- smftools/tools/models/mlp.py +0 -17
- smftools/tools/models/sklearn_models.py +0 -40
- smftools/tools/models/transformer.py +0 -133
- smftools/tools/training/__init__.py +0 -1
- smftools/tools/training/train_lightning_model.py +0 -47
- smftools-0.1.7.dist-info/RECORD +0 -136
- /smftools/{tools/evaluation → cli}/__init__.py +0 -0
- /smftools/{tools → hmm}/calculate_distances.py +0 -0
- /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
- /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
- /smftools/{tools → machine_learning}/models/__init__.py +0 -0
- /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
- /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
- /smftools/{tools → machine_learning}/utils/device.py +0 -0
- /smftools/{tools → machine_learning}/utils/grl.py +0 -0
- /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
- /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
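The headline change in this release is a reorganization: the neural-network code moves from smftools.tools into a new smftools.machine_learning package, the HMM utilities move into smftools.hmm, and most of the old informatics helpers are retired under informatics/archived. For downstream code this is mostly a matter of updated import paths; a minimal sketch of the change implied by the renames above, assuming the moved modules keep their contents and no compatibility shims are added:

# smftools 0.1.7
from smftools.tools.models import positional, rnn
from smftools.tools import apply_hmm_batched, train_hmm

# smftools 0.2.3
from smftools.machine_learning.models import positional, rnn
from smftools.hmm import apply_hmm_batched, train_hmm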
smftools/machine_learning/models/transformer.py
ADDED
@@ -0,0 +1,303 @@
+import torch
+import torch.nn as nn
+from .base import BaseTorchModel
+from .positional import PositionalEncoding
+from ..utils.grl import grad_reverse
+import numpy as np
+
+class TransformerEncoderLayerWithAttn(nn.TransformerEncoderLayer):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def forward(self, src, src_mask=None, is_causal=False, src_key_padding_mask=None):
+        self_attn_output, attn_weights = self.self_attn(
+            src, src, src,
+            attn_mask=src_mask,
+            key_padding_mask=src_key_padding_mask,
+            need_weights=True,
+            average_attn_weights=False,  # preserve [B, num_heads, S, S]
+            is_causal=is_causal
+        )
+        src = src + self.dropout1(self_attn_output)
+        src = self.norm1(src)
+        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
+        src = src + self.dropout2(src2)
+        src = self.norm2(src)
+
+        # Save attention weights to module
+        self.attn_weights = attn_weights  # Save to layer
+        return src
+
+class BaseTransformer(BaseTorchModel):
+    def __init__(self,
+                 input_dim=1,
+                 model_dim=64,
+                 num_heads=4,
+                 num_layers=2,
+                 dropout=0.2,
+                 seq_len=None,
+                 use_learnable_pos=False,
+                 use_cls_token=True,
+                 **kwargs):
+        super().__init__(**kwargs)
+        # Input FC layer to map D_input to D_model
+        self.model_dim = model_dim
+        self.input_fc = nn.Linear(input_dim, model_dim)
+        self.ff_dim = model_dim * 4
+        self.dropout = dropout
+        self.use_cls_token = use_cls_token
+
+        self.attn_weights = []
+        self.attn_grads = []
+
+        if use_learnable_pos:
+            assert seq_len is not None, "Must provide seq_len if use_learnable_pos=True"
+            self.pos_embed = nn.Parameter(torch.randn(seq_len + (1 if use_cls_token else 0), model_dim))
+            self.pos_encoder = None
+        else:
+            self.pos_encoder = PositionalEncoding(model_dim)
+            self.pos_embed = None
+
+        if self.use_cls_token:
+            self.cls_token = nn.Parameter(torch.zeros(1, 1, model_dim))  # (1, 1, D)
+
+        # Specify the transformer encoder structure
+        encoder_layer = TransformerEncoderLayerWithAttn(d_model=model_dim, nhead=num_heads, batch_first=True, dim_feedforward=self.ff_dim, dropout=self.dropout)
+        # Stack the transformer encoder layers
+        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
+
+        # Register hooks
+        for layer in self.transformer.layers:
+            layer.self_attn.register_forward_hook(self._save_attn_weights)
+            layer.self_attn.register_full_backward_hook(self._save_attn_grads)
+
+    def _save_attn_weights(self, module, input, output):
+        self.attn_weights.append(output[1].detach())
+
+    def _save_attn_grads(self, module, grad_input, grad_output):
+        self.attn_grads.append(grad_output[0].detach())
+
+    def encode(self, x, mask=None):
+        if x.dim() == 2:  # (B, S)
+            x = x.unsqueeze(-1)
+        elif x.dim() == 1:  # (S,)
+            x = x.unsqueeze(0).unsqueeze(-1)
+        elif x.dim() == 3:
+            pass
+        else:
+            raise ValueError(f"Unexpected input shape: {x.shape}")
+
+        x = self.input_fc(x)  # (B, S, D)
+
+        B, S, D = x.shape
+        if self.use_cls_token:
+            cls = self.cls_token.expand(B, -1, -1)  # (B, 1, D)
+            x = torch.cat([cls, x], dim=1)  # (B, S+1, D)
+
+        if self.pos_embed is not None:
+            x = x + self.pos_embed.unsqueeze(0)[:, :x.shape[1], :]
+        elif self.pos_encoder is not None:
+            x = self.pos_encoder(x)
+
+        if mask is not None:
+            pad = torch.ones(B, 1, device=mask.device) if self.use_cls_token else 0
+            mask = torch.cat([pad, mask], dim=1) if self.use_cls_token else mask
+            x = x * mask.unsqueeze(-1)
+
+        encoded = self.transformer(x)
+        return encoded
+
+    def compute_attn_grad(self, reduction='mean'):
+        """
+        Computes attention × gradient scores across layers.
+        Returns: [B, S] tensor of importance scores
+        """
+        scores = []
+        for attn, grad in zip(self.attn_weights, self.attn_grads):
+            # attn: [B, H, S, S]
+            # grad: [B, S, D]
+            attn = attn.mean(dim=1)  # [B, S, S]
+            grad_norm = grad.norm(dim=-1)  # [B, S]
+            attn_grad_score = (attn * grad_norm.unsqueeze(1)).sum(dim=-1)  # [B, S]
+            scores.append(attn_grad_score)
+
+        # Combine across layers
+        stacked = torch.stack(scores, dim=0)  # [L, B, S]
+        if reduction == "mean":
+            return stacked.mean(dim=0)  # [B, S]
+        elif reduction == "sum":
+            return stacked.sum(dim=0)  # [B, S]
+        else:
+            return stacked  # [L, B, S]
+
+    def compute_rollout(self):
+        """
+        Computes attention rollout: [B, S, S] final attention influence map
+        """
+        device = self.attn_weights[0].device
+        B, S = self.attn_weights[0].shape[0], self.attn_weights[0].shape[-1]
+        rollout = torch.eye(S, device=device).unsqueeze(0).repeat(B, 1, 1)  # [B, S, S]
+
+        for attn in self.attn_weights:
+            attn_heads = attn.mean(dim=1)  # [B, S, S]
+            attn_heads = attn_heads + torch.eye(S, device=device).unsqueeze(0)  # add residual
+            attn_heads = attn_heads / attn_heads.sum(dim=-1, keepdim=True).clamp(min=1e-6)
+            rollout = torch.bmm(attn_heads, rollout)  # [B, S, S]
+
+        return rollout  # [B, S, S]
+
+    def reset_attn_buffers(self):
+        self.attn_weights = []
+        self.attn_grads = []
+
+    def get_attn_layer(self, layer_idx=0, head_idx=None):
+        """
+        Returns attention map from a specific layer (and optionally head).
+        """
+        attn = self.attn_weights[layer_idx]  # [B, H, S, S]
+        if head_idx is not None:
+            attn = attn[:, head_idx]  # [B, S, S]
+        return attn
+
+    def apply_attn_interpretations_to_adata(self, dataloader, adata,
+                                            obsm_key_grad="attn_grad",
+                                            obsm_key_rollout="attn_rollout",
+                                            device="cpu"):
+        self.to(device)
+        self.eval()
+        grad_maps = []
+        rollout_maps = []
+
+        for batch in dataloader:
+            x = batch[0].to(device)
+            x.requires_grad_()
+
+            self.reset_attn_buffers()
+            logits = self(x)
+
+            if logits.shape[1] == 1:
+                target_score = logits.squeeze()
+            else:
+                target_score = logits.max(dim=1).values
+
+            target_score.sum().backward()
+
+            grad = self.compute_attn_grad()  # [B, S+1]
+            if self.use_cls_token:
+                grad = grad[:, 1:]  # ignore CLS token
+            grad_maps.append(grad.detach().cpu().numpy())
+
+        grad_concat = np.concatenate(grad_maps, axis=0)
+        adata.obsm[obsm_key_grad] = grad_concat
+
+        # add per-row normalized version
+        grad_normed = grad_concat / (np.max(grad_concat, axis=1, keepdims=True) + 1e-8)
+        adata.obsm[f"{obsm_key_grad}_normalized"] = grad_normed
+
+class TransformerClassifier(BaseTransformer):
+    def __init__(self,
+                 input_dim,
+                 num_classes,
+                 **kwargs):
+        super().__init__(input_dim, **kwargs)
+        # Classification head
+        output_size = 1 if num_classes == 2 else num_classes
+        self.cls_head = nn.Linear(self.model_dim, output_size)
+
+    def forward(self, x):
+        """
+        x: (batch, seq_len, input_dim)
+        """
+        self.reset_attn_buffers()
+        if x.dim() == 2:  # shape (B, S)
+            x = x.unsqueeze(-1)  # → (B, S, 1)
+        elif x.dim() == 1:
+            x = x.unsqueeze(0).unsqueeze(-1)  # just in case (S,) → (1, S, 1)
+        else:
+            pass
+        encoded = self.encode(x)  # -> (B, S, D_model)
+        if self.use_cls_token:
+            pooled = encoded[:, 0]  # (B, D)
+        else:
+            pooled = encoded.mean(dim=1)  # (B, D) out = self.cls_head(pooled) # -> (B, C)
+
+        out = self.cls_head(pooled)  # (B, C)
+        return out
+
+class DANNTransformerClassifier(TransformerClassifier):
+    def __init__(self, input_dim, model_dim, num_classes, n_domains, **kwargs):
+        super().__init__(input_dim, model_dim, num_classes, **kwargs)
+        self.domain_classifier = nn.Sequential(
+            nn.Linear(model_dim, 128),
+            nn.ReLU(),
+            nn.Linear(128, n_domains)
+        )
+
+    def forward(self, x, alpha=1.0):
+        encoded = self.encode(x)  # (B, S, D_model)
+        pooled = encoded.mean(dim=1)  # (B, D_model)
+
+        class_logits = self.cls_head(pooled)
+        domain_logits = self.domain_classifier(grad_reverse(pooled, alpha))
+
+        return class_logits, domain_logits
+
+class MaskedTransformerPretrainer(BaseTransformer):
+    def __init__(self, input_dim, model_dim, num_heads=4, num_layers=2, **kwargs):
+        super().__init__(input_dim, model_dim, num_heads, num_layers, **kwargs)
+        self.decoder = nn.Linear(model_dim, input_dim)
+
+    def forward(self, x, mask):
+        """
+        x: (batch, seq_len, input_dim)
+        mask: (batch, seq_len) optional
+        """
+        if x.dim() == 2:
+            x = x.unsqueeze(-1)
+        encoded = self.encode(x, mask=mask)  # -> (B, S, D_model)
+        return self.decoder(encoded)  # -> (B, D_input)
+
+class DANNTransformer(BaseTransformer):
+    """
+    """
+    def __init__(self, seq_len, model_dim, n_heads, n_layers, n_domains):
+        super().__init__(
+            input_dim=1,  # 1D scalar input per token
+            model_dim=model_dim,
+            num_heads=n_heads,
+            num_layers=n_layers,
+            seq_len=seq_len,
+            use_learnable_pos=True  # enables learnable pos_embed in base
+        )
+
+        # Reconstruction head
+        self.recon_head = nn.Linear(model_dim, 1)
+
+        # Domain classification head
+        self.domain_classifier = nn.Sequential(
+            nn.Linear(model_dim, 128),
+            nn.ReLU(),
+            nn.Linear(128, n_domains)
+        )
+
+    def forward(self, x, alpha=1.0):
+        """
+        x: Tensor of shape (B, S) or (B, S, 1)
+        alpha: GRL coefficient (float)
+        """
+        if x.dim() == 2:
+            x = x.unsqueeze(-1)  # (B, S, 1)
+
+        # Encode sequence
+        h = self.encode(x)  # (B, S, D_model)
+
+        # Head 1: Reconstruction
+        recon = self.recon_head(h).squeeze(-1)  # (B, S)
+
+        # Head 2: Domain classification via GRL
+        pooled = h.mean(dim=1)  # (B, D_model)
+        rev = grad_reverse(pooled, alpha)
+        domain_logits = self.domain_classifier(rev)  # (B, n_batches)
+
+        return recon, domain_logits
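For orientation, a minimal sketch of how the new TransformerClassifier and its attention-interpretation helpers compose. The synthetic data, the DataLoader/AnnData wiring, and the assumption that BaseTorchModel can be constructed without extra keyword arguments are illustrative, not fixed by this diff:

import numpy as np
import torch
import anndata as ad
from torch.utils.data import DataLoader, TensorDataset
from smftools.machine_learning.models.transformer import TransformerClassifier

X = torch.rand(32, 100)                    # 32 reads x 100 positions, one scalar per position
adata = ad.AnnData(X.numpy())              # AnnData with matching n_obs for the obsm outputs
loader = DataLoader(TensorDataset(X), batch_size=16, shuffle=False)

model = TransformerClassifier(input_dim=1, num_classes=2, model_dim=64,
                              num_heads=4, num_layers=2,
                              seq_len=100, use_learnable_pos=True)
logits = model(X)                          # (32, 1) for the binary case

# writes adata.obsm["attn_grad"] and adata.obsm["attn_grad_normalized"]
model.apply_attn_interpretations_to_adata(loader, adata, device="cpu")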
smftools/machine_learning/training/train_lightning_model.py
ADDED
@@ -0,0 +1,135 @@
+import torch
+from pytorch_lightning import Trainer
+from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
+from ..data import AnnDataModule
+from ..models import TorchClassifierWrapper
+
+def train_lightning_model(
+    model,
+    datamodule,
+    max_epochs=30,
+    patience=5,
+    monitor_metric="val_loss",
+    checkpoint_path=None,
+    evaluate_test=True,
+    devices=1
+):
+    """
+    Takes a PyTorch Lightning Model and a Lightning DataLoader module to define a Lightning Trainer.
+    - The Lightning trainer fits the model to the training split of the datamodule.
+    - The Lightning trainer uses the validation split of the datamodule for monitoring training loss.
+    - Option of evaluating the trained model on a test set when evaluate_test is True.
+    - When using cuda, devices parameter can be: 1, [0,1], "all", "auto". Depending on what devices you want to use.
+    """
+    # Device logic
+    if torch.cuda.is_available():
+        accelerator = "gpu"
+    elif torch.backends.mps.is_available():
+        accelerator = "mps"
+        devices = 1
+    else:
+        accelerator = "cpu"
+        devices = 1
+
+    # adds the train/val/test indices from the datamodule to the model class.
+    model.set_training_indices(datamodule)
+
+    # Callbacks
+    callbacks = [
+        EarlyStopping(monitor=monitor_metric, patience=patience, mode="min"),
+    ]
+    if checkpoint_path:
+        callbacks.append(ModelCheckpoint(
+            dirpath=checkpoint_path,
+            filename="{epoch}-{val_loss:.4f}",
+            monitor=monitor_metric,
+            save_top_k=1,
+            mode="min",
+        ))
+
+    # Trainer setup
+    trainer = Trainer(
+        max_epochs=max_epochs,
+        callbacks=callbacks,
+        accelerator=accelerator,
+        devices=devices,
+        log_every_n_steps=10,
+        enable_progress_bar=False
+    )
+
+    # Fit model with trainer
+    trainer.fit(model, datamodule=datamodule)
+
+    # Test model (if applicable)
+    if evaluate_test and hasattr(datamodule, "test_dataloader"):
+        trainer.test(model, datamodule=datamodule)
+
+    # Return best checkpoint path
+    best_ckpt = None
+    for cb in callbacks:
+        if isinstance(cb, ModelCheckpoint):
+            best_ckpt = cb.best_model_path
+
+    return trainer, best_ckpt
+
+def run_sliding_window_lightning_training(
+    adata,
+    tensor_source,
+    tensor_key,
+    label_col,
+    model_class,
+    num_classes,
+    class_names,
+    class_weights,
+    focus_class,
+    window_size,
+    stride,
+    max_epochs=30,
+    patience=5,
+    enforce_eval_balance: bool=False,
+    target_eval_freq: float=0.3,
+    max_eval_positive: int=None
+):
+    input_len = adata.shape[1]
+    results = {}
+
+    for start in range(0, input_len - window_size + 1, stride):
+        center_idx = start + window_size // 2
+        center_varname = adata.var_names[center_idx]
+        print(f"\nTraining window around {center_varname}")
+
+        # Build datamodule for this window
+        datamodule = AnnDataModule(
+            adata,
+            tensor_source=tensor_source,
+            tensor_key=tensor_key,
+            label_col=label_col,
+            batch_size=64,
+            window_start=start,
+            window_size=window_size
+        )
+        datamodule.setup()
+
+        # Build model for this window
+        model = model_class(window_size, num_classes)
+        wrapper = TorchClassifierWrapper(
+            model, label_col=label_col, num_classes=num_classes,
+            class_names=class_names,
+            class_weights=class_weights,
+            focus_class=focus_class, enforce_eval_balance=enforce_eval_balance,
+            target_eval_freq=target_eval_freq, max_eval_positive=max_eval_positive
+        )
+
+        # Train model
+        trainer, ckpt = train_lightning_model(
+            wrapper, datamodule, max_epochs=max_epochs, patience=patience
+        )
+
+        results[center_varname] = {
+            "model": wrapper,
+            "trainer": trainer,
+            "checkpoint": ckpt,
+            "metrics": trainer.callback_metrics
+        }
+
+    return results
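A rough usage sketch of how these helpers compose, mirroring the calls that run_sliding_window_lightning_training itself makes. The AnnDataModule and TorchClassifierWrapper keyword values below (tensor_source="X", the label column name, the class names) are illustrative assumptions inferred from this file, not values fixed by the diff:

import numpy as np
import anndata as ad
from smftools.machine_learning.data import AnnDataModule
from smftools.machine_learning.models import TorchClassifierWrapper
from smftools.machine_learning.models.transformer import TransformerClassifier
from smftools.machine_learning.training.train_lightning_model import train_lightning_model

# synthetic AnnData: 200 reads x 100 positions with a binary label per read
adata = ad.AnnData(np.random.rand(200, 100).astype(np.float32))
adata.obs["label"] = np.random.randint(0, 2, size=200)

datamodule = AnnDataModule(adata, tensor_source="X", tensor_key=None,
                           label_col="label", batch_size=64)
datamodule.setup()

model = TransformerClassifier(input_dim=1, num_classes=2,
                              seq_len=adata.shape[1], use_learnable_pos=True)
wrapper = TorchClassifierWrapper(model, label_col="label", num_classes=2,
                                 class_names=["unmethylated", "methylated"])

trainer, best_ckpt = train_lightning_model(wrapper, datamodule,
                                           max_epochs=10, patience=3,
                                           checkpoint_path="checkpoints/")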
smftools/machine_learning/training/train_sklearn_model.py
ADDED
@@ -0,0 +1,114 @@
+from ..data import AnnDataModule
+from ..models import SklearnModelWrapper
+
+def train_sklearn_model(
+    model_wrapper,
+    datamodule,
+    evaluate_test=True,
+    evaluate_val=False
+):
+    """
+    Fits a SklearnModelWrapper on the train split from datamodule.
+    Evaluates on test and/or val set.
+
+    Parameters:
+        model_wrapper: SklearnModelWrapper instance
+        datamodule: AnnDataModule instance (with setup() method)
+        evaluate_test: whether to evaluate on test split
+        evaluate_val: whether to evaluate on validation split
+
+    Returns:
+        metrics: dictionary containing evaluation metrics
+    """
+    # Fit model
+    model_wrapper.fit_from_datamodule(datamodule)
+
+    # Evaluate
+    metrics = {}
+
+    if evaluate_val:
+        val_metrics = model_wrapper.evaluate_from_datamodule(datamodule, split="val")
+        metrics.update({f"{k}": v for k, v in val_metrics.items()})
+
+    if evaluate_test:
+        test_metrics = model_wrapper.evaluate_from_datamodule(datamodule, split="test")
+        metrics.update({f"{k}": v for k, v in test_metrics.items()})
+
+    # Plot evaluations
+    model_wrapper.plot_roc_pr_curves()
+
+    return metrics
+
+def run_sliding_window_sklearn_training(
+    adata,
+    tensor_source,
+    tensor_key,
+    label_col,
+    model_class,
+    num_classes,
+    class_names,
+    focus_class,
+    window_size,
+    stride,
+    batch_size=64,
+    train_frac=0.6,
+    val_frac=0.1,
+    test_frac=0.3,
+    random_seed=42,
+    enforce_eval_balance=False,
+    target_eval_freq=0.3,
+    max_eval_positive=None,
+    **model_kwargs
+):
+    """
+    Sliding window training for sklearn models using AnnData.
+
+    Returns dict keyed by window center.
+    """
+
+    input_len = adata.shape[1]
+    results = {}
+
+    for start in range(0, input_len - window_size + 1, stride):
+        center_idx = start + window_size // 2
+        center_varname = adata.var_names[center_idx]
+        print(f"\nTraining window around {center_varname}")
+
+        # Build datamodule for this window
+        datamodule = AnnDataModule(
+            adata,
+            tensor_source=tensor_source,
+            tensor_key=tensor_key,
+            label_col=label_col,
+            batch_size=batch_size,
+            window_start=start,
+            window_size=window_size,
+            train_frac=train_frac,
+            val_frac=val_frac,
+            test_frac=test_frac,
+            random_seed=random_seed
+        )
+        datamodule.setup()
+
+        # Build model wrapper
+        sklearn_model = model_class(**model_kwargs)
+        wrapper = SklearnModelWrapper(
+            sklearn_model,
+            num_classes=num_classes,
+            label_col=label_col,
+            class_names=class_names,
+            focus_class=focus_class,
+            enforce_eval_balance=enforce_eval_balance,
+            target_eval_freq=target_eval_freq,
+            max_eval_positive=max_eval_positive
+        )
+
+        # Fit and evaluate
+        metrics = train_sklearn_model(wrapper, datamodule, evaluate_test=True, evaluate_val=False)
+
+        results[center_varname] = {
+            "model": wrapper,
+            "metrics": metrics
+        }
+
+    return results
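A comparable sketch for the sklearn path, again inferring the AnnDataModule and SklearnModelWrapper signatures from the calls in this file; the synthetic data, the tensor_source/label_col values, and the choice of LogisticRegression are illustrative:

import numpy as np
import anndata as ad
from sklearn.linear_model import LogisticRegression
from smftools.machine_learning.training.train_sklearn_model import run_sliding_window_sklearn_training

adata = ad.AnnData(np.random.rand(200, 100).astype(np.float32))
adata.obs["label"] = np.random.randint(0, 2, size=200)

results = run_sliding_window_sklearn_training(
    adata,
    tensor_source="X",
    tensor_key=None,
    label_col="label",
    model_class=LogisticRegression,
    num_classes=2,
    class_names=["unmethylated", "methylated"],
    focus_class=1,
    window_size=20,
    stride=10,
    max_iter=500,            # forwarded to LogisticRegression via **model_kwargs
)

for center, res in results.items():
    print(center, res["metrics"])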
smftools/plotting/__init__.py
CHANGED
@@ -1,6 +1,9 @@
+from .autocorrelation_plotting import *
+from .hmm_plotting import *
 from .position_stats import plot_bar_relative_risk, plot_volcano_relative_risk, plot_positionwise_matrix, plot_positionwise_matrix_grid
-from .general_plotting import combined_hmm_raw_clustermap
+from .general_plotting import combined_hmm_raw_clustermap, combined_raw_clustermap, plot_hmm_layers_rolling_by_sample_ref
 from .classifiers import plot_model_performance, plot_feature_importances_or_saliency, plot_model_curves_from_adata, plot_model_curves_from_adata_with_frequency_grid
+from .qc_plotting import *
 
 __all__ = [
     "combined_hmm_raw_clustermap",
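With the wildcard re-exports added above, the autocorrelation, HMM, and QC plotting helpers become importable from the package root, and the two functions newly added to the general_plotting import can be pulled in directly, e.g.:

from smftools.plotting import combined_raw_clustermap, plot_hmm_layers_rolling_by_sample_ref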