PyPI - spacr - Versions diffs - 0.0.36__py3-none-any.whl → 0.0.61__py3-none-any.whl - Mend

spacr 0.0.36__py3-none-any.whl → 0.0.61__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

spacr/__init__.py +2 -2
spacr/__main__.py +0 -2
spacr/alpha.py +514 -2
spacr/annotate_app.py +112 -116
spacr/core.py +864 -728
spacr/deep_spacr.py +696 -0
spacr/foldseek.py +2 -16
spacr/graph_learning.py +297 -253
spacr/gui.py +9 -8
spacr/gui_2.py +90 -0
spacr/gui_classify_app.py +3 -4
spacr/gui_mask_app.py +9 -9
spacr/gui_measure_app.py +3 -5
spacr/gui_utils.py +132 -33
spacr/io.py +308 -464
spacr/mask_app.py +109 -5
spacr/measure.py +15 -1
spacr/models/cp/toxo_pv_lumen.CP_model +0 -0
spacr/old_code.py +69 -1
spacr/plot.py +23 -6
spacr/sequencing.py +1130 -0
spacr/sim.py +0 -42
spacr/timelapse.py +0 -1
spacr/train.py +172 -13
spacr/umap.py +0 -689
spacr/utils.py +1322 -75
{spacr-0.0.36.dist-info → spacr-0.0.61.dist-info}/METADATA +14 -29
spacr-0.0.61.dist-info/RECORD +39 -0
{spacr-0.0.36.dist-info → spacr-0.0.61.dist-info}/entry_points.txt +1 -0
spacr-0.0.36.dist-info/RECORD +0 -35
{spacr-0.0.36.dist-info → spacr-0.0.61.dist-info}/LICENSE +0 -0
{spacr-0.0.36.dist-info → spacr-0.0.61.dist-info}/WHEEL +0 -0
{spacr-0.0.36.dist-info → spacr-0.0.61.dist-info}/top_level.txt +0 -0

spacr/foldseek.py CHANGED Viewed

@@ -1,26 +1,12 @@
-import os, shutil, subprocess, tarfile, glob, requests, time, random
-import pandas as pd
-from scipy.stats import fisher_exact
-from statsmodels.stats.multitest import multipletests
-from concurrent.futures import ProcessPoolExecutor, as_completed
-import seaborn as sns
-import matplotlib.pyplot as plt
+import os, shutil, subprocess, tarfile, requests
 import numpy as np
-import requests, time, random
-from concurrent.futures import ProcessPoolExecutor, as_completed
 import pandas as pd
 from scipy.stats import fisher_exact
 from statsmodels.stats.multitest import multipletests
 from concurrent.futures import ProcessPoolExecutor, as_completed
-import pandas as pd
-from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
 import seaborn as sns
 import matplotlib.pyplot as plt
-import numpy as np
-from matplotlib.ticker import FixedLocator
+from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
 def run_command(command):
     print(f"Executing: {command}")

spacr/graph_learning.py CHANGED Viewed

@@ -1,276 +1,320 @@
 import os
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from collections import defaultdict
-from torch.utils.data import Dataset, DataLoader
+os.environ['DGLBACKEND'] = 'pytorch'
+import torch, dgl
 import pandas as pd
-import numpy as np
-import torch.optim as optim
-def generate_graphs(sequencing, scores, cell_min, gene_min_read):
-    # Load and preprocess sequencing (gene) data
-    gene_df = pd.read_csv(sequencing)
-    gene_df = gene_df.rename(columns={'prc': 'well_id', 'grna': 'gene_id', 'count': 'read_count'})
-    # Filter out genes with read counts less than gene_min_read
-    gene_df = gene_df[gene_df['read_count'] >= gene_min_read]
-    total_reads_per_well = gene_df.groupby('well_id')['read_count'].sum().reset_index(name='total_reads')
-    gene_df = gene_df.merge(total_reads_per_well, on='well_id')
-    gene_df['well_read_fraction'] = gene_df['read_count'] / gene_df['total_reads']
-    # Load and preprocess cell score data
-    cell_df = pd.read_csv(scores)
-    cell_df = cell_df[['prcfo', 'prc', 'pred']].rename(columns={'prcfo': 'cell_id', 'prc': 'well_id', 'pred': 'score'})
-    # Create a global mapping of gene IDs to indices
-    unique_genes = gene_df['gene_id'].unique()
-    gene_id_to_index = {gene_id: index for index, gene_id in enumerate(unique_genes)}
-    graphs = []
-    for well_id in pd.unique(gene_df['well_id']):
-        well_genes = gene_df[gene_df['well_id'] == well_id]
-        well_cells = cell_df[cell_df['well_id'] == well_id]
-        # Skip wells with no cells or genes or with fewer cells than threshold
-        if well_cells.empty or well_genes.empty or len(well_cells) < cell_min:
-            continue
-        # Initialize gene features tensor with zeros for all unique genes
-        gene_features = torch.zeros((len(gene_id_to_index), 1), dtype=torch.float)
-        # Update gene features tensor with well_read_fraction for genes present in this well
-        for _, row in well_genes.iterrows():
-            gene_index = gene_id_to_index[row['gene_id']]
-            gene_features[gene_index] = torch.tensor([[row['well_read_fraction']]])
-        # Prepare cell features (scores)
-        cell_features = torch.tensor(well_cells['score'].values, dtype=torch.float).view(-1, 1)
-        num_genes = len(gene_id_to_index)
-        num_cells = cell_features.size(0)
-        num_nodes = num_genes + num_cells
-        # Create adjacency matrix connecting each cell to all genes in the well
-        adj = torch.zeros((num_nodes, num_nodes), dtype=torch.float)
-        for _, row in well_genes.iterrows():
-            gene_index = gene_id_to_index[row['gene_id']]
-            adj[num_genes:, gene_index] = 1
-        graph = {
-            'adjacency_matrix': adj,
-            'gene_features': gene_features,
-            'cell_features': cell_features,
-            'num_cells': num_cells,
-            'num_genes': num_genes
-        }
-        graphs.append(graph)
-    print(f'Generated dataset with {len(graphs)} graphs')
-    return graphs, gene_id_to_index
-def print_graphs_info(graphs, gene_id_to_index):
-    # Invert the gene_id_to_index mapping for easy lookup
-    index_to_gene_id = {v: k for k, v in gene_id_to_index.items()}
-    for i, graph in enumerate(graphs, start=1):
-        print(f"Graph {i}:")
-        num_genes = graph['num_genes']
-        num_cells = graph['num_cells']
-        gene_features = graph['gene_features']
-        cell_features = graph['cell_features']
-        print(f"  Number of Genes: {num_genes}")
-        print(f"  Number of Cells: {num_cells}")
-        # Identify genes present in the graph based on non-zero feature values
-        present_genes = [index_to_gene_id[idx] for idx, feature in enumerate(gene_features) if feature.item() > 0]
-        print("  Genes present in this Graph:", present_genes)
-        # Display gene features for genes present in the graph
-        print("  Gene Features:")
-        for gene_id in present_genes:
-            idx = gene_id_to_index[gene_id]
-            print(f"    {gene_id}: {gene_features[idx].item()}")
-        # Display a sample of cell features, for brevity
-        print("  Cell Features (sample):")
-        for idx, feature in enumerate(cell_features[:min(5, len(cell_features))]):
-            print(f"    Cell {idx+1}: {feature.item()}")
-        print("-" * 40)
-class Attention(nn.Module):
-    def __init__(self, feature_dim, attn_dim, dropout_rate=0.1):
-        super(Attention, self).__init__()
-        self.query = nn.Linear(feature_dim, attn_dim)
-        self.key = nn.Linear(feature_dim, attn_dim)
-        self.value = nn.Linear(feature_dim, feature_dim)
-        self.scale = 1.0 / (attn_dim ** 0.5)
-        self.dropout = nn.Dropout(dropout_rate)
-    def forward(self, gene_features, cell_features):
-        # Queries come from the cell features
-        q = self.query(cell_features)
-        # Keys and values come from the gene features
-        k = self.key(gene_features)
-        v = self.value(gene_features)
-        # Compute attention weights
-        attn_weights = torch.matmul(q, k.transpose(-2, -1)) * self.scale
-        attn_weights = F.softmax(attn_weights, dim=-1)
-        # Apply dropout to attention weights
-        attn_weights = self.dropout(attn_weights)
-        # Apply attention weights to the values
-        attn_output = torch.matmul(attn_weights, v)
-        return attn_output, attn_weights
-class GraphTransformer(nn.Module):
-    def __init__(self, gene_feature_size, cell_feature_size, hidden_dim, output_dim, attn_dim, dropout_rate=0.1):
-        super(GraphTransformer, self).__init__()
-        self.gene_transform = nn.Linear(gene_feature_size, hidden_dim)
-        self.cell_transform = nn.Linear(cell_feature_size, hidden_dim)
-        self.dropout = nn.Dropout(dropout_rate)
-        # Attention layer to let each cell attend to all genes
-        self.attention = Attention(hidden_dim, attn_dim)
-        # This layer is used to transform the combined features after attention
-        self.combine_transform = nn.Linear(2 * hidden_dim, hidden_dim)
-        # Output layer for predicting cell scores, ensuring it matches the number of cells
-        self.cell_output = nn.Linear(hidden_dim, output_dim)
+import torch.nn as nn
+from torchvision import datasets, transforms
+from sklearn.preprocessing import StandardScaler
+from PIL import Image
+import dgl.nn.pytorch as dglnn
+from sklearn.datasets import make_classification
+from .utils import SelectChannels
+# approach outline
+#
+#    1. Data Preparation:
+#        Test Mode: Load MNIST data and generate synthetic gRNA data.
+#        Real Data: Load image paths and sequencing data as fractions.
+#
+#    2. Graph Construction:
+#        Each well is represented as a graph.
+#        Each graph has cell nodes (with image features) and gRNA nodes (with gRNA fraction features).
+#        Each cell node is connected to each gRNA node within the same well.
+#
+#    3. Model Training:
+#        Use an encoder-decoder architecture with the Graph Transformer model.
+#        The encoder processes the cell and gRNA nodes.
+#        The decoder outputs the phenotype score for each cell node.
+#        The model is trained on all wells (including positive and negative controls).
+#        The model learns to score the gRNA in column 1 (negative control) as 0 and the gRNA in column 2 (positive control) as 1 based on the cell features.
+#
+#    4. Model Application:
+#        Apply the trained model to all wells to get classification probabilities.
+#
+#    5. Evaluation:
+#        Evaluate the model's performance using the control wells.
+#
+#    6. Association Analysis:
+#        Analyze the association between gRNAs and the classification scores.
+#
+# The model learns the associations between cell features and phenotype scores based on the controls and then generalizes this learning to the screening wells.
+# Load MNIST data for testing
+def load_mnist_data():
+    transform = transforms.Compose([
+        transforms.Resize((28, 28)),
+        transforms.ToTensor(),
+        transforms.Normalize((0.1307,), (0.3081,))
+    ])
+    mnist_train = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
+    mnist_test = datasets.MNIST(root='./data', train=False, download=True, transform=transform)
+    return mnist_train, mnist_test
+# Generate synthetic gRNA data
+def generate_synthetic_grna_data(n_samples, n_features):
+    X, y = make_classification(n_samples=n_samples, n_features=n_features, n_informative=5, n_redundant=0, n_classes=2, random_state=42)
+    synthetic_data = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(n_features)])
+    synthetic_data['label'] = y
+    return synthetic_data
+# Preprocess image
+def preprocess_image(image_path, image_size=224, channels=[1,2,3], normalize=True):
+    if normalize:
+        preprocess = transforms.Compose([
+            transforms.ToTensor(),
+            transforms.CenterCrop(size=(image_size, image_size)),
+            SelectChannels(channels),
+            transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))])
+    else:
+        preprocess = transforms.Compose([
+            transforms.ToTensor(),
+            transforms.CenterCrop(size=(image_size, image_size)),
+            SelectChannels(channels)])
+    image = Image.open(image_path).convert('RGB')
+    return preprocess(image)
+def extract_metadata_from_path(path):
+    """
+    Extract metadata from the image path.
+    The path format is expected to be plate_well_field_objectnumber.png
+    Parameters:
+    path (str): The path to the image file.
+    Returns:
+    dict: A dictionary with the extracted metadata.
+    """
+    filename = os.path.basename(path)
+    name, ext = os.path.splitext(filename)
+    # Ensure the file has the correct extension
+    if ext.lower() != '.png':
+        raise ValueError("Expected a .png file")
+    # Split the name by underscores
+    parts = name.split('_')
+    if len(parts) != 4:
+        raise ValueError("Expected filename format: plate_well_field_objectnumber.png")
+    plate, well, field, object_number = parts
+    return {'plate': plate, 'well': well,'field': field, 'object_number': object_number}
+# Load images
+def load_images(image_paths, image_size=224, channels=[1,2,3], normalize=True):
+    images = []
+    metadata_list = []
+    for path in image_paths:
+        image = preprocess_image(path, image_size, channels, normalize)
+        images.append(image)
+        metadata = extract_metadata_from_path(path)  # Extract metadata from image path or database
+        metadata_list.append(metadata)
+    return torch.stack(images), metadata_list
+# Normalize sequencing data
+def normalize_sequencing_data(sequencing_data):
+    scaler = StandardScaler()
+    sequencing_data.iloc[:, 2:] = scaler.fit_transform(sequencing_data.iloc[:, 2:])
+    return sequencing_data
+# Construct graph for each well
+def construct_well_graph(images, image_metadata, grna_data):
+    cell_nodes = len(images)
+    grna_nodes = grna_data.shape[0]
+    graph = dgl.DGLGraph()
+    graph.add_nodes(cell_nodes + grna_nodes)
-    def forward(self, adjacency_matrix, gene_features, cell_features):
-        # Apply initial transformation to gene and cell features
-        transformed_gene_features = F.relu(self.gene_transform(gene_features))
-        transformed_cell_features = F.relu(self.cell_transform(cell_features))
+    cell_features = torch.stack(images)
+    grna_features = torch.tensor(grna_data).float()
-        # Incorporate attention mechanism
-        attn_output, attn_weights = self.attention(transformed_gene_features, transformed_cell_features)
+    features = torch.cat([cell_features, grna_features], dim=0)
+    graph.ndata['features'] = features
-        # Combine the transformed cell features with the attention output features
-        combined_cell_features = torch.cat((transformed_cell_features, attn_output), dim=1)
-        # Apply dropout here as well
-        combined_cell_features = self.dropout(combined_cell_features)
+    for i in range(cell_nodes):
+        for j in range(cell_nodes, cell_nodes + grna_nodes):
+            graph.add_edge(i, j)
+            graph.add_edge(j, i)
+    return graph
-        combined_cell_features = F.relu(self.combine_transform(combined_cell_features))
+def create_graphs_for_wells(images, metadata_list, sequencing_data):
+    graphs = []
+    labels = []
-        # Combine gene and cell features for message passing
-        combined_features = torch.cat((transformed_gene_features, combined_cell_features), dim=0)
+    for well in sequencing_data['well'].unique():
+        well_images = [img for img, meta in zip(images, metadata_list) if meta['well'] == well]
+        well_metadata = [meta for meta in metadata_list if meta['well'] == well]
+        well_grna_data = sequencing_data[sequencing_data['well'] == well].iloc[:, 2:].values
-        # Apply message passing via adjacency matrix multiplication
-        message_passed_features = torch.matmul(adjacency_matrix, combined_features)
+        graph = construct_well_graph(well_images, well_metadata, well_grna_data)
+        graphs.append(graph)
-        # Predict cell scores from the post-message passed cell features
-        cell_scores = self.cell_output(message_passed_features[-cell_features.size(0):])
+        if well_metadata[0]['column'] == 1:  # Negative control
+            labels.append(0)
+        elif well_metadata[0]['column'] == 2:  # Positive control
+            labels.append(1)
+        else:
+            labels.append(-1)  # Screen wells, will be used for evaluation
+    return graphs, labels
+# Define Encoder-Decoder Transformer Model
+class Encoder(nn.Module):
+    def __init__(self, in_feats, hidden_feats):
+        super(Encoder, self).__init__()
+        self.conv1 = dglnn.GraphConv(in_feats, hidden_feats)
+        self.conv2 = dglnn.GraphConv(hidden_feats, hidden_feats)
+    def forward(self, g, features):
+        x = self.conv1(g, features)
+        x = torch.relu(x)
+        x = self.conv2(g, x)
+        x = torch.relu(x)
+        return x
+class Decoder(nn.Module):
+    def __init__(self, hidden_feats, out_feats):
+        super(Decoder, self).__init__()
+        self.linear = nn.Linear(hidden_feats, out_feats)
+    def forward(self, x):
+        return self.linear(x)
-        return cell_scores, attn_weights
-def train_graph_transformer(graphs, lr=0.01, dropout_rate=0.1, weight_decay=0.00001, epochs=100, save_fldr='', acc_threshold = 0.1):
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-    model = GraphTransformer(gene_feature_size=1, cell_feature_size=1, hidden_dim=256, output_dim=1, attn_dim=128, dropout_rate=dropout_rate).to(device)
+class GraphTransformer(nn.Module):
+    def __init__(self, in_feats, hidden_feats, out_feats):
+        super(GraphTransformer, self).__init__()
+        self.encoder = Encoder(in_feats, hidden_feats)
+        self.decoder = Decoder(hidden_feats, out_feats)
-    criterion = nn.MSELoss()
-    #optimizer = torch.optim.Adam(model.parameters(), lr=lr)
-    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
+    def forward(self, g, features):
+        x = self.encoder(g, features)
+        with g.local_scope():
+            g.ndata['h'] = x
+            hg = dgl.mean_nodes(g, 'h')
+        return self.decoder(hg)
-    training_log = []
-    accumulate_grad_batches=1
-    threshold=acc_threshold
+def train(graphs, labels, model, loss_fn, optimizer, epochs=100):
     for epoch in range(epochs):
         model.train()
         total_loss = 0
-        total_correct = 0
-        total_samples = 0
-        optimizer.zero_grad()
-        batch_count = 0  # Initialize batch_count
+        correct = 0
+        total = 0
-        for graph in graphs:
-            adjacency_matrix = graph['adjacency_matrix'].to(device)
-            gene_features = graph['gene_features'].to(device)
-            cell_features = graph['cell_features'].to(device)
-            num_cells = graph['num_cells']
-            predictions, attn_weights = model(adjacency_matrix, gene_features, cell_features)
-            predictions = predictions.squeeze()
-            true_scores = cell_features[:num_cells, 0]
-            loss = criterion(predictions, true_scores) / accumulate_grad_batches
+        for graph, label in zip(graphs, labels):
+            if label == -1:
+                continue  # Skip screen wells for training
+            features = graph.ndata['features']
+            logits = model(graph, features)
+            loss = loss_fn(logits, torch.tensor([label]))
+            optimizer.zero_grad()
             loss.backward()
-            # Calculate "accuracy"
-            with torch.no_grad():
-                correct_predictions = (torch.abs(predictions - true_scores) / true_scores <= threshold).sum().item()
-                total_correct += correct_predictions
-                total_samples += num_cells
-            batch_count += 1  # Increment batch_count
-            if batch_count % accumulate_grad_batches == 0 or batch_count == len(graphs):
-                optimizer.step()
-                optimizer.zero_grad()
-            total_loss += loss.item() * accumulate_grad_batches
+            optimizer.step()
+            total_loss += loss.item()
+            _, predicted = torch.max(logits, 1)
+            correct += (predicted == label).sum().item()
+            total += 1
-        accuracy = total_correct / total_samples
-        training_log.append({"Epoch": epoch+1, "Average Loss": total_loss / len(graphs), "Accuracy": accuracy})
-        print(f"Epoch {epoch+1}, Loss: {total_loss / len(graphs)}, Accuracy: {accuracy}", end="\r", flush=True)
-    # Save the training log and model as before
-    os.makedirs(save_fldr, exist_ok=True)
-    log_path = os.path.join(save_fldr, 'training_log.csv')
-    training_log_df = pd.DataFrame(training_log)
-    training_log_df.to_csv(log_path, index=False)
-    print(f"Training log saved to {log_path}")
-    model_path = os.path.join(save_fldr, 'model.pth')
-    torch.save(model.state_dict(), model_path)
-    print(f"Model saved to {model_path}")
+        accuracy = correct / total if total > 0 else 0
+        print(f'Epoch {epoch}, Loss: {total_loss / total:.4f}, Accuracy: {accuracy * 100:.2f}%')
-    return model
-def annotate_cells_with_genes(graphs, model, gene_id_to_index):
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-    model.to(device)
+def apply_model(graphs, model):
     model.eval()
-    annotated_data = []
+    results = []
     with torch.no_grad():
         for graph in graphs:
-            adjacency_matrix = graph['adjacency_matrix'].to(device)
-            gene_features = graph['gene_features'].to(device)
-            cell_features = graph['cell_features'].to(device)
-            predictions, attn_weights = model(adjacency_matrix, gene_features, cell_features)
-            predictions = np.atleast_1d(predictions.squeeze().cpu().numpy())
-            attn_weights = np.atleast_2d(attn_weights.squeeze().cpu().numpy())
-            # This approach assumes all genes in gene_id_to_index are used in the model.
-            # Create a list of gene IDs present in this specific graph.
-            present_gene_ids = [key for key, value in gene_id_to_index.items() if value < gene_features.size(0)]
-            for cell_idx in range(cell_features.size(0)):
-                true_score = cell_features[cell_idx, 0].item()
-                predicted_score = predictions[cell_idx]
-                # Find the index of the most probable gene.
-                most_probable_gene_idx = attn_weights[cell_idx].argmax()
-                if len(present_gene_ids) > most_probable_gene_idx:  # Ensure index is within the range
-                    most_probable_gene_id = present_gene_ids[most_probable_gene_idx]
-                    most_probable_gene_score = attn_weights[cell_idx, most_probable_gene_idx] if attn_weights.ndim > 1 else attn_weights[most_probable_gene_idx]
-                    annotated_data.append({
-                        "Cell ID": cell_idx,
-                        "Most Probable Gene": most_probable_gene_id,
-                        "Cell Score": true_score,
-                        "Predicted Cell Score": predicted_score,
-                        "Probability Score for Highest Gene": most_probable_gene_score
-                    })
-                else:
-                    # Handle the case where the index is out of bounds - this should not happen but is here for robustness
-                    print("Error: Gene index out of bounds. This might indicate a mismatch in the model's output.")
-    return pd.DataFrame(annotated_data)
+            features = graph.ndata['features']
+            logits = model(graph, features)
+            probabilities = torch.softmax(logits, dim=1)
+            results.append(probabilities[:, 1].item())
+    return results
+def analyze_associations(probabilities, sequencing_data):
+    # Analyze associations between gRNAs and classification scores
+    sequencing_data['positive_prob'] = probabilities
+    return sequencing_data.groupby('gRNA').positive_prob.mean().sort_values(ascending=False)
+def train_graph_transformer(src, lr=0.01, epochs=100, hidden_feats=128, n_classes=2, row_limit=None, image_size=224, channels=[1,2,3], normalize=True, test_mode=False):
+    if test_mode:
+        # Load MNIST data
+        mnist_train, mnist_test = load_mnist_data()
+        # Generate synthetic gRNA data
+        synthetic_grna_data = generate_synthetic_grna_data(len(mnist_train), 10)  # 10 synthetic features
+        sequencing_data = synthetic_grna_data
+        # Load MNIST images and metadata
+        images = []
+        metadata_list = []
+        for idx, (img, label) in enumerate(mnist_train):
+            images.append(img)
+            metadata_list.append({'index': idx, 'plate': 'plate1', 'well': idx, 'column': label})
+        images = torch.stack(images)
+        # Normalize synthetic sequencing data
+        sequencing_data = normalize_sequencing_data(sequencing_data)
+    else:
+        from .io import _read_and_join_tables
+        from .utils import get_db_paths, get_sequencing_paths, correct_paths
+        db_paths = get_db_paths(src)
+        seq_paths = get_sequencing_paths(src)
+        if isinstance(src, str):
+            src = [src]
+        sequencing_data = pd.DataFrame()
+        for seq in seq_paths:
+            sequencing_df = pd.read_csv(seq)
+            sequencing_data = pd.concat([sequencing_data, sequencing_df], axis=0)
+        all_df = pd.DataFrame()
+        for db_path in db_paths:
+            df = _read_and_join_tables(db_path, table_names=['png_list'])
+            all_df = pd.concat([all_df, df], axis=0)
+        tables = ['png_list']
+        all_df = pd.DataFrame()
+        image_paths = []
+        for i, db_path in enumerate(db_paths):
+            df = _read_and_join_tables(db_path, table_names=tables)
+            df, image_paths_tmp = correct_paths(df, src[i])
+            all_df = pd.concat([all_df, df], axis=0)
+            image_paths.extend(image_paths_tmp)
+        if row_limit is not None:
+            all_df = all_df.sample(n=row_limit, random_state=42)
+        images, metadata_list = load_images(image_paths, image_size, channels, normalize)
+        sequencing_data = normalize_sequencing_data(sequencing_data)
+    # Step 1: Create graphs for each well
+    graphs, labels = create_graphs_for_wells(images, metadata_list, sequencing_data)
+    # Step 2: Train Graph Transformer Model
+    in_feats = graphs[0].ndata['features'].shape[1]
+    model = GraphTransformer(in_feats, hidden_feats, n_classes)
+    loss_fn = nn.CrossEntropyLoss()
+    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
+    # Train the model
+    train(graphs, labels, model, loss_fn, optimizer, epochs)
+    # Step 3: Apply the model to all wells (including screen wells)
+    screen_graphs = [graph for graph, label in zip(graphs, labels) if label == -1]
+    probabilities = apply_model(screen_graphs, model)
+    # Step 4: Analyze associations between gRNAs and classification scores
+    associations = analyze_associations(probabilities, sequencing_data)
+    print("Top associated gRNAs with positive control phenotype:")
+    print(associations.head())
+    return model, associations

spacr 0.0.36__py3-none-any.whl → 0.0.61__py3-none-any.whl

spacr 0.0.36__py3-none-any.whl → 0.0.61__py3-none-any.whl