spacr 0.3.47__py3-none-any.whl → 0.3.50__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spacr/chat_bot.py +31 -0
- spacr/gui_elements.py +33 -7
- spacr/ml.py +453 -141
- spacr/plot.py +460 -29
- spacr/sequencing.py +5 -2
- spacr/settings.py +1 -1
- spacr/toxo.py +267 -158
- spacr/utils.py +12 -4
- {spacr-0.3.47.dist-info → spacr-0.3.50.dist-info}/METADATA +2 -1
- {spacr-0.3.47.dist-info → spacr-0.3.50.dist-info}/RECORD +14 -13
- {spacr-0.3.47.dist-info → spacr-0.3.50.dist-info}/LICENSE +0 -0
- {spacr-0.3.47.dist-info → spacr-0.3.50.dist-info}/WHEEL +0 -0
- {spacr-0.3.47.dist-info → spacr-0.3.50.dist-info}/entry_points.txt +0 -0
- {spacr-0.3.47.dist-info → spacr-0.3.50.dist-info}/top_level.txt +0 -0
spacr/ml.py
CHANGED
```diff
@@ -4,15 +4,24 @@ import numpy as np
 from scipy import stats
 from scipy.stats import shapiro
 
+from sklearn.linear_model import Lasso, Ridge, LassoCV, RidgeCV
+from sklearn.metrics import mean_squared_error
+import numpy as np
+
 import matplotlib.pyplot as plt
 from IPython.display import display
-
+import scipy.stats as st
 import statsmodels.api as sm
 import statsmodels.formula.api as smf
+from statsmodels.tools import add_constant
 from statsmodels.regression.mixed_linear_model import MixedLM
 from statsmodels.tools.sm_exceptions import PerfectSeparationError
 from statsmodels.stats.outliers_influence import variance_inflation_factor
-
+from statsmodels.genmod.families import Binomial
+from statsmodels.genmod.families.links import logit
+from statsmodels.othermod.betareg import BetaModel
+from scipy.optimize import minimize
+from scipy.special import gammaln, psi, expit
 from sklearn.linear_model import Lasso, Ridge
 from sklearn.preprocessing import FunctionTransformer
 from patsy import dmatrices
@@ -24,17 +33,30 @@ from sklearn.inspection import permutation_importance
 from sklearn.metrics import classification_report, precision_recall_curve
 from sklearn.preprocessing import StandardScaler
 from sklearn.preprocessing import MinMaxScaler
-
 from scipy.spatial.distance import cosine, euclidean, mahalanobis, cityblock, minkowski, chebyshev, hamming, jaccard, braycurtis
-
 from xgboost import XGBClassifier
 
+import numpy as np
+from scipy.stats import kstest, normaltest
+import statsmodels.api as sm
+
 import matplotlib
 matplotlib.use('Agg')
 
 import warnings
 warnings.filterwarnings("ignore", message="3D stack used, but stitch_threshold=0 and do_3D=False, so masks are made per plane only")
 
+
+class QuasiBinomial(Binomial):
+    """Custom Quasi-Binomial family with adjustable variance."""
+    def __init__(self, link=logit(), dispersion=1.0):
+        super().__init__(link=link)
+        self.dispersion = dispersion
+
+    def variance(self, mu):
+        """Adjust the variance with the dispersion parameter."""
+        return self.dispersion * super().variance(mu)
+
 def calculate_p_values(X, y, model):
     # Predict y values
     y_pred = model.predict(X)
```
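The new `QuasiBinomial` family scales the binomial variance by a dispersion factor, which lets a `statsmodels` GLM absorb over- or under-dispersion in proportion data. A minimal sketch of how it might be used (synthetic data; assumes spacr >= 0.3.50 is installed):

```python
# Hypothetical usage sketch of the new QuasiBinomial family (not part of the diff).
import numpy as np
import statsmodels.api as sm
from spacr.ml import QuasiBinomial

rng = np.random.default_rng(0)
X = sm.add_constant(rng.normal(size=(200, 2)))          # design matrix with intercept
eta = X @ np.array([0.5, 1.0, -0.8])                    # linear predictor
mu = 1.0 / (1.0 + np.exp(-eta))                         # mean on the logit scale
y = np.clip(mu + rng.normal(scale=0.05, size=200), 0.0, 1.0)  # noisy proportions in [0, 1]

# dispersion > 1 inflates the binomial variance to model overdispersion
result = sm.GLM(y, X, family=QuasiBinomial(dispersion=1.5)).fit()
print(result.summary())
```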
```diff
@@ -75,46 +97,6 @@ def perform_mixed_model(y, X, groups, alpha=1.0):
     result = model.fit()
     return result
 
-def regression_model(X, y, regression_type='ols', groups=None, alpha=1.0, cov_type=None):
-
-    def plot_regression_line(X, y, model):
-        """Helper to plot regression line for lasso and ridge models."""
-        y_pred = model.predict(X)
-        plt.scatter(X.iloc[:, 1], y, color='blue', label='Data')
-        plt.plot(X.iloc[:, 1], y_pred, color='red', label='Regression line')
-        plt.xlabel('Features')
-        plt.ylabel('Dependent Variable')
-        plt.legend()
-        plt.show()
-
-    # Define the dictionary with callables (lambdas) to delay evaluation
-    model_map = {
-        'ols': lambda: sm.OLS(y, X).fit(cov_type=cov_type) if cov_type else sm.OLS(y, X).fit(),
-        'gls': lambda: sm.GLS(y, X).fit(),
-        'wls': lambda: sm.WLS(y, X, weights=1 / np.sqrt(X.iloc[:, 1])).fit(),
-        'rlm': lambda: sm.RLM(y, X, M=sm.robust.norms.HuberT()).fit(),
-        'glm': lambda: sm.GLM(y, X, family=sm.families.Gaussian()).fit(),
-        'quantile': lambda: sm.QuantReg(y, X).fit(q=alpha),
-        'logit': lambda: sm.Logit(y, X).fit(),
-        'probit': lambda: sm.Probit(y, X).fit(),
-        'poisson': lambda: sm.Poisson(y, X).fit(),
-        'lasso': lambda: Lasso(alpha=alpha).fit(X, y),
-        'ridge': lambda: Ridge(alpha=alpha).fit(X, y)
-    }
-
-    # Call the appropriate model only when needed
-    if regression_type in model_map:
-        model = model_map[regression_type]()
-    elif regression_type == 'mixed':
-        model = perform_mixed_model(y, X, groups, alpha=alpha)
-    else:
-        raise ValueError(f"Unsupported regression type {regression_type}")
-
-    if regression_type in ['lasso', 'ridge']:
-        plot_regression_line(X, y, model)
-
-    return model
-
 def create_volcano_filename(csv_path, regression_type, alpha, dst):
     """Create and return the volcano plot filename based on regression type and alpha."""
     volcano_filename = os.path.splitext(os.path.basename(csv_path))[0] + '_volcano_plot.pdf'
@@ -173,6 +155,41 @@ def process_model_coefficients(model, regression_type, X, y, nc, pc, controls):
     coef_df['condition'] = coef_df.apply(lambda row: 'nc' if nc in row['feature'] else 'pc' if pc in row['feature'] else ('control' if row['grna'] in controls else 'other'),axis=1)
     return coef_df[~coef_df['feature'].str.contains('row|column')]
 
+
+
+
+
+def check_distribution(y):
+    """Check the type of distribution to recommend a model."""
+    if np.all((y == 0) | (y == 1)):
+        print("Detected binary data.")
+        return 'logit'
+    elif (y > 0).all() and (y < 1).all():
+        print("Detected continuous data between 0 and 1 (excluding 0 and 1).")
+        return 'beta'
+    elif (y >= 0).all() and (y <= 1).all():
+        print("Detected continuous data between 0 and 1 (including 0 or 1).")
+        # Consider quasi-binomial regression
+        return 'quasi_binomial'
+    else:
+        print("Using OLS as a fallback.")
+        return 'ols'
+
+def select_glm_family(y):
+    """Select the appropriate GLM family based on the data."""
+    if np.all((y == 0) | (y == 1)):
+        print("Using Binomial family (for binary data).")
+        return sm.families.Binomial()
+    elif (y >= 0).all() and (y <= 1).all():
+        print("Using Quasi-Binomial family (for proportion data including 0 and 1).")
+        return QuasiBinomial()
+    elif np.all(y.astype(int) == y) and (y >= 0).all():
+        print("Using Poisson family (for count data).")
+        return sm.families.Poisson()
+    else:
+        print("Using Gaussian family (for continuous data).")
+        return sm.families.Gaussian()
+
 def prepare_formula(dependent_variable, random_row_column_effects=False):
     """Return the regression formula using random effects for plate, row, and column."""
     if random_row_column_effects:
```
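These helpers pick a model (or GLM family) from the range and support of the dependent variable. Note that a second, extended `check_distribution(y, epsilon=...)` added later in this diff shadows this one at import time. A small sketch with synthetic data:

```python
# Illustrative only: exercising the model-recommendation helpers on synthetic data.
# (check_distribution resolves to the later, epsilon-aware definition in ml.py.)
import numpy as np
from spacr.ml import check_distribution, select_glm_family

y = np.random.default_rng(1).uniform(0.05, 0.95, size=500)  # proportions strictly inside (0, 1)
print(check_distribution(y))   # prints its reasoning; returns 'beta' for interior data
print(select_glm_family(y))    # returns QuasiBinomial() for data bounded in [0, 1]
```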
```diff
@@ -290,25 +307,345 @@ def check_and_clean_data(df, dependent_variable):
     print("Data is ready for model fitting.")
     return df_cleaned
 
-def
-
+def check_normality(y, variable_name):
+    """Check if the data is normally distributed using the Shapiro-Wilk test."""
+    from scipy.stats import shapiro
 
-
-
+    stat, p = shapiro(y)
+    alpha = 0.05
+    if p > alpha:
+        print(f"{variable_name} is normally distributed (fail to reject H0)")
+        return True
+    else:
+        print(f"{variable_name} is not normally distributed (reject H0)")
+        return False
+
+def minimum_cell_simulation(settings, num_repeats=10, sample_size=100, tolerance=0.02, smoothing=10, increment=10):
+    """
+    Plot the mean absolute difference with standard deviation as shaded area vs. sample size.
+    Detect and mark the elbow point (inflection) with smoothing and tolerance control.
+    """
 
-
-    is_normal = check_normality(df[dependent_variable], dependent_variable)
+    from spacr.utils import correct_metadata_column_names
 
-
-
-
+    # Load and process data
+    if isinstance(settings['score_data'], str):
+        settings['score_data'] = [settings['score_data']]
+
+    dfs = []
+    for i, score_data in enumerate(settings['score_data']):
+        df = pd.read_csv(score_data)
+        df = correct_metadata_column_names(df)
+        df['plate'] = f'plate{i + 1}'
+        df['prc'] = df['plate'] + '_' + df['row'].astype(str) + '_' + df['column'].astype(str)
+        dfs.append(df)
+
+    df = pd.concat(dfs, axis=0)
+
+    # Compute the number of cells per well and select the top 100 wells by cell count
+    cell_counts = df.groupby('prc').size().reset_index(name='cell_count')
+    top_wells = cell_counts.nlargest(sample_size, 'cell_count')['prc']
+
+    # Filter the data to include only the top 100 wells
+    df = df[df['prc'].isin(top_wells)]
+
+    # Initialize storage for absolute difference data
+    diff_data = []
+
+    # Group by wells and iterate over them
+    for i, (prc, group) in enumerate(df.groupby('prc')):
+        original_mean = group[settings['score_column']].mean()  # Original full-well mean
+        max_cells = len(group)
+        sample_sizes = np.arange(2, max_cells + 1, increment)  # Sample sizes from 2 to max cells
+
+        # Iterate over sample sizes and compute absolute difference
+        for sample_size in sample_sizes:
+            abs_diffs = []
+
+            # Perform multiple random samples to reduce noise
+            for _ in range(num_repeats):
+                sample = group.sample(n=sample_size, replace=False)
+                sampled_mean = sample[settings['score_column']].mean()
+                abs_diff = abs(sampled_mean - original_mean)  # Absolute difference
+                abs_diffs.append(abs_diff)
+
+            # Compute the average absolute difference across all repeats
+            avg_abs_diff = np.mean(abs_diffs)
+
+            # Store the result for plotting
+            diff_data.append((sample_size, avg_abs_diff))
+
+    # Convert absolute difference data to DataFrame for plotting
+    diff_df = pd.DataFrame(diff_data, columns=['sample_size', 'avg_abs_diff'])
+
+    # Group by sample size to calculate mean and standard deviation
+    summary_df = diff_df.groupby('sample_size').agg(
+        mean_abs_diff=('avg_abs_diff', 'mean'),
+        std_abs_diff=('avg_abs_diff', 'std')
+    ).reset_index()
+
+    # Apply smoothing using a rolling window
+    summary_df['smoothed_mean_abs_diff'] = summary_df['mean_abs_diff'].rolling(window=smoothing, min_periods=1).mean()
+
+    # Detect the elbow point (where mean_abs_diff < tolerance)
+    elbow_df = summary_df[summary_df['smoothed_mean_abs_diff'] <= tolerance]
+
+    # Select the first occurrence if it exists; otherwise, use the last point
+    if not elbow_df.empty:
+        elbow_point = elbow_df.iloc[0]  # First point where the condition is met
     else:
-
-
+        elbow_point = summary_df.iloc[-1]  # Fallback to the last point
+
+    # Plot the mean absolute difference with standard deviation as shaded area
+    fig, ax = plt.subplots(figsize=(10, 10))
+    ax.plot(
+        summary_df['sample_size'], summary_df['smoothed_mean_abs_diff'], color='teal', label='Smoothed Mean Absolute Difference'
+    )
+    ax.fill_between(
+        summary_df['sample_size'],
+        summary_df['smoothed_mean_abs_diff'] - summary_df['std_abs_diff'],
+        summary_df['smoothed_mean_abs_diff'] + summary_df['std_abs_diff'],
+        color='teal', alpha=0.3, label='±1 Std. Dev.'
+    )
+
+    # Mark the elbow point (inflection) on the plot
+    ax.axvline(elbow_point['sample_size'], color='black', linestyle='--', label='Elbow Point')
+
+    # Formatting the plot
+    ax.set_xlabel('Sample Size')
+    ax.set_ylabel('Mean Absolute Difference')
+    ax.set_title('Mean Absolute Difference vs. Sample Size with Standard Deviation')
+    ax.legend().remove()
+
+    # Save the plot if a destination is provided
+    dst = os.path.dirname(settings['count_data'][0])
+    if dst is not None:
+        fig_path = os.path.join(dst, 'results')
+        os.makedirs(fig_path, exist_ok=True)
+        fig_file_path = os.path.join(fig_path, 'cell_min_threshold.pdf')
+        fig.savefig(fig_file_path, format='pdf', dpi=600, bbox_inches='tight')
+        print(f"Saved {fig_file_path}")
+
+    plt.show()
+    return elbow_point['sample_size']
+
+def process_model_coefficients(model, regression_type, X, y, nc, pc, controls):
+    """Return DataFrame of model coefficients, standard errors, and p-values."""
+
+    if regression_type == 'beta':
+        # Extract coefficients and standard errors
+        coefs = model.params
+        std_err = model.bse
+
+        # Compute Wald test (coefficient / standard error)
+        wald_stats = coefs / std_err
+
+        # Calculate two-tailed p-values
+        p_values = 2 * (1 - st.norm.cdf(np.abs(wald_stats)))
+
+        coef_df = pd.DataFrame({
+            'feature': coefs.index,
+            'coefficient': coefs.values,
+            'std_err': std_err.values,
+            'wald_stat': wald_stats.values,
+            'p_value': p_values
+        })
+
+    elif regression_type in ['ols', 'glm', 'logit', 'probit', 'quasi_binomial']:
+        coefs = model.params
+        p_values = model.pvalues
+
+        coef_df = pd.DataFrame({
+            'feature': coefs.index,
+            'coefficient': coefs.values,
+            'p_value': p_values.values
+        })
+
+    elif regression_type in ['ridge', 'lasso']:
+        coefs = model.coef_.flatten()
+        p_values = calculate_p_values(X, y, model)
+
+        coef_df = pd.DataFrame({
+            'feature': X.columns,
+            'coefficient': coefs,
+            'p_value': p_values
+        })
+
+    else:
+        raise ValueError(f"Unsupported regression type: {regression_type}")
+
+    # Additional formatting
+    coef_df['-log10(p_value)'] = -np.log10(coef_df['p_value'])
+    coef_df['grna'] = coef_df['feature'].str.extract(r'\[(.*?)\]')[0]
+    coef_df['condition'] = coef_df.apply(
+        lambda row: 'nc' if nc in row['feature'] else
+                    'pc' if pc in row['feature'] else
+                    ('control' if row['grna'] in controls else 'other'),
+        axis=1
+    )
+
+    return coef_df[~coef_df['feature'].str.contains('row|column')]
+
+def check_distribution(y, epsilon=1e-6):
+    """Check the distribution of y and recommend an appropriate model."""
+
+    # Check if the dependent variable is binary (only 0 and 1)
+    if np.all((y == 0) | (y == 1)):
+        print("Detected binary data.")
+        return 'logit'
+
+    # Continuous data between 0 and 1 (excluding exact 0 and 1)
+    elif (y > 0).all() and (y < 1).all():
+        # Check if the data is close to 0 or 1 (boundary issues)
+        if np.any((y < epsilon) | (y > 1 - epsilon)):
+            print("Detected continuous data near 0 or 1. Using quasi-binomial.")
+            return 'quasi_binomial'
+        else:
+            print("Detected continuous data between 0 and 1 (no boundary issues). Using beta regression.")
+            return 'beta'
+
+    # Continuous data between 0 and 1 (including exact 0 or 1)
+    elif (y >= 0).all() and (y <= 1).all():
+        print("Detected continuous data with boundary values (0 or 1). Using quasi-binomial.")
+        return 'quasi_binomial'
+
+    # Check if the data is normally distributed for OLS suitability
+    stat, p_value = stats.normaltest(y)  # D'Agostino and Pearson's test for normality
+    print(f"Normality test p-value: {p_value:.4f}")
+
+    if p_value > 0.05:
+        print("Detected normally distributed data. Using OLS.")
+        return 'ols'
+
+    # Check if the data fits a Beta distribution
+    if stats.kstest(y, 'beta', args=(2, 2)).pvalue > 0.05:
+        # Check if the data is close to 0 or 1 (boundary issues)
+        if np.any((y < epsilon) | (y > 1 - epsilon)):
+            print("Detected continuous data near 0 or 1. Using quasi-binomial.")
+            return 'quasi_binomial'
+        else:
+            print("Detected continuous data between 0 and 1 (no boundary issues). Using beta regression.")
+            return 'beta'
+
+    print("Detected non-normally distributed data. Using GLM.")
+    return 'glm'
+
+def pick_glm_family_and_link(y):
+    """Select the appropriate GLM family and link function based on data."""
+    if np.all((y == 0) | (y == 1)):
+        print("Binary data detected. Using Binomial family with Logit link.")
+        return sm.families.Binomial(link=sm.families.links.Logit())
+
+    elif (y > 0).all() and (y < 1).all():
+        print("Data strictly between 0 and 1. Beta regression recommended.")
+        raise ValueError("Use BetaModel for this data; GLM is not applicable.")
+
+    elif (y >= 0).all() and (y <= 1).all():
+        print("Data between 0 and 1 (including boundaries). Using Quasi-Binomial.")
+        return sm.families.Binomial(link=sm.families.links.Logit())
+
+    stat, p_value = normaltest(y)
+    print(f"Normality test p-value: {p_value:.4f}")
+    if p_value > 0.05:
+        print("Normally distributed data detected. Using Gaussian with Identity link.")
+        return sm.families.Gaussian(link=sm.families.links.Identity())
+
+    if (y >= 0).all() and np.all(y.astype(int) == y):
+        print("Count data detected. Using Poisson with Log link.")
+        return sm.families.Poisson(link=sm.families.links.Log())
+
+    if (y > 0).all() and kstest(y, 'invgauss', args=(1,)).pvalue > 0.05:
+        print("Inverse Gaussian distribution detected. Using InverseGaussian with Log link.")
+        return sm.families.InverseGaussian(link=sm.families.links.Log())
+
+    if (y >= 0).all():
+        print("Overdispersed count data detected. Using Negative Binomial with Log link.")
+        return sm.families.NegativeBinomial(link=sm.families.links.Log())
+
+    print("Using default Gaussian family with Identity link.")
+    return sm.families.Gaussian(link=sm.families.links.Identity())
+
+def regression_model(X, y, regression_type='ols', groups=None, alpha=1.0, cov_type=None):
+    def plot_regression_line(X, y, model):
+        """Helper to plot regression line for lasso and ridge models."""
+        y_pred = model.predict(X)
+        plt.scatter(X.iloc[:, 1], y, color='blue', label='Data')
+        plt.plot(X.iloc[:, 1], y_pred, color='red', label='Regression line')
+        plt.xlabel('Features')
+        plt.ylabel('Dependent Variable')
+        plt.legend()
+        plt.show()
+
+    def find_best_alpha(model_cls):
+        """Find optimal alpha using cross-validation."""
+        alphas = np.logspace(-5, 5, 100)  # Search over a range of alphas
+        if model_cls == 'lasso':
+            model_cv = LassoCV(alphas=alphas, cv=5).fit(X, y)
+        elif model_cls == 'ridge':
+            model_cv = RidgeCV(alphas=alphas, cv=5).fit(X, y)
+        print(f"Optimal alpha for {model_cls}: {model_cv.alpha_}")
+        return model_cv
+
+    # Dictionary of models
+    model_map = {
+        'ols': lambda: sm.OLS(y, X).fit(cov_type=cov_type) if cov_type else sm.OLS(y, X).fit(),
+        'glm': lambda: sm.GLM(y, X, family=pick_glm_family_and_link(y)).fit(),
+        'beta': lambda: BetaModel(endog=y, exog=X).fit(),
+        'logit': lambda: sm.Logit(y, X).fit(),
+        'probit': lambda: sm.Probit(y, X).fit(),
+        'lasso': lambda: find_best_alpha('lasso') if alpha in [0, None] else Lasso(alpha=alpha).fit(X, y),
+        'ridge': lambda: find_best_alpha('ridge') if alpha in [0, None] else Ridge(alpha=alpha).fit(X, y)
+    }
+
+    # Select the model based on regression_type
+    if regression_type in model_map:
+        model = model_map[regression_type]()
+    elif regression_type == 'mixed':
+        model = perform_mixed_model(y, X, groups, alpha=alpha)
+    else:
+        raise ValueError(f"Unsupported regression type {regression_type}")
+
+    # Plot regression line for Lasso and Ridge
+    if regression_type in ['lasso', 'ridge']:
+        plot_regression_line(X, y, model)
+
+    # Handle GLM-specific statistics
+    if regression_type == 'glm':
+        llf_model = model.llf  # Log-likelihood of the fitted model
+        llf_null = model.null_deviance / -2  # Log-likelihood of the null model
+        mcfadden_r2 = 1 - (llf_model / llf_null)
+        print(f"McFadden's R²: {mcfadden_r2:.4f}")
+        print(model.summary())
+
+    if regression_type in ['lasso', 'ridge']:
+        # Calculate the Mean Squared Error (MSE)
+        mse = mean_squared_error(y, model.predict(X))
+        print(f"{regression_type.capitalize()} Regression MSE: {mse:.4f}")
+
+        # Display coefficients
+        coef_df = pd.DataFrame({
+            'Feature': X.columns,
+            'Coefficient': model.coef_
+        })
+        print(coef_df)
+
+    return model
+
+def regression(df, csv_path, dependent_variable='predictions', regression_type=None, alpha=1.0,
+               random_row_column_effects=False, nc='233460', pc='220950', controls=[''],
+               dst=None, cov_type=None, plot=False):
+
+    from spacr.plot import volcano_plot, plot_histogram
+    from spacr.ml import create_volcano_filename, check_and_clean_data, prepare_formula, scale_variables
+
+    # Generate the volcano filename
+    volcano_path = create_volcano_filename(csv_path, regression_type, alpha, dst)
 
     # Determine regression type if not specified
     if regression_type is None:
-        regression_type =
+        regression_type = check_distribution(df[dependent_variable])
+
+    print(f"Using regression type: {regression_type}")
 
     df = check_and_clean_data(df, dependent_variable)
 
```
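`minimum_cell_simulation` repeatedly subsamples each well, tracks how far the subsample mean drifts from the full-well mean, and reports the elbow where the smoothed drift falls below `tolerance`. A sketch of a call (file paths and the score column below are hypothetical):

```python
# Hypothetical call; the 'score_data'/'count_data' paths and 'score_column' are placeholders.
from spacr.ml import minimum_cell_simulation

settings = {
    'score_data': ['/data/screen/plate1_scores.csv'],   # per-cell scores with row/column metadata
    'count_data': ['/data/screen/plate1_counts.csv'],   # only used to derive the output folder
    'score_column': 'prediction_probability_class_1',
}
min_cells = minimum_cell_simulation(settings, num_repeats=10, tolerance=0.02)
print(f"Estimated minimum cells per well: {min_cells}")
```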
```diff
@@ -319,102 +656,51 @@ def regression(df, csv_path, dependent_variable='predictions', regression_type=N
         mixed_model, coef_df = fit_mixed_model(df, formula, dst)
         model = mixed_model
     else:
-        #
+        # Prepare the formula
         formula = prepare_formula(dependent_variable, random_row_column_effects=False)
         y, X = dmatrices(formula, data=df, return_type='dataframe')
-
+
         # Plot histogram of the dependent variable
         plot_histogram(y, dependent_variable, dst=dst)
+        plot_histogram(df, 'fraction', dst=dst)
 
         # Scale the independent variables and dependent variable
-
+        if regression_type in ['beta', 'quasi_binomial', 'logit']:
+            print('Data will not be scaled')
+        else:
+            X, y = scale_variables(X, y)
 
     # Perform the regression
     groups = df['prc'] if regression_type == 'mixed' else None
-    print(f'
-
+    print(f'Performing {regression_type} regression')
+
     model = regression_model(X, y, regression_type=regression_type, groups=groups, alpha=alpha, cov_type=cov_type)
 
     # Process the model coefficients
     coef_df = process_model_coefficients(model, regression_type, X, y, nc, pc, controls)
-
+    display(coef_df)
     if plot:
         volcano_plot(coef_df, volcano_path)
+
+    return model, coef_df, regression_type
 
-
-
-
-
-
-
-def _line_plot(df, x, y, log_x=False, log_y=False, title=""):
-    fig, ax = plt.subplots(figsize=(10, 6))
-    ax.plot(df[x], df[y], linestyle='-', color=(0, 0.6, 0.6), label=f"{y}")
-    ax.set_xlabel(x)
-    ax.set_ylabel(y)
-    ax.set_title(title)
-    ax.legend()
-    if log_x:
-        ax.set_xscale('log')
-    if log_y:
-        ax.set_yscale('log')
-    plt.show()
-
-    if isinstance(settings['score_data'], str):
-        settings['score_data'] = [settings['score_data']]
-
-    dfs = []
-    for i, score_data in enumerate(settings['score_data']):
-        df = pd.read_csv(score_data)
-        df = correct_metadata_column_names(df)
-        df['plate'] = f'plate{i+1}'
-        df['prc'] = df['plate'] + '_' + df['row'].astype(str) + '_' + df['column'].astype(str)
-        dfs.append(df)
-
-    df = pd.concat(dfs, axis=0)
-
-    # Compute the number of cells (or scores) per well
-    cell_counts = df.groupby('prc').size().reset_index(name='cell_count')
-
-    # Merge the cell counts back into the original DataFrame
-    df = df.merge(cell_counts, on='prc')
-
-    # Generate a range of thresholds
-    thresholds = np.arange(1, df['cell_count'].max() + 1)
-    results = []
-
-    # Iterate over thresholds and compute score mean and variance
-    for threshold in thresholds:
-        filtered_df = df[df['cell_count'] >= threshold]
-        score_mean = filtered_df.groupby('prc')[settings['score_column']].mean().mean()
-        score_variance = filtered_df.groupby('prc')[settings['score_column']].mean().var()
-        results.append((threshold, score_mean, score_variance))
-
-    results_df = pd.DataFrame(results, columns=['cell_count_threshold', 'score_mean', 'score_variance'])
-
-    if results_df.empty:
-        raise ValueError("No valid results were found. Check your data and thresholds.")
-
-    closest_threshold = results_df['score_variance'].diff().abs().argmin()
-    optimal_threshold = results_df.iloc[closest_threshold]
-
-    print(f"Optimal Threshold: {optimal_threshold['cell_count_threshold']}")
-    print(f"Score Mean at Optimal Threshold: {optimal_threshold['score_mean']}")
-    print(f"Score Variance at Optimal Threshold: {optimal_threshold['score_variance']}")
-
-    _line_plot(results_df, x='cell_count_threshold', y='score_mean',
-               title='Mean Well Score vs. Cell Count Threshold')
-    _line_plot(results_df, x='cell_count_threshold', y='score_variance',
-               title='Score Variance vs. Cell Count Threshold')
+def save_summary_to_file(model, file_path='summary.csv'):
+    """
+    Save the model's summary output to a CSV or text file.
+    """
+    # Get the summary as a string
+    summary_str = model.summary().as_text()
 
-
+    # Save it as a plain text file or CSV
+    with open(file_path, 'w') as f:
+        f.write(summary_str)
 
 def perform_regression(settings):
 
     from .plot import plot_plates
     from .utils import merge_regression_res_with_metadata, save_settings
     from .settings import get_perform_regression_default_settings
-    from .toxo import go_term_enrichment_by_column, custom_volcano_plot
+    from .toxo import go_term_enrichment_by_column, custom_volcano_plot, plot_gene_phenotypes, plot_gene_heatmaps
    from .sequencing import graph_sequencing_stats
 
     def _perform_regression_read_data(settings):
```
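The rewritten `regression_model` adds `beta` and family-aware `glm` options, and when `alpha` is `0` or `None` the lasso/ridge paths now select `alpha` by five-fold cross-validation via `LassoCV`/`RidgeCV`. A sketch of the cross-validated path on synthetic data:

```python
# Synthetic-data sketch of the new cross-validated alpha path (alpha=None).
import numpy as np
import pandas as pd
from spacr.ml import regression_model

rng = np.random.default_rng(2)
X = pd.DataFrame(rng.normal(size=(150, 3)), columns=['f0', 'f1', 'f2'])
y = 2.0 * X['f0'] - X['f1'] + rng.normal(scale=0.1, size=150)

# Prints the optimal alpha, the regression MSE, and the coefficient table.
model = regression_model(X, y, regression_type='lasso', alpha=None)
```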
```diff
@@ -460,7 +746,7 @@ def perform_regression(settings):
     if not settings['class_1_threshold'] is None:
         score_data_df['predictions'] = (score_data_df['prediction_probability_class_1'] >= settings['class_1_threshold']).astype(int)
 
-    reg_types = ['ols','gls','wls','rlm','glm','mixed','quantile','logit','probit','poisson','lasso','ridge']
+    reg_types = ['ols','gls','wls','rlm','glm','mixed','quantile','logit','probit','poisson','lasso','ridge', None]
     if settings['regression_type'] not in reg_types:
         print(f'Possible regression types: {reg_types}')
         raise ValueError(f"Unsupported regression type {settings['regression_type']}")
@@ -468,7 +754,7 @@ def perform_regression(settings):
     return count_data_df, score_data_df
 
 def _perform_regression_set_paths(settings):
-
+
     if isinstance(settings['score_data'], list):
         score_data = settings['score_data'][0]
     else:
@@ -484,7 +770,11 @@ def perform_regression(settings):
     csv_path = settings['count_data']
 
     settings['src'] = src
-
+
+    if settings['regression_type'] is None:
+        res_folder = os.path.join(src, 'results', score_source, 'auto')
+    else:
+        res_folder = os.path.join(src, 'results', score_source, settings['regression_type'])
 
     if isinstance(settings['count_data'], list):
         res_folder = os.path.join(res_folder, 'list')
@@ -536,7 +826,10 @@ def perform_regression(settings):
     print(f"Dependent variable after clean_controls: {len(score_data_df)}")
 
     if settings['min_cell_count'] is None:
-        settings['min_cell_count'] =
+        settings['min_cell_count'] = minimum_cell_simulation(settings, tolerance=settings['tolerance'])
+        print(f"Minimum cell count: {settings['min_cell_count']}")
+
+    orig_dv = settings['dependent_variable']
 
     dependent_df, dependent_variable = process_scores(score_data_df, settings['dependent_variable'], settings['plate'], settings['min_cell_count'], settings['agg_type'], settings['transform'])
     print(f"Dependent variable after process_scores: {len(dependent_df)}")
@@ -551,15 +844,16 @@ def perform_regression(settings):
 
     merged_df = pd.merge(independent_df, dependent_df, on='prc')
 
+    os.makedirs(res_folder, exist_ok=True)
     data_path = os.path.join(res_folder, 'regression_data.csv')
     merged_df.to_csv(data_path, index=False)
+    print(f"Saved regression data to {data_path}")
 
     merged_df[['plate', 'row', 'column']] = merged_df['prc'].str.split('_', expand=True)
 
-
-    _ = plot_plates(score_data_df, variable=dependent_variable, grouping='mean', min_max='allq', cmap='viridis', min_count=settings['min_cell_count'], dst = res_folder)
+    _ = plot_plates(merged_df, variable=orig_dv, grouping='mean', min_max='allq', cmap='viridis', min_count=None, dst=res_folder)
 
-    model, coef_df = regression(merged_df, csv_path, dependent_variable, settings['regression_type'], settings['alpha'], settings['random_row_column_effects'], nc=settings['negative_control'], pc=settings['positive_control'], controls=settings['controls'], dst=res_folder, cov_type=settings['cov_type'])
+    model, coef_df, regression_type = regression(merged_df, csv_path, dependent_variable, settings['regression_type'], settings['alpha'], settings['random_row_column_effects'], nc=settings['negative_control'], pc=settings['positive_control'], controls=settings['controls'], dst=res_folder, cov_type=settings['cov_type'])
 
     coef_df['grna'] = coef_df['feature'].apply(lambda x: re.search(r'grna\[(.*?)\]', x).group(1) if 'grna' in x else None)
     coef_df['gene'] = coef_df['feature'].apply(lambda x: re.search(r'gene\[(.*?)\]', x).group(1) if 'gene' in x else None)
@@ -592,7 +886,7 @@ def perform_regression(settings):
     gene_coef_df.to_csv(results_path_gene, index=False)
     grna_coef_df.to_csv(results_path_grna, index=False)
 
-    if
+    if regression_type == 'lasso':
         significant = coef_df[coef_df['coefficient'] > 0]
 
     else:
```
```diff
@@ -604,8 +898,9 @@ def perform_regression(settings):
     significant.sort_values(by='coefficient', ascending=False, inplace=True)
     significant = significant[~significant['feature'].str.contains('row|column')]
 
-    if
+    if regression_type in ['ols', 'beta']:
         print(model.summary())
+        save_summary_to_file(model, file_path=f'{res_folder}/mode_summary.csv')
 
     significant.to_csv(hits_path, index=False)
 
```
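`save_summary_to_file` simply writes `model.summary().as_text()` to disk, so it works with any fitted statsmodels result. A minimal sketch:

```python
# Sketch: persisting a fitted statsmodels result with the new helper
# (the OLS fit and output path here are illustrative, not from the diff).
import numpy as np
import statsmodels.api as sm
from spacr.ml import save_summary_to_file

rng = np.random.default_rng(3)
X = sm.add_constant(rng.normal(size=(50, 1)))
y = X @ np.array([1.0, 2.0]) + rng.normal(scale=0.1, size=50)

result = sm.OLS(y, X).fit()
save_summary_to_file(result, file_path='model_summary.txt')  # writes the fixed-width summary table
```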
```diff
@@ -633,15 +928,32 @@ def perform_regression(settings):
     base_dir = os.path.dirname(os.path.abspath(__file__))
     metadata_path = os.path.join(base_dir, 'resources', 'data', 'lopit.csv')
 
+    display(data_path)
+
     if settings['volcano'] == 'all':
         print('all')
-        custom_volcano_plot(data_path, metadata_path, metadata_column='tagm_location', point_size=600, figsize=20, threshold=reg_threshold,
+        gene_list = custom_volcano_plot(data_path, metadata_path, metadata_column='tagm_location', point_size=600, figsize=20, threshold=reg_threshold, save_path=volcano_path, x_lim=settings['x_lim'],y_lims=settings['y_lims'])
+        display(gene_list)
     elif settings['volcano'] == 'gene':
         print('gene')
-        custom_volcano_plot(data_path_gene, metadata_path, metadata_column='tagm_location', point_size=600, figsize=20, threshold=reg_threshold,
+        gene_list = custom_volcano_plot(data_path_gene, metadata_path, metadata_column='tagm_location', point_size=600, figsize=20, threshold=reg_threshold, save_path=volcano_path, x_lim=settings['x_lim'],y_lims=settings['y_lims'])
+        display(gene_list)
     elif settings['volcano'] == 'grna':
         print('grna')
-        custom_volcano_plot(data_path_grna, metadata_path, metadata_column='tagm_location', point_size=600, figsize=20, threshold=reg_threshold,
+        gene_list = custom_volcano_plot(data_path_grna, metadata_path, metadata_column='tagm_location', point_size=600, figsize=20, threshold=reg_threshold, save_path=volcano_path, x_lim=settings['x_lim'],y_lims=settings['y_lims'])
+        display(gene_list)
+    phenotype_plot = os.path.join(res_folder,'phenotype_plot.pdf')
+    transcription_heatmap = os.path.join(res_folder,'transcription_heatmap.pdf')
+    data_GT1 = pd.read_csv(settings['metadata_files'][1], low_memory=False)
+    data_ME49 = pd.read_csv(settings['metadata_files'][0], low_memory=False)
+
+    columns = ['sense - Tachyzoites', 'sense - Tissue cysts', 'sense - EES1', 'sense - EES2', 'sense - EES3', 'sense - EES4', 'sense - EES5']
+
+    print('Plotting gene phenotypes and heatmaps')
+    print(gene_list)
+
+    plot_gene_phenotypes(data=data_GT1, gene_list=gene_list, save_path=phenotype_plot)
+    plot_gene_heatmaps(data=data_ME49, gene_list=gene_list, columns=columns, x_column='Gene ID', normalize=True, save_path=transcription_heatmap)
 
     #if len(significant) > 2:
     #    metadata_path = os.path.join(base_dir, 'resources', 'data', 'toxoplasma_metadata.csv')
```