PyPI - epitome-tools - Versions diffs - 0.0.1__tar.gz - Mend

epitome-tools 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

epitome_tools-0.0.1/PKG-INFO +14 -0
epitome_tools-0.0.1/epitome_tools/__init__.py +0 -0
epitome_tools-0.0.1/epitome_tools/celltyping.py +124 -0
epitome_tools-0.0.1/epitome_tools/doublets.py +116 -0
epitome_tools-0.0.1/epitome_tools/workflow.py +169 -0
epitome_tools-0.0.1/epitome_tools.egg-info/PKG-INFO +14 -0
epitome_tools-0.0.1/epitome_tools.egg-info/SOURCES.txt +10 -0
epitome_tools-0.0.1/epitome_tools.egg-info/dependency_links.txt +1 -0
epitome_tools-0.0.1/epitome_tools.egg-info/requires.txt +3 -0
epitome_tools-0.0.1/epitome_tools.egg-info/top_level.txt +1 -0
epitome_tools-0.0.1/setup.cfg +4 -0
epitome_tools-0.0.1/setup.py +30 -0

epitome_tools-0.0.1/PKG-INFO ADDED Viewed

@@ -0,0 +1,14 @@
+Metadata-Version: 2.1
+Name: epitome_tools
+Version: 0.0.1
+Summary: Auxiliary tools for the Consensus Pituitary Atlas
+Author: Bence Kover
+Author-email: <kover.bence@gmail.com>
+Keywords: xgboost,annotation,celltype,doublet
+Classifier: Development Status :: 1 - Planning
+Classifier: Intended Audience :: Science/Research
+Classifier: Programming Language :: Python :: 3
+Classifier: Operating System :: MacOS :: MacOS X
+Requires-Dist: xgboost
+Requires-Dist: scipy
+Requires-Dist: numpy

epitome_tools-0.0.1/epitome_tools/__init__.py ADDED Viewed

File without changes

epitome_tools-0.0.1/epitome_tools/celltyping.py ADDED Viewed

@@ -0,0 +1,124 @@
+import xgboost as xgb
+import joblib
+from pathlib import Path
+import scipy.sparse as sp
+import numpy as np
+def load_celltype_model(model_path,label_encoder_path):
+    """
+    Load the cell type XGBoost classifier together with its label encoder.
+
+    Parameters
+    ----------
+    model_path : path of the model file handed to ``XGBClassifier.load_model``.
+    label_encoder_path : path of the joblib file holding the label encoder.
+
+    Returns
+    -------
+    (model, label_encoder, feature_names), where ``feature_names`` is read from
+    ``model.feature_names_in_``.
+    """
+    model = xgb.XGBClassifier()
+    model.load_model(model_path)
+    # Feature names stored with the model (these should match the features used for training)
+    feature_names = model.feature_names_in_
+    # NOTE: joblib.load unpickles the file; only load encoders that come from a trusted source
+    label_encoder = joblib.load(label_encoder_path)
+    return model, label_encoder, feature_names
+def prepare_matrix_celltype(adata, feature_names,active_assay="sc"):
+    """
+    Build the dense feature matrix for the cell type model.
+
+    Columns follow the order of `feature_names`. Features found in
+    `adata.var_names` are copied from `adata.X`; features whose name starts with
+    'assay_' are indicator columns (1.0 for the active assay, 0.0 otherwise);
+    every other feature is left as NaN.
+
+    Parameters
+    ----------
+    adata : object exposing `X` (dense array or scipy sparse matrix), `n_obs`
+        and `var_names`. It is only read, never modified.
+    feature_names : feature names in the order expected by the model.
+    active_assay : one of 'sc', 'sn' or 'multi_rna'; any other value falls back to 'sc'.
+
+    Returns
+    -------
+    np.ndarray of shape (n_obs, len(feature_names)) and dtype float32.
+    """
+    expression = adata.X
+    n_obs = adata.n_obs
+    # Indicator features are recognised by their 'assay_' prefix
+    assay_features = [f for f in feature_names if f.startswith('assay_')]
+    assay_data1 = np.zeros((n_obs, len(assay_features)), dtype=np.float32)
+    assay_feature_indices1 = {name: i for i, name in enumerate(assay_features)}
+    #active assay must be one of sc, sn, multi_rna, if neither, set it as sc
+    if active_assay not in ['sc', 'sn', 'multi_rna']:
+        print(f"Warning: Active assay '{active_assay}' not recognized. Defaulting to 'sc'.")
+        active_assay = 'sc'
+    # Every key of assay_feature_indices1 starts with 'assay_', so the bare assay
+    # name can never match; look up the prefixed name instead.
+    # NOTE(review): assumes the indicator features are named 'assay_<assay>' - confirm against training
+    active_column = f"assay_{active_assay}"
+    if active_column in assay_feature_indices1:
+        assay_data1[:, assay_feature_indices1[active_column]] = 1.0
+    elif assay_features:
+        print(f"Warning: No assay feature matches active assay '{active_assay}'; assay indicators are left at 0.")
+    # Append the indicator columns to the expression matrix so that one index map covers both.
+    # hstack always allocates a new matrix, so no defensive copy of adata is needed.
+    if sp.issparse(expression):
+        # CSR gives efficient column slicing further down
+        if not isinstance(expression, sp.csr_matrix):
+            expression = expression.tocsr()
+            print("Converted adata_orig.X to CSR format.")
+        combined_X1 = sp.hstack([expression, sp.csr_matrix(assay_data1)], format='csr')
+    else:
+        combined_X1 = np.hstack([expression, assay_data1])
+    combined_feature_names1 = adata.var_names.tolist() + assay_features
+    # Column index of every feature inside the combined matrix
+    combined_feature_indices1 = {name: i for i, name in enumerate(combined_feature_names1)}
+    print(f"Combined matrix shape for model 1: {combined_X1.shape}")
+    # Start from NaN so features missing from the data stay NaN in the result
+    X_final1 = np.full((n_obs, len(feature_names)), np.nan, dtype=np.float32)
+    target_feature_indices1 = {name: i for i, name in enumerate(feature_names)}
+    available_features1 = [f for f in feature_names if f in combined_feature_indices1]
+    missing_features1 = [f for f in feature_names if f not in combined_feature_indices1]
+    if missing_features1:
+        print(f"Warning: {len(missing_features1)} features required by model 1 are missing from the data: {missing_features1[:5]}...") # Print first 5
+    print(f"Found {len(available_features1)} available features out of {len(feature_names)} required for model 1.")
+    # Where each available feature lives in the combined matrix ...
+    source_indices1 = [combined_feature_indices1[f] for f in available_features1]
+    # ... and where the model expects it in the final matrix
+    target_indices1 = [target_feature_indices1[f] for f in available_features1]
+    if sp.issparse(combined_X1):
+        # Only the selected columns are densified
+        X_final1[:, target_indices1] = combined_X1[:, source_indices1].toarray()
+        print("Filled final matrix for model 1 from sparse data.")
+    else:
+        X_final1[:, target_indices1] = combined_X1[:, source_indices1]
+        print("Filled final matrix for model 1 from dense data.")
+    return X_final1
+def perform_celltype_prediction(matrix, model, label_encoder, return_probas=True):
+    """
+    Predict a cell type label for every row of `matrix`.
+
+    The model's predicted class indices are decoded with
+    `label_encoder.inverse_transform`. When `return_probas` is true the result
+    is the pair (labels, probabilities from `model.predict_proba`); otherwise
+    only the labels are returned.
+    """
+    class_probabilities = model.predict_proba(matrix)
+    encoded_predictions = model.predict(matrix)
+    cell_type_names = label_encoder.inverse_transform(encoded_predictions)
+    if not return_probas:
+        return cell_type_names
+    return cell_type_names, class_probabilities

epitome_tools-0.0.1/epitome_tools/doublets.py ADDED Viewed

@@ -0,0 +1,116 @@
+import numpy as np
+import scipy.sparse as sp
+import joblib
+import xgboost as xgb
+from pathlib import Path
+def load_doublet_model(model_path,label_encoder_path,threshold_path):
+    """
+    Load the doublet XGBoost classifier, its label encoder and its threshold.
+
+    Parameters
+    ----------
+    model_path : path of the model file handed to ``XGBClassifier.load_model``.
+    label_encoder_path : path of the joblib file holding the label encoder.
+    threshold_path : path of the joblib file holding the decision threshold.
+
+    Returns
+    -------
+    (model, label_encoder, threshold, feature_names), where ``feature_names``
+    is read from ``model.feature_names_in_``.
+    """
+    model = xgb.XGBClassifier()
+    model.load_model(model_path)
+    # Feature names stored with the model (these should match the features used for training)
+    feature_names = model.feature_names_in_
+    # NOTE: joblib.load unpickles the files; only load artefacts that come from a trusted source
+    label_encoder = joblib.load(label_encoder_path)
+    threshold = joblib.load(threshold_path)
+    return model, label_encoder, threshold, feature_names
+def prepare_matrix_doublet(adata, feature_names):
+    """
+    Build the dense feature matrix for the doublet model.
+
+    Columns follow the order of `feature_names`. Features found in
+    `adata.var_names` are copied from `adata.X`; features whose name starts with
+    'total_' are filled with 0.0; every other feature is left as NaN.
+
+    Parameters
+    ----------
+    adata : object exposing `X` (dense array or scipy sparse matrix), `n_obs`
+        and `var_names`. It is only read, never modified.
+    feature_names : feature names in the order expected by the model.
+
+    Returns
+    -------
+    np.ndarray of shape (n_obs, len(feature_names)) and dtype float32.
+    """
+    expression = adata.X
+    n_obs = adata.n_obs
+    # Extra (non-gene) features are recognised by their 'total_' prefix
+    assay_features = [f for f in feature_names if f.startswith('total_')]
+    # NOTE(review): these columns stay all-zero; nothing here fills them from adata - confirm this is intended
+    assay_data = np.zeros((n_obs, len(assay_features)), dtype=np.float32)
+    # Append the extra columns to the expression matrix so that one index map covers both.
+    # hstack always allocates a new matrix, so no defensive copy of adata is needed.
+    if sp.issparse(expression):
+        # CSR gives efficient column slicing further down
+        if not isinstance(expression, sp.csr_matrix):
+            expression = expression.tocsr()
+            print("Converted adata_orig.X to CSR format.")
+        combined_X1 = sp.hstack([expression, sp.csr_matrix(assay_data)], format='csr')
+    else:
+        combined_X1 = np.hstack([expression, assay_data])
+    combined_feature_names = adata.var_names.tolist() + assay_features
+    # Column index of every feature inside the combined matrix
+    combined_feature_indices = {name: i for i, name in enumerate(combined_feature_names)}
+    print(f"Combined matrix shape for model 1: {combined_X1.shape}")
+    # Start from NaN so features missing from the data stay NaN in the result
+    X_final = np.full((n_obs, len(feature_names)), np.nan, dtype=np.float32)
+    target_feature_indices = {name: i for i, name in enumerate(feature_names)}
+    available_features = [f for f in feature_names if f in combined_feature_indices]
+    missing_features = [f for f in feature_names if f not in combined_feature_indices]
+    if missing_features:
+        print(f"Warning: {len(missing_features)} features required by model 2 are missing: {missing_features[:5]}...")
+    print(f"Found {len(available_features)} available features out of {len(feature_names)} required for model 2.")
+    # Where each available feature lives in the combined matrix ...
+    source_indices = [combined_feature_indices[f] for f in available_features]
+    # ... and where the model expects it in the final matrix
+    target_indices = [target_feature_indices[f] for f in available_features]
+    if sp.issparse(combined_X1):
+        # Only the selected columns are densified
+        X_final[:, target_indices] = combined_X1[:, source_indices].toarray()
+        print("Filled final matrix for model 1 from sparse data.")
+    else:
+        X_final[:, target_indices] = combined_X1[:, source_indices]
+        print("Filled final matrix for model 1 from dense data.")
+    return X_final
+def perform_doublet_prediction(matrix, model, label_encoder, threshold):
+    """
+    Run the doublet model on `matrix` and derive per-cell doublet calls.
+
+    Returns
+    -------
+    (predicted_doublet_labels, is_doublet, doublet_score)
+        predicted_doublet_labels : decoded labels from `label_encoder.inverse_transform`.
+        is_doublet : bool array, True where the label is 'doublet' and the
+            selected probability column is below `threshold`.
+        doublet_score : float array holding the selected probability for cells
+            labelled 'doublet' and 0 for all other cells.
+    """
+    n_cells = matrix.shape[0]
+    encoded_predictions = model.predict(matrix)
+    predicted_doublet_labels = label_encoder.inverse_transform(encoded_predictions)
+    class_probas = model.predict_proba(matrix)
+    print("Doublet prediction with model complete.")
+    # Use the second probability column when there is one, else the only column
+    n_classes = class_probas.shape[1]
+    if n_classes > 1:
+        doublet_probabilities = class_probas[:, 1]
+    elif n_classes == 1:
+        doublet_probabilities = class_probas[:, 0]
+    else:
+        doublet_probabilities = np.zeros(n_cells)
+        print("Warning: Model 2 probas has shape < 1")
+    # Cells the model itself labelled as 'doublet'
+    called_doublet = np.array([label == 'doublet' for label in predicted_doublet_labels], dtype=bool)
+    doublet_score = np.zeros(n_cells)
+    doublet_score[called_doublet] = doublet_probabilities[called_doublet]
+    # NOTE(review): a cell is flagged when the column-1 probability is BELOW the threshold;
+    # which class column 1 represents depends on the label encoder - confirm the direction
+    is_doublet = called_doublet & (doublet_probabilities < threshold)
+    return predicted_doublet_labels, is_doublet, doublet_score

epitome_tools-0.0.1/epitome_tools/workflow.py ADDED Viewed

@@ -0,0 +1,169 @@
+from pathlib import Path
+import joblib
+import numpy as np
+import scipy.sparse as sp
+import xgboost as xgb
+try:
+    # Package-relative imports: these resolve when the module is imported as epitome_tools.workflow
+    from .celltyping import load_celltype_model, prepare_matrix_celltype, perform_celltype_prediction
+    from .doublets import load_doublet_model, prepare_matrix_doublet, perform_doublet_prediction
+except ImportError:
+    # Fallback for running this file directly from inside the package directory
+    from celltyping import load_celltype_model, prepare_matrix_celltype, perform_celltype_prediction
+    from doublets import load_doublet_model, prepare_matrix_doublet, perform_doublet_prediction
+def check_sample_compatibility_features(adata, feature_names, return_present=True,
+                                        return_missing=True):
+    """
+    Report how many of the model's features are present in `adata.var_names`.
+
+    Parameters
+    ----------
+    adata : object exposing `var_names`.
+    feature_names : sequence (list, tuple or array) of feature names.
+    return_present : also return the array of features found in `adata`.
+    return_missing : also return the array of features not found in `adata`.
+
+    Returns
+    -------
+    `passing` (True when at least 70% of the features are present), followed by
+    the present and/or missing feature arrays when requested. With both flags
+    off, the bare bool is returned instead of a tuple.
+    """
+    # Boolean-mask indexing below needs an ndarray, so accept any sequence
+    feature_names = np.asarray(feature_names)
+    present = np.isin(feature_names, adata.var_names)
+    # An empty feature list counts as 0% present instead of dividing by zero
+    if len(feature_names):
+        present_percentage = np.sum(present) / len(feature_names) * 100
+    else:
+        present_percentage = 0.0
+    print(f"Percentage of features present in adata: {present_percentage:.2f}%")
+    extras = []
+    if return_present:
+        present_features = feature_names[present]
+        print(f"Number of features present in adata: {len(present_features)}")
+        extras.append(present_features)
+    if return_missing:
+        missing_features = feature_names[~present]
+        print(f"Number of features missing in adata: {len(missing_features)}")
+        extras.append(missing_features)
+    #pass if at least 70% of the features are present
+    passing = bool(present_percentage >= 70)
+    if passing:
+        print("At least 70% of the features are present in the adata object.")
+    else:
+        print("Warning: Less than 70% of the features are present in the adata object.")
+    if extras:
+        return (passing, *extras)
+    return passing
+def check_sample_compatibility_normalization(adata, force=False):
+    """
+    Heuristically check that `adata.X` looks normalized and log-transformed.
+
+    Only the first 10 cells are inspected. Warnings are printed when the values
+    have an integer dtype, when more than 10% of the values equal 1.0, or when
+    every inspected cell sums to between 9000 and 11000 (suggesting the data was
+    scaled but not logged). A message is also printed when the dataset holds
+    more than 50000 cells; that condition does not affect the result.
+
+    Parameters
+    ----------
+    adata : object exposing `shape` and supporting `adata[:10, :].X`
+        (dense array or scipy sparse matrix).
+    force : when True the check passes regardless of the warnings.
+
+    Returns
+    -------
+    bool : True when no warning condition was hit, or when `force` is True.
+    """
+    if adata.shape[0] > 50000:
+        print("Error: The number of cells in the dataset is greater than 50000. Are you sure this has been filtered correctly?")
+    #take info from first 10 cells
+    first_10_cells = adata[:10, :].X
+    # Densify the small slice: on a sparse matrix `.size` only counts stored values,
+    # which would inflate the fraction of 1.0 entries computed below
+    if sp.issparse(first_10_cells):
+        first_10_cells = first_10_cells.toarray()
+    else:
+        first_10_cells = np.atleast_2d(np.asarray(first_10_cells))
+    integer_type = bool(np.issubdtype(first_10_cells.dtype, np.integer))
+    if integer_type:
+        print("Warning: The dataset appears to be in integer format. Are you sure this has been normalized correctly?")
+    # Fraction of entries equal to exactly 1.0 (0 for an empty slice)
+    if first_10_cells.size:
+        one_values = np.sum(first_10_cells == 1.0) / first_10_cells.size
+    else:
+        one_values = 0.0
+    too_many_ones = bool(one_values > 0.1)
+    if too_many_ones:
+        print("Warning: The dataset has more than 10% of the values equal to 1.0. Are you sure this has been normalized correctly?")
+    #if cells sum to nearly 10k, say it hasnt been logged
+    row_sums = first_10_cells.sum(axis=1)
+    not_logged = bool(row_sums.size and np.all((row_sums > 9000) & (row_sums < 11000)))
+    if not_logged:
+        print("Warning: Have you logged the dataset? The cells sum to nearly 10k.")
+    if force:
+        print("Warning: Force is set to True. Passing the dataset compatibility check.")
+        return True
+    # Same flags as the warnings above, so a failed check is always explained
+    passing = not (integer_type or too_many_ones or not_logged)
+    if passing:
+        print("The dataset has passed the compatibility check.")
+    return passing
+def get_base_path():
+    """Return the directory that contains this module, as a `pathlib.Path`."""
+    module_file = Path(__file__)
+    return module_file.parent
+def cell_type_workflow(adata, active_assay="sc",modality="rna",in_place=True):
+    """
+    Main workflow for cell type prediction.
+
+    Loads the model and label encoder for `modality` from the `models`
+    directory next to this module, prints the compatibility checks, builds the
+    feature matrix and predicts a cell type for every cell.
+
+    Parameters
+    ----------
+    adata : dataset to annotate.
+    active_assay : assay name forwarded to `prepare_matrix_celltype`.
+    modality : 'rna' or 'atac'.
+    in_place : when True, write the predictions to `adata.obs` and return `adata`;
+        when False, return only the predicted cell types.
+
+    Raises
+    ------
+    ValueError : if `modality` is neither 'rna' nor 'atac'.
+    """
+    if modality == "rna":
+        model_file = "rna_model_full_data_fixed_params_weighted_0526.json"
+        encoder_file = "label_encoder_full_data_fixed_params_weighted_0430.pkl"
+    elif modality == "atac":
+        model_file = "atac_model_full_data_fixed_params_weighted_0526.json"
+        encoder_file = "label_encoder_atac_full_data_fixed_params_weighted_0430.pkl"
+    else:
+        raise ValueError(f"Unknown modality '{modality}'; expected 'rna' or 'atac'.")
+    # Build both paths the same way (the atac model path used to get a stray leading '/')
+    models_dir = get_base_path() / "models"
+    model_path = str(models_dir / model_file)
+    label_encoder_path = str(models_dir / encoder_file)
+    model, label_encoder, feature_names = load_celltype_model(model_path, label_encoder_path)
+    # The checks only print warnings; their results do not stop the workflow
+    check_sample_compatibility_features(adata, feature_names, return_present=False, return_missing=False)
+    check_sample_compatibility_normalization(adata, force=False)
+    # Prepare the matrix for cell type prediction
+    X_final = prepare_matrix_celltype(adata, feature_names, active_assay=active_assay)
+    # Perform cell type prediction
+    predicted_cell_types, probas = perform_celltype_prediction(X_final, model, label_encoder)
+    if not in_place:
+        return predicted_cell_types
+    adata.obs['predicted_cell_type'] = predicted_cell_types
+    adata.obs['predicted_cell_type_proba'] = probas.max(axis=1)  # Store max probability
+    return adata
+def doublet_workflow(adata,modality,in_place=True,model_dir=None):
+    """
+    Main workflow for doublet prediction.
+
+    Loads the doublet model, label encoder and threshold for `modality`,
+    prints the compatibility checks, builds the feature matrix and predicts
+    doublets.
+
+    Parameters
+    ----------
+    adata : dataset to annotate.
+    modality : 'rna' or 'atac'.
+    in_place : when True, write the predictions to `adata.obs` and return `adata`;
+        when False, return only the predicted doublet labels.
+    model_dir : directory holding the model files. Defaults to the `models`
+        directory next to this module, as used by `cell_type_workflow`.
+
+    Raises
+    ------
+    ValueError : if `modality` is neither 'rna' nor 'atac'.
+    """
+    if modality == "rna":
+        model_file = "rna_model_binary_full_data_fixed_params_weighted_0515.json"
+        encoder_file = "label_encoder_binary_full_data_fixed_params_weighted_0515.pkl"
+        threshold_file = "final_threshold_0515.pkl"
+    elif modality == "atac":
+        model_file = "atac_model_binary_full_data_fixed_params_weighted_0515.json"
+        encoder_file = "label_encoder_binary_atac_full_data_fixed_params_weighted_0515.pkl"
+        threshold_file = "final_threshold_atac_0515.pkl"
+    else:
+        raise ValueError(f"Unknown modality '{modality}'; expected 'rna' or 'atac'.")
+    # The paths used to point into one developer's home directory; resolve them
+    # relative to the package (or the caller-supplied directory) instead
+    models_dir = Path(model_dir) if model_dir is not None else get_base_path() / "models"
+    model_path = str(models_dir / model_file)
+    label_encoder_path = str(models_dir / encoder_file)
+    threshold_path = str(models_dir / threshold_file)
+    model, label_encoder, threshold, feature_names = load_doublet_model(model_path, label_encoder_path, threshold_path)
+    # The checks only print warnings; their results do not stop the workflow
+    check_sample_compatibility_features(adata, feature_names, return_present=False, return_missing=False)
+    check_sample_compatibility_normalization(adata, force=False)
+    # Prepare the matrix for doublet prediction
+    X_final = prepare_matrix_doublet(adata, feature_names)
+    # Perform doublet prediction
+    predicted_doublet_labels, is_doublet, doublet_score = perform_doublet_prediction(X_final, model, label_encoder, threshold)
+    if not in_place:
+        return predicted_doublet_labels
+    adata.obs['predicted_doublet'] = predicted_doublet_labels
+    adata.obs['is_doublet'] = is_doublet
+    adata.obs['doublet_score'] = doublet_score
+    return adata
+def celltype_doublet_workflow(adata, active_assay="sc", modality="rna", in_place=True):
+    """
+    Main workflow for cell type and doublet prediction.
+
+    Runs `cell_type_workflow` followed by `doublet_workflow` on the same dataset.
+
+    Returns
+    -------
+    With `in_place=True`, `adata` with both sets of predictions written to
+    `adata.obs`. With `in_place=False`, the pair
+    (predicted_cell_types, predicted_doublet_labels); `adata` is left untouched.
+    """
+    if in_place:
+        adata = cell_type_workflow(adata, active_assay=active_assay, modality=modality, in_place=True)
+        adata = doublet_workflow(adata, modality=modality, in_place=True)
+        return adata
+    # With in_place=False each workflow returns labels rather than adata, so both
+    # must receive the original dataset rather than the previous step's output
+    predicted_cell_types = cell_type_workflow(adata, active_assay=active_assay, modality=modality, in_place=False)
+    predicted_doublet_labels = doublet_workflow(adata, modality=modality, in_place=False)
+    return predicted_cell_types, predicted_doublet_labels

epitome_tools-0.0.1/epitome_tools.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,14 @@
+Metadata-Version: 2.1
+Name: epitome_tools
+Version: 0.0.1
+Summary: Auxiliary tools for the Consensus Pituitary Atlas
+Author: Bence Kover
+Author-email: <kover.bence@gmail.com>
+Keywords: xgboost,annotation,celltype,doublet
+Classifier: Development Status :: 1 - Planning
+Classifier: Intended Audience :: Science/Research
+Classifier: Programming Language :: Python :: 3
+Classifier: Operating System :: MacOS :: MacOS X
+Requires-Dist: xgboost
+Requires-Dist: scipy
+Requires-Dist: numpy

epitome_tools-0.0.1/epitome_tools.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,10 @@
+setup.py
+epitome_tools/__init__.py
+epitome_tools/celltyping.py
+epitome_tools/doublets.py
+epitome_tools/workflow.py
+epitome_tools.egg-info/PKG-INFO
+epitome_tools.egg-info/SOURCES.txt
+epitome_tools.egg-info/dependency_links.txt
+epitome_tools.egg-info/requires.txt
+epitome_tools.egg-info/top_level.txt

epitome_tools-0.0.1/epitome_tools.egg-info/dependency_links.txt ADDED Viewed

@@ -0,0 +1 @@
+

epitome_tools-0.0.1/epitome_tools.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,3 @@
+xgboost
+scipy
+numpy

epitome_tools-0.0.1/epitome_tools.egg-info/top_level.txt ADDED Viewed

@@ -0,0 +1 @@
+epitome_tools

epitome_tools-0.0.1/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

epitome_tools-0.0.1/setup.py ADDED Viewed

@@ -0,0 +1,30 @@
+# Packaging script for epitome_tools (setuptools).
+from setuptools import setup, find_packages
+VERSION = '0.0.1'
+DESCRIPTION = 'Auxiliary tools for the Consensus Pituitary Atlas'
+LONG_DESCRIPTION = 'Python package containing auxiliary tools for the Consensus Pituitary Atlas. The current workflow commands allow celltyping and doublet detection with a single line of code, for more information see https://github.com/BKover99/epitome_tools'
+setup(
+    name="epitome_tools",
+    version=VERSION,
+    author="Bence Kover",
+    author_email="<kover.bence@gmail.com>",
+    description=DESCRIPTION,
+    # LONG_DESCRIPTION was defined above but never handed to setup()
+    long_description=LONG_DESCRIPTION,
+    long_description_content_type="text/plain",
+    packages=find_packages(),
+    install_requires=[
+        'xgboost',
+        'scipy',
+        'numpy',
+        # imported directly by celltyping.py, doublets.py and workflow.py
+        'joblib',
+        # required by xgboost's scikit-learn wrapper (XGBClassifier)
+        'scikit-learn',
+    ],
+    keywords=['xgboost', 'annotation', 'celltype', 'doublet'],
+    classifiers=[
+        "Development Status :: 1 - Planning",
+        "Intended Audience :: Science/Research",
+        "Programming Language :: Python :: 3",
+        "Operating System :: MacOS :: MacOS X",
+    ],
+)