epitome-tools 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
1
+ Metadata-Version: 2.1
2
+ Name: epitome_tools
3
+ Version: 0.0.1
4
+ Summary: Auxiliary tools for the Consensus Pituitary Atlas
5
+ Author: Bence Kover
6
+ Author-email: <kover.bence@gmail.com>
7
+ Keywords: xgboost,annotation,celltype,doublet
8
+ Classifier: Development Status :: 1 - Planning
9
+ Classifier: Intended Audience :: Science/Research
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Operating System :: MacOS :: MacOS X
12
+ Requires-Dist: xgboost
13
+ Requires-Dist: scipy
14
+ Requires-Dist: numpy
File without changes
@@ -0,0 +1,124 @@
1
+ import xgboost as xgb
2
+ import joblib
3
+ from pathlib import Path
4
+ import scipy.sparse as sp
5
+ import numpy as np
6
+
7
+
8
def load_celltype_model(model_path, label_encoder_path):
    """
    Load the cell type classifier together with its label encoder.

    Parameters
    ----------
    model_path : str or path-like
        Saved XGBoost model, read with ``XGBClassifier.load_model``.
    label_encoder_path : str or path-like
        Label encoder file, read with ``joblib.load``.

    Returns
    -------
    tuple
        ``(model, label_encoder, feature_names)`` where ``feature_names`` is
        ``model.feature_names_in_``, i.e. the feature names stored with the
        model.
    """
    model = xgb.XGBClassifier()
    model.load_model(model_path)

    # The prediction matrix is built column-by-column from these names, so
    # they are returned alongside the model.
    feature_names = model.feature_names_in_

    # NOTE: joblib.load unpickles the file, which can execute arbitrary code.
    # Only pass label encoder files that come from a trusted source.
    label_encoder = joblib.load(label_encoder_path)

    return model, label_encoder, feature_names
24
+
25
+
26
+
27
+
28
def prepare_matrix_celltype(adata, feature_names, active_assay="sc"):
    """
    Build the dense feature matrix expected by the cell type model.

    Columns follow the order of ``feature_names``:

    * features starting with ``assay_`` are synthesised as a one-hot block:
      the column named ``assay_<active_assay>`` is 1.0, the others 0.0;
    * every other feature is copied from the column of ``adata.X`` whose
      ``adata.var_names`` entry matches;
    * features found in neither place stay NaN.

    Parameters
    ----------
    adata : AnnData-like
        Object exposing ``n_obs``, ``X`` (dense or scipy sparse) and
        ``var_names``. It is only read, never modified.
    feature_names : sequence of str
        Feature names in the order the model expects.
    active_assay : str, default "sc"
        One of ``"sc"``, ``"sn"``, ``"multi_rna"``. Any other value triggers
        a warning and falls back to ``"sc"``.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(adata.n_obs, len(feature_names))``.
    """
    assay_prefix = "assay_"

    # active assay must be one of sc, sn, multi_rna; otherwise fall back to sc
    if active_assay not in ("sc", "sn", "multi_rna"):
        print(f"Warning: Active assay '{active_assay}' not recognized. Defaulting to 'sc'.")
        active_assay = "sc"

    # Every assay feature starts with "assay_", so the bare assay name can
    # never equal a feature name; the lookup has to use the prefixed form.
    # NOTE(review): assumes the model's one-hot columns are named
    # "assay_sc" / "assay_sn" / "assay_multi_rna" - confirm against training.
    active_column = f"{assay_prefix}{active_assay}"

    feature_names = list(feature_names)
    n_obs = adata.n_obs

    # Sparse input is converted to CSR before its columns are indexed.
    expression = adata.X
    if sp.issparse(expression) and not isinstance(expression, sp.csr_matrix):
        expression = expression.tocsr()
        print("Converted adata_orig.X to CSR format.")

    # Column position of every gene present in the data.
    gene_positions = {name: i for i, name in enumerate(adata.var_names)}

    # Sort each model feature into: synthesised assay column, gene column
    # available in the data, or missing.
    assay_targets = []
    gene_targets = []
    gene_sources = []
    missing_features = []
    for target, name in enumerate(feature_names):
        if name.startswith(assay_prefix):
            assay_targets.append(target)
        elif name in gene_positions:
            gene_targets.append(target)
            gene_sources.append(gene_positions[name])
        else:
            missing_features.append(name)

    if missing_features:
        print(f"Warning: {len(missing_features)} features required by model 1 are missing from the data: {missing_features[:5]}...")  # Print first 5

    n_available = len(assay_targets) + len(gene_targets)
    print(f"Found {n_available} available features out of {len(feature_names)} required for model 1.")

    # Start from NaN so missing features remain NaN in the result.
    X_final = np.full((n_obs, len(feature_names)), np.nan, dtype=np.float32)

    if assay_targets:
        X_final[:, assay_targets] = 0.0
        active_targets = [t for t in assay_targets if feature_names[t] == active_column]
        if active_targets:
            X_final[:, active_targets] = 1.0
        else:
            print(f"Warning: The model has no '{active_column}' feature; all assay features are set to 0.")

    if gene_targets:
        selected = expression[:, gene_sources]
        if sp.issparse(selected):
            # Only the selected columns are densified.
            X_final[:, gene_targets] = selected.toarray()
            print("Filled final matrix for model 1 from sparse data.")
        else:
            X_final[:, gene_targets] = np.asarray(selected)
            print("Filled final matrix for model 1 from dense data.")

    return X_final
111
+
112
+
113
def perform_celltype_prediction(matrix, model, label_encoder, return_probas=True):
    """
    Predict cell types for every row of ``matrix``.

    Parameters
    ----------
    matrix : array-like
        Feature matrix passed unchanged to ``model.predict`` and, when
        requested, to ``model.predict_proba``.
    model : object
        Classifier exposing ``predict`` and ``predict_proba``.
    label_encoder : object
        Encoder whose ``inverse_transform`` maps the model's predicted labels
        back to cell type names.
    return_probas : bool, default True
        Whether to also return the class probabilities.

    Returns
    -------
    predicted_cell_types
        Output of ``label_encoder.inverse_transform``; returned alone when
        ``return_probas`` is False.
    (predicted_cell_types, probas)
        Returned when ``return_probas`` is True; ``probas`` is the output of
        ``model.predict_proba``.
    """
    predicted_labels = model.predict(matrix)
    predicted_cell_types = label_encoder.inverse_transform(predicted_labels)

    if not return_probas:
        # Skip the second inference pass when nobody asked for probabilities.
        return predicted_cell_types

    return predicted_cell_types, model.predict_proba(matrix)
@@ -0,0 +1,116 @@
1
+ import numpy as np
2
+ import scipy.sparse as sp
3
+ import joblib
4
+ import xgboost as xgb
5
+ from pathlib import Path
6
+
7
+
8
def load_doublet_model(model_path, label_encoder_path, threshold_path):
    """
    Load the doublet classifier, its label encoder and its decision threshold.

    Parameters
    ----------
    model_path : str or path-like
        Saved XGBoost model, read with ``XGBClassifier.load_model``.
    label_encoder_path : str or path-like
        Label encoder file, read with ``joblib.load``.
    threshold_path : str or path-like
        Threshold file, read with ``joblib.load``.

    Returns
    -------
    tuple
        ``(model, label_encoder, threshold, feature_names)`` where
        ``feature_names`` is ``model.feature_names_in_``, i.e. the feature
        names stored with the model.
    """
    model = xgb.XGBClassifier()
    model.load_model(model_path)

    # The prediction matrix is built column-by-column from these names, so
    # they are returned alongside the model.
    feature_names = model.feature_names_in_

    # NOTE: joblib.load unpickles the files, which can execute arbitrary code.
    # Only pass encoder/threshold files that come from a trusted source.
    label_encoder = joblib.load(label_encoder_path)
    threshold = joblib.load(threshold_path)

    return model, label_encoder, threshold, feature_names
26
+
27
+
28
def prepare_matrix_doublet(adata, feature_names):
    """
    Build the dense feature matrix expected by the doublet model (model 2).

    Columns follow the order of ``feature_names``:

    * features starting with ``total_`` are filled with 0.0;
    * every other feature is copied from the column of ``adata.X`` whose
      ``adata.var_names`` entry matches;
    * features found in neither place stay NaN.

    Parameters
    ----------
    adata : AnnData-like
        Object exposing ``n_obs``, ``X`` (dense or scipy sparse) and
        ``var_names``. It is only read, never modified.
    feature_names : sequence of str
        Feature names in the order the model expects.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(adata.n_obs, len(feature_names))``.
    """
    total_prefix = "total_"

    feature_names = list(feature_names)
    n_obs = adata.n_obs

    # Sparse input is converted to CSR before its columns are indexed.
    expression = adata.X
    if sp.issparse(expression) and not isinstance(expression, sp.csr_matrix):
        expression = expression.tocsr()
        print("Converted adata_orig.X to CSR format.")

    # Column position of every gene present in the data.
    gene_positions = {name: i for i, name in enumerate(adata.var_names)}

    # Sort each model feature into: zero-filled "total_" column, gene column
    # available in the data, or missing.
    total_targets = []
    gene_targets = []
    gene_sources = []
    missing_features = []
    for target, name in enumerate(feature_names):
        if name.startswith(total_prefix):
            total_targets.append(target)
        elif name in gene_positions:
            gene_targets.append(target)
            gene_sources.append(gene_positions[name])
        else:
            missing_features.append(name)

    if missing_features:
        print(f"Warning: {len(missing_features)} features required by model 2 are missing: {missing_features[:5]}...")

    n_available = len(total_targets) + len(gene_targets)
    print(f"Found {n_available} available features out of {len(feature_names)} required for model 2.")

    # Start from NaN so missing features remain NaN in the result.
    X_final = np.full((n_obs, len(feature_names)), np.nan, dtype=np.float32)

    if total_targets:
        # NOTE(review): these columns are always zero, exactly as before.
        # Presumably they were meant to carry per-cell totals - confirm which
        # values the model was trained with.
        X_final[:, total_targets] = 0.0

    if gene_targets:
        selected = expression[:, gene_sources]
        if sp.issparse(selected):
            # Only the selected columns are densified.
            X_final[:, gene_targets] = selected.toarray()
            print("Filled final matrix for model 2 from sparse data.")
        else:
            X_final[:, gene_targets] = np.asarray(selected)
            print("Filled final matrix for model 2 from dense data.")

    return X_final
90
+
91
def perform_doublet_prediction(matrix, model, label_encoder, threshold):
    """
    Predict doublet labels and derive a per-cell doublet flag and score.

    For each row the model's predicted label is decoded with
    ``label_encoder``. Rows decoded as ``'doublet'`` get their score set to
    the selected probability column and are flagged when that probability is
    below ``threshold``; all other rows keep score 0 and flag False.

    NOTE(review): the probability used is column 1 of ``predict_proba``
    (column 0 when there is a single column), and the flag uses ``<``.
    Which class column 1 represents, and therefore whether ``<`` is the
    intended direction, is not visible here - confirm against how the
    threshold was derived.

    Returns
    -------
    tuple
        ``(predicted_doublet_labels, is_doublet, doublet_score)``.
    """
    cell_count = matrix.shape[0]

    encoded_labels = model.predict(matrix)
    predicted_doublet_labels = label_encoder.inverse_transform(encoded_labels)
    class_probas = model.predict_proba(matrix)  # Get probabilities
    print("Doublet prediction with model complete.")

    # Pick the probability column used for scoring.
    n_columns = class_probas.shape[1]
    if n_columns > 1:
        doublet_probabilities = class_probas[:, 1]
    elif n_columns == 1:
        doublet_probabilities = class_probas[:, 0]
    else:
        doublet_probabilities = np.zeros(cell_count)
        print("Warning: Model 2 probas has shape < 1")

    # Defaults: not a doublet, score 0.
    is_doublet = np.zeros(cell_count, dtype=bool)
    doublet_score = np.zeros(cell_count)

    for idx in range(cell_count):
        # Only rows the model itself labels as doublet are scored.
        if predicted_doublet_labels[idx] != 'doublet':
            continue
        probability = doublet_probabilities[idx]
        doublet_score[idx] = probability
        is_doublet[idx] = bool(probability < threshold)

    return predicted_doublet_labels, is_doublet, doublet_score
@@ -0,0 +1,169 @@
1
+ import numpy as np
2
+ import scipy.sparse as sp
3
+ import joblib
4
+ from pathlib import Path
5
+ import xgboost as xgb
6
+ from celltyping import load_celltype_model, prepare_matrix_celltype, perform_celltype_prediction
7
+ from doublets import load_doublet_model, prepare_matrix_doublet, perform_doublet_prediction
8
+
9
+
10
def check_sample_compatibility_features(adata, feature_names, return_present=True,
                                        return_missing=True):
    """
    Report how many of the model's features are present in ``adata.var_names``.

    The check passes when at least 70% of ``feature_names`` are found. An
    empty ``feature_names`` counts as 0% and therefore fails.

    Parameters
    ----------
    adata : AnnData-like
        Object exposing ``var_names``.
    feature_names : array-like of str
        Feature names required by the model; lists and arrays are accepted.
    return_present : bool, default True
        Also return the features that were found.
    return_missing : bool, default True
        Also return the features that were not found.

    Returns
    -------
    bool or tuple
        ``passing`` alone when both flags are False; otherwise a tuple that
        starts with ``passing`` followed by the present features (if
        requested) and the missing features (if requested), in that order.
    """
    # Boolean-mask indexing below needs an array, so lists are coerced here.
    feature_names = np.asarray(feature_names)
    n_features = len(feature_names)

    # check percentage of features that are present in the adata object
    if n_features:
        present = np.isin(feature_names, adata.var_names)
        present_percentage = np.sum(present) / n_features * 100
    else:
        # Nothing to look up; avoid dividing by zero.
        present = np.zeros(0, dtype=bool)
        present_percentage = 0.0
    print(f"Percentage of features present in adata: {present_percentage:.2f}%")

    results = []
    if return_present:
        present_features = feature_names[present]
        print(f"Number of features present in adata: {len(present_features)}")
        results.append(present_features)
    if return_missing:
        missing_features = feature_names[~present]
        print(f"Number of features missing in adata: {len(missing_features)}")
        results.append(missing_features)

    # pass if at least 70% of the features are present
    if present_percentage < 70:
        print("Warning: Less than 70% of the features are present in the adata object.")
        passing = False
    else:
        print("At least 70% of the features are present in the adata object.")
        passing = True

    if not results:
        return passing
    return (passing, *results)
47
+
48
+
49
+
50
+
51
def check_sample_compatibility_normalization(adata, force=False):
    """
    Heuristically check that ``adata.X`` looks normalized and log-transformed.

    Only the first 10 cells are inspected. Warnings are printed when:

    * the dataset has more than 50000 cells (reported only, never raised);
    * the matrix has an integer dtype;
    * more than 10% of the non-zero values in the sample equal 1.0;
    * every sampled cell sums to between 9000 and 11000, which suggests the
      data was scaled to 10k but not logged.

    The share of 1.0 values is measured against the non-zero entries for both
    sparse and dense input, so the two storage formats are judged the same
    way.

    Parameters
    ----------
    adata : AnnData-like
        Object exposing ``shape`` and row slicing that yields an object with
        an ``X`` matrix (dense or scipy sparse).
    force : bool, default False
        When True the check always passes; the warnings are still printed.

    Returns
    -------
    bool
        True when the dataset passes (or ``force`` is set), else False.
    """
    # check number of cells, if more than 50000, report it
    if adata.shape[0] > 50000:
        print("Error: The number of cells in the dataset is greater than 50000. Are you sure this has been filtered correctly?")

    # take info from first 10 cells
    sample = adata[:10, :].X
    integer_type = np.issubdtype(sample.dtype, np.integer)
    # At most 10 rows, so densifying is cheap and lets the checks below treat
    # sparse and dense input identically.
    sample = sample.toarray() if sp.issparse(sample) else np.asarray(sample)
    has_cells = sample.shape[0] > 0

    if integer_type:
        print("Warning: The dataset appears to be in integer format. Are you sure this has been normalized correctly?")

    # share of the non-zero values that are exactly 1.0
    nonzero_count = np.count_nonzero(sample)
    if nonzero_count:
        one_values = np.count_nonzero(sample == 1.0) / nonzero_count
    else:
        one_values = 0.0
    if one_values > 0.1:
        print("Warning: The dataset has more than 10% of the values equal to 1.0. Are you sure this has been normalized correctly?")

    # if cells sum to nearly 10k, say it hasnt been logged
    not_logged = False
    if has_cells:
        row_sums = sample.sum(axis=1)
        if np.all((row_sums > 9000) & (row_sums < 11000)):
            print("Warning: Have you logged the dataset? The cells sum to nearly 10k.")
            not_logged = True
    else:
        print("Warning: The dataset contains no cells.")

    # if force is True, return True
    if force:
        print("Warning: Force is set to True. Passing the dataset compatibility check.")
        return True

    # A share of exactly 10% does not warn above, so it also counts as passing.
    if has_cells and not integer_type and one_values <= 0.1 and not not_logged:
        print("The dataset has passed the compatibility check.")
        return True

    return False
85
+
86
+
87
def get_base_path():
    """Return the absolute path of the directory that contains this module."""
    # resolve() makes the result absolute even if __file__ is relative.
    return Path(__file__).resolve().parent
91
+
92
+
93
def cell_type_workflow(adata, active_assay="sc", modality="rna", in_place=True):
    """
    Main workflow for cell type prediction.

    Loads the model and label encoder for ``modality`` from the ``models``
    directory under ``get_base_path()``, prints the compatibility checks,
    builds the feature matrix and predicts a cell type per cell.

    Parameters
    ----------
    adata : AnnData-like
        Dataset to annotate.
    active_assay : str, default "sc"
        Assay passed on to ``prepare_matrix_celltype``.
    modality : str, default "rna"
        Either ``"rna"`` or ``"atac"``; selects which model files are loaded.
    in_place : bool, default True
        When True, write ``predicted_cell_type`` and
        ``predicted_cell_type_proba`` (the highest class probability per
        cell) to ``adata.obs`` and return ``adata``. When False, return only
        the predicted cell types.

    Raises
    ------
    ValueError
        If ``modality`` is neither ``"rna"`` nor ``"atac"``.
    """
    model_files = {
        "rna": (
            "rna_model_full_data_fixed_params_weighted_0526.json",
            "label_encoder_full_data_fixed_params_weighted_0430.pkl",
        ),
        "atac": (
            "atac_model_full_data_fixed_params_weighted_0526.json",
            "label_encoder_atac_full_data_fixed_params_weighted_0430.pkl",
        ),
    }
    if modality not in model_files:
        raise ValueError(f"Unknown modality '{modality}'. Expected 'rna' or 'atac'.")

    # Build both paths the same way; the ATAC model path used to carry a
    # stray leading "/".
    models_dir = Path(get_base_path()) / "models"
    model_name, encoder_name = model_files[modality]
    model_path = str(models_dir / model_name)
    label_encoder_path = str(models_dir / encoder_name)

    model, label_encoder, feature_names = load_celltype_model(model_path, label_encoder_path)

    # checks (informational only: their results do not stop the workflow)
    check_sample_compatibility_features(adata, feature_names, return_present=False, return_missing=False)
    check_sample_compatibility_normalization(adata, force=False)

    # Prepare the matrix for cell type prediction
    X_final = prepare_matrix_celltype(adata, feature_names, active_assay=active_assay)
    # Perform cell type prediction
    predicted_cell_types, probas = perform_celltype_prediction(X_final, model, label_encoder)

    if not in_place:
        return predicted_cell_types

    # Add predictions to adata
    adata.obs['predicted_cell_type'] = predicted_cell_types
    adata.obs['predicted_cell_type_proba'] = probas.max(axis=1)  # Store max probability
    return adata
125
+
126
+
127
+
128
+
129
+
130
def doublet_workflow(adata, modality="rna", in_place=True):
    """
    Main workflow for doublet prediction.

    Loads the doublet model, label encoder and threshold for ``modality``
    from the ``models`` directory under ``get_base_path()``, prints the
    compatibility checks, builds the feature matrix and predicts doublets.

    Parameters
    ----------
    adata : AnnData-like
        Dataset to annotate.
    modality : str, default "rna"
        Either ``"rna"`` or ``"atac"``; selects which model files are loaded.
    in_place : bool, default True
        When True, write ``predicted_doublet``, ``is_doublet`` and
        ``doublet_score`` to ``adata.obs`` and return ``adata``. When False,
        return only the predicted doublet labels.

    Raises
    ------
    ValueError
        If ``modality`` is neither ``"rna"`` nor ``"atac"``.
    """
    model_files = {
        "rna": (
            "rna_model_binary_full_data_fixed_params_weighted_0515.json",
            "label_encoder_binary_full_data_fixed_params_weighted_0515.pkl",
            "final_threshold_0515.pkl",
        ),
        "atac": (
            "atac_model_binary_full_data_fixed_params_weighted_0515.json",
            "label_encoder_binary_atac_full_data_fixed_params_weighted_0515.pkl",
            "final_threshold_atac_0515.pkl",
        ),
    }
    if modality not in model_files:
        raise ValueError(f"Unknown modality '{modality}'. Expected 'rna' or 'atac'.")

    # These files used to be read from an absolute path on the author's
    # machine; they are now looked up in the same models directory that
    # cell_type_workflow uses.
    # NOTE(review): confirm the doublet model files are shipped in
    # <package>/models under these names.
    models_dir = Path(get_base_path()) / "models"
    model_name, encoder_name, threshold_name = model_files[modality]
    model_path = str(models_dir / model_name)
    label_encoder_path = str(models_dir / encoder_name)
    threshold_path = str(models_dir / threshold_name)

    model, label_encoder, threshold, feature_names = load_doublet_model(model_path, label_encoder_path, threshold_path)

    # checks (informational only: their results do not stop the workflow)
    check_sample_compatibility_features(adata, feature_names, return_present=False, return_missing=False)
    check_sample_compatibility_normalization(adata, force=False)

    # Prepare the matrix for doublet prediction
    X_final = prepare_matrix_doublet(adata, feature_names)
    # Perform doublet prediction
    predicted_doublet_labels, is_doublet, doublet_score = perform_doublet_prediction(X_final, model, label_encoder, threshold)

    if not in_place:
        return predicted_doublet_labels

    # Add predictions to adata
    adata.obs['predicted_doublet'] = predicted_doublet_labels
    adata.obs['is_doublet'] = is_doublet
    adata.obs['doublet_score'] = doublet_score
    return adata
159
+
160
+
161
+
162
+
163
def celltype_doublet_workflow(adata, active_assay="sc", modality="rna", in_place=True):
    """
    Main workflow for cell type and doublet prediction.

    Runs ``cell_type_workflow`` followed by ``doublet_workflow`` on the same
    dataset.

    Returns
    -------
    adata
        When ``in_place`` is True: the dataset after both workflows have
        written their columns to ``adata.obs``.
    (predicted_cell_types, predicted_doublet_labels)
        When ``in_place`` is False: the outputs of the two workflows.
    """
    if in_place:
        adata = cell_type_workflow(adata, active_assay=active_assay, modality=modality, in_place=True)
        return doublet_workflow(adata, modality=modality, in_place=True)

    # With in_place=False the first workflow returns labels, not the dataset,
    # so the original adata must be passed to the second workflow as well.
    predicted_cell_types = cell_type_workflow(adata, active_assay=active_assay, modality=modality, in_place=False)
    predicted_doublet_labels = doublet_workflow(adata, modality=modality, in_place=False)
    return predicted_cell_types, predicted_doublet_labels
@@ -0,0 +1,14 @@
1
+ Metadata-Version: 2.1
2
+ Name: epitome_tools
3
+ Version: 0.0.1
4
+ Summary: Auxiliary tools for the Consensus Pituitary Atlas
5
+ Author: Bence Kover
6
+ Author-email: <kover.bence@gmail.com>
7
+ Keywords: xgboost,annotation,celltype,doublet
8
+ Classifier: Development Status :: 1 - Planning
9
+ Classifier: Intended Audience :: Science/Research
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Operating System :: MacOS :: MacOS X
12
+ Requires-Dist: xgboost
13
+ Requires-Dist: scipy
14
+ Requires-Dist: numpy
@@ -0,0 +1,10 @@
1
+ setup.py
2
+ epitome_tools/__init__.py
3
+ epitome_tools/celltyping.py
4
+ epitome_tools/doublets.py
5
+ epitome_tools/workflow.py
6
+ epitome_tools.egg-info/PKG-INFO
7
+ epitome_tools.egg-info/SOURCES.txt
8
+ epitome_tools.egg-info/dependency_links.txt
9
+ epitome_tools.egg-info/requires.txt
10
+ epitome_tools.egg-info/top_level.txt
@@ -0,0 +1,3 @@
1
+ xgboost
2
+ scipy
3
+ numpy
@@ -0,0 +1 @@
1
+ epitome_tools
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,30 @@
1
+ #this is a setup.py file
2
+ from setuptools import setup, find_packages
3
+
4
+
5
VERSION = '0.0.1'
DESCRIPTION = 'Auxiliary tools for the Consensus Pituitary Atlas'
LONG_DESCRIPTION = 'Python package containing auxiliary tools for the Consensus Pituitary Atlas. The current workflow commands allow celltyping and doublet detection with a single line of code, for more information see https://github.com/BKover99/epitome_tools'


setup(
    name="epitome_tools",
    version=VERSION,
    author="Bence Kover",
    author_email="<kover.bence@gmail.com>",
    description=DESCRIPTION,
    # LONG_DESCRIPTION was defined but never handed to setup().
    long_description=LONG_DESCRIPTION,
    long_description_content_type="text/plain",
    packages=find_packages(),
    # The workflows load their model files from <package>/models at runtime,
    # so anything placed in that directory must ship with the package.
    package_data={"epitome_tools": ["models/*"]},
    install_requires=[
        'xgboost',
        'scipy',
        'numpy',
        # joblib is imported by celltyping.py, doublets.py and workflow.py.
        'joblib',
    ],
    keywords=['xgboost', 'annotation', 'celltype', 'doublet'],
    classifiers=[
        "Development Status :: 1 - Planning",
        "Intended Audience :: Science/Research",
        "Programming Language :: Python :: 3",
        "Operating System :: MacOS :: MacOS X",
    ],
)
30
+