tcrsift 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tcrsift/__init__.py +221 -0
- tcrsift/annotate.py +482 -0
- tcrsift/assemble.py +758 -0
- tcrsift/cli.py +1146 -0
- tcrsift/clonotype.py +409 -0
- tcrsift/config.py +472 -0
- tcrsift/data.py +218 -0
- tcrsift/filter.py +504 -0
- tcrsift/gex.py +373 -0
- tcrsift/loader.py +676 -0
- tcrsift/mnemonic.py +259 -0
- tcrsift/model.py +89 -0
- tcrsift/phenotype.py +353 -0
- tcrsift/plots.py +905 -0
- tcrsift/qc.py +527 -0
- tcrsift/sample_sheet.py +399 -0
- tcrsift/sct.py +342 -0
- tcrsift/til.py +236 -0
- tcrsift/unify.py +445 -0
- tcrsift/validation.py +543 -0
- tcrsift/version.py +24 -0
- tcrsift-0.2.0.dist-info/METADATA +786 -0
- tcrsift-0.2.0.dist-info/RECORD +27 -0
- tcrsift-0.2.0.dist-info/WHEEL +5 -0
- tcrsift-0.2.0.dist-info/entry_points.txt +2 -0
- tcrsift-0.2.0.dist-info/licenses/LICENSE +201 -0
- tcrsift-0.2.0.dist-info/top_level.txt +1 -0
tcrsift/__init__.py
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
2
|
+
# you may not use this file except in compliance with the License.
|
|
3
|
+
# You may obtain a copy of the License at
|
|
4
|
+
#
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
#
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
|
|
13
|
+
"""
|
|
14
|
+
TCRsift: TCR selection from antigen-specific culture and scRNA/VDJ sequencing data.
|
|
15
|
+
|
|
16
|
+
A tool for identifying antigen-specific T cell receptor clones from single-cell
|
|
17
|
+
sequencing data, with support for:
|
|
18
|
+
|
|
19
|
+
- Loading CellRanger VDJ and GEX outputs
|
|
20
|
+
- CD4/CD8 T cell phenotyping from gene expression
|
|
21
|
+
- Clonotype aggregation and frequency analysis
|
|
22
|
+
- Tiered filtering for antigen-specific clones
|
|
23
|
+
- Annotation with public TCR databases (VDJdb, IEDB, CEDAR)
|
|
24
|
+
- TIL (tumor-infiltrating lymphocyte) matching
|
|
25
|
+
- Full-length TCR sequence assembly
|
|
26
|
+
|
|
27
|
+
Example usage::
|
|
28
|
+
|
|
29
|
+
# Run complete pipeline
|
|
30
|
+
tcrsift run --sample-sheet samples.yaml --output-dir results/ --report
|
|
31
|
+
|
|
32
|
+
# Or run individual steps
|
|
33
|
+
tcrsift load --sample-sheet samples.yaml -o loaded.h5ad
|
|
34
|
+
tcrsift phenotype -i loaded.h5ad -o phenotyped.h5ad
|
|
35
|
+
tcrsift clonotype -i phenotyped.h5ad -o clonotypes.csv
|
|
36
|
+
tcrsift filter -i clonotypes.csv -o filtered/
|
|
37
|
+
tcrsift annotate -i filtered/tier4.csv -o annotated.csv --vdjdb /path/to/vdjdb
|
|
38
|
+
tcrsift assemble -i annotated.csv -o full_sequences.csv --include-constant
|
|
39
|
+
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
from .annotate import (
|
|
43
|
+
annotate_clonotypes,
|
|
44
|
+
get_annotation_summary,
|
|
45
|
+
load_cedar,
|
|
46
|
+
load_iedb,
|
|
47
|
+
load_vdjdb,
|
|
48
|
+
)
|
|
49
|
+
from .assemble import (
|
|
50
|
+
DEFAULT_LEADERS,
|
|
51
|
+
LINKERS,
|
|
52
|
+
assemble_full_sequences,
|
|
53
|
+
export_fasta,
|
|
54
|
+
translate_dna,
|
|
55
|
+
validate_sequences,
|
|
56
|
+
)
|
|
57
|
+
from .clonotype import (
|
|
58
|
+
aggregate_clonotypes,
|
|
59
|
+
export_clonotypes_airr,
|
|
60
|
+
get_clonotype_summary,
|
|
61
|
+
)
|
|
62
|
+
from .config import (
|
|
63
|
+
AssembleConfig,
|
|
64
|
+
GEXConfig,
|
|
65
|
+
LoadConfig,
|
|
66
|
+
SCTConfig,
|
|
67
|
+
TCRsiftConfig,
|
|
68
|
+
UnifyConfig,
|
|
69
|
+
)
|
|
70
|
+
from .filter import (
|
|
71
|
+
assign_tiers_threshold,
|
|
72
|
+
filter_clonotypes,
|
|
73
|
+
filter_clonotypes_threshold,
|
|
74
|
+
get_filter_summary,
|
|
75
|
+
split_by_tier,
|
|
76
|
+
)
|
|
77
|
+
from .gex import (
|
|
78
|
+
DEFAULT_GENE_GROUPS,
|
|
79
|
+
DEFAULT_GENE_LIST,
|
|
80
|
+
aggregate_gex_by_clonotype,
|
|
81
|
+
augment_with_gex,
|
|
82
|
+
compute_cd4_cd8_counts,
|
|
83
|
+
)
|
|
84
|
+
from .loader import (
|
|
85
|
+
load_cellranger_gex,
|
|
86
|
+
load_cellranger_vdj,
|
|
87
|
+
load_sample,
|
|
88
|
+
load_samples,
|
|
89
|
+
)
|
|
90
|
+
from .mnemonic import tcr_name
|
|
91
|
+
from .phenotype import (
|
|
92
|
+
classify_tcell_type,
|
|
93
|
+
filter_by_tcell_type,
|
|
94
|
+
get_phenotype_summary,
|
|
95
|
+
phenotype_cells,
|
|
96
|
+
)
|
|
97
|
+
from .plots import (
|
|
98
|
+
create_pipeline_funnel,
|
|
99
|
+
create_tcr_sequence_pdf,
|
|
100
|
+
plot_funnel,
|
|
101
|
+
)
|
|
102
|
+
from .qc import (
|
|
103
|
+
QCReport,
|
|
104
|
+
QCResult,
|
|
105
|
+
find_repeated_kmers,
|
|
106
|
+
get_qc_summary,
|
|
107
|
+
validate_clonotypes,
|
|
108
|
+
validate_sequence,
|
|
109
|
+
)
|
|
110
|
+
|
|
111
|
+
# Core modules
|
|
112
|
+
from .sample_sheet import (
|
|
113
|
+
Sample,
|
|
114
|
+
SampleSheet,
|
|
115
|
+
load_sample_sheet,
|
|
116
|
+
validate_sample_sheet,
|
|
117
|
+
)
|
|
118
|
+
from .sct import (
|
|
119
|
+
aggregate_sct,
|
|
120
|
+
get_sct_specificities,
|
|
121
|
+
load_sct,
|
|
122
|
+
)
|
|
123
|
+
from .til import (
|
|
124
|
+
get_til_summary,
|
|
125
|
+
identify_til_specific_clones,
|
|
126
|
+
match_til,
|
|
127
|
+
)
|
|
128
|
+
from .unify import (
|
|
129
|
+
add_phenotype_confidence,
|
|
130
|
+
compute_condition_statistics,
|
|
131
|
+
find_top_condition,
|
|
132
|
+
get_unify_summary,
|
|
133
|
+
merge_experiments,
|
|
134
|
+
)
|
|
135
|
+
from .validation import TCRsiftValidationError
|
|
136
|
+
from .version import __version__
|
|
137
|
+
|
|
138
|
+
# Names re-exported by ``from tcrsift import *``, grouped by the area of the
# package they come from.  Every entry is imported above in this module.
__all__ = [
    # Version
    "__version__",
    # Configuration
    "TCRsiftConfig",
    "LoadConfig",
    "AssembleConfig",
    "SCTConfig",
    "GEXConfig",
    "UnifyConfig",
    # Sample sheet
    "Sample",
    "SampleSheet",
    "load_sample_sheet",
    "validate_sample_sheet",
    # Loading
    "load_cellranger_vdj",
    "load_cellranger_gex",
    "load_sample",
    "load_samples",
    # SCT (single-cell TCR platform)
    "load_sct",
    "aggregate_sct",
    "get_sct_specificities",
    # GEX
    "augment_with_gex",
    "aggregate_gex_by_clonotype",
    "compute_cd4_cd8_counts",
    "DEFAULT_GENE_LIST",
    "DEFAULT_GENE_GROUPS",
    # Phenotyping
    "phenotype_cells",
    "classify_tcell_type",
    "filter_by_tcell_type",
    "get_phenotype_summary",
    # Clonotyping
    "aggregate_clonotypes",
    "get_clonotype_summary",
    "export_clonotypes_airr",
    # Filtering
    "filter_clonotypes",
    "filter_clonotypes_threshold",
    "assign_tiers_threshold",
    "split_by_tier",
    "get_filter_summary",
    # Annotation
    "load_vdjdb",
    "load_iedb",
    "load_cedar",
    "annotate_clonotypes",
    "get_annotation_summary",
    # TIL
    "match_til",
    "get_til_summary",
    "identify_til_specific_clones",
    # Unify
    "merge_experiments",
    "add_phenotype_confidence",
    "compute_condition_statistics",
    "find_top_condition",
    "get_unify_summary",
    # Assembly
    "DEFAULT_LEADERS",
    "LINKERS",
    "assemble_full_sequences",
    "translate_dna",
    "validate_sequences",
    "export_fasta",
    # Plots
    "plot_funnel",
    "create_pipeline_funnel",
    "create_tcr_sequence_pdf",
    # QC
    "QCReport",
    "QCResult",
    "find_repeated_kmers",
    "validate_sequence",
    "validate_clonotypes",
    "get_qc_summary",
    # Utilities
    "tcr_name",
    # Exceptions
    "TCRsiftValidationError",
]
|
tcrsift/annotate.py
ADDED
|
@@ -0,0 +1,482 @@
|
|
|
1
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
2
|
+
# you may not use this file except in compliance with the License.
|
|
3
|
+
# You may obtain a copy of the License at
|
|
4
|
+
#
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
#
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
"""
|
|
13
|
+
TCR annotation using public databases for TCRsift.
|
|
14
|
+
|
|
15
|
+
Matches TCRs against VDJdb, IEDB, and CEDAR to identify known specificities.
|
|
16
|
+
"""
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import logging
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
import pandas as pd
|
|
23
|
+
from tqdm.auto import tqdm
|
|
24
|
+
|
|
25
|
+
from .validation import (
|
|
26
|
+
TCRsiftValidationError,
|
|
27
|
+
validate_clonotype_df,
|
|
28
|
+
validate_dataframe,
|
|
29
|
+
validate_file_exists,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# Known viral species patterns for flagging
|
|
36
|
+
# Lower-case patterns searched for inside an entry's (lower-cased) species
# name by _flag_viral; a hit on any one of them marks the entry as viral.
VIRAL_SPECIES_PATTERNS = [
    # Cytomegalovirus
    "cmv",
    "cytomegalovirus",
    # Epstein-Barr virus
    "ebv",
    "epstein-barr",
    # HIV
    "hiv",
    "human immunodeficiency",
    # Influenza
    "flu",
    "influenza",
    # Coronaviruses
    "sars",
    "coronavirus",
    # Herpes viruses
    "herpes",
    "hsv",
    # Papillomavirus
    "hpv",
    "papilloma",
    # Hepatitis viruses
    "hepatitis",
    "hbv",
    "hcv",
    # Flaviviruses
    "dengue",
    "zika",
    "yellow fever",
]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def load_vdjdb(path: str | Path, verbose: bool = True) -> pd.DataFrame:
    """
    Load VDJdb database.

    Parameters
    ----------
    path : str or Path
        Path to a VDJdb tab-separated file, or to a directory containing
        files matching ``vdjdb*.txt`` / ``vdjdb*.tsv``.
    verbose : bool
        Log progress information

    Returns
    -------
    pd.DataFrame
        The file's rows with the original columns kept and standardized
        copies added where the source column exists (``cdr3_beta``,
        ``cdr3_alpha``, ``epitope``, ``antigen_gene``, ``species``,
        ``mhc_allele``, ``mhc_class``, ``reference``), plus ``database``
        (always ``"VDJdb"``) and the boolean ``is_viral`` flag.

    Raises
    ------
    TCRsiftValidationError
        If the directory holds no VDJdb file, the file cannot be parsed,
        or the parsed table has no rows.
    """
    path = Path(path)

    if path.is_dir():
        # Look for the main database file.  Glob order depends on the
        # filesystem, so sort to make the choice reproducible.
        candidates = sorted(path.glob("vdjdb*.txt")) + sorted(path.glob("vdjdb*.tsv"))
        if not candidates:
            available = [f.name for f in path.iterdir()][:15]
            raise TCRsiftValidationError(
                f"No VDJdb files found in directory: {path}",
                hint=f"Expected files matching 'vdjdb*.txt' or 'vdjdb*.tsv'. "
                     f"Available files: {available}",
            )
        # Prefer the plainly named file over variants that merely share the
        # prefix (presumably side files such as metadata — TODO confirm).
        exact = [c for c in candidates if c.name in ("vdjdb.txt", "vdjdb.tsv")]
        db_file = (exact or candidates)[0]
    else:
        db_file = validate_file_exists(path, "VDJdb database file")

    if verbose:
        logger.info(f"Loading VDJdb from {db_file}")

    try:
        df = pd.read_csv(db_file, sep="\t", low_memory=False)
    except Exception as e:
        # Chain the cause so the underlying parser/OS traceback is kept.
        raise TCRsiftValidationError(
            f"Failed to read VDJdb file: {db_file}",
            hint=f"Error: {e}. Make sure the file is a valid TSV file.",
        ) from e

    if len(df) == 0:
        raise TCRsiftValidationError(
            f"VDJdb file is empty: {db_file}",
            hint="Download a fresh copy from https://vdjdb.cdr3.net/",
        )

    # Standardize columns.  Source columns that are absent are skipped.
    # NOTE(review): `cdr3` is copied to `cdr3_beta` for every row; if the
    # export also carries alpha-chain rows (e.g. distinguished by a `gene`
    # column) those would be labelled beta — confirm against the VDJdb
    # file format.
    column_mapping = {
        "cdr3": "cdr3_beta",
        # Paired-chain counterpart of "cdr3.alpha"; presumably the beta
        # column name in paired exports — verify against the VDJdb format.
        "cdr3.beta": "cdr3_beta",
        "cdr3.alpha": "cdr3_alpha",
        "antigen.epitope": "epitope",
        "antigen.gene": "antigen_gene",
        "antigen.species": "species",
        "mhc.a": "mhc_allele",
        "mhc.class": "mhc_class",
        "reference.id": "reference",
    }

    for old, new in column_mapping.items():
        if old in df.columns:
            df[new] = df[old]

    df["database"] = "VDJdb"

    # Flag viral entries (uses the standardized `species` column set above)
    df["is_viral"] = _flag_viral(df)

    if verbose:
        logger.info(f"  Loaded {len(df):,} VDJdb entries ({df['is_viral'].sum():,} viral)")
    return df
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def load_iedb(path: str | Path) -> pd.DataFrame:
    """
    Read an IEDB TCR export and add standardized column names.

    Parameters
    ----------
    path : str or Path
        Location of the tab-separated IEDB file.

    Returns
    -------
    pd.DataFrame
        All original columns, plus standardized copies of whichever known
        IEDB columns are present, a ``database`` column set to ``"IEDB"``
        and the boolean ``is_viral`` flag.
    """
    path = Path(path)
    logger.info(f"Loading IEDB from {path}")

    df = pd.read_csv(path, sep="\t", low_memory=False)

    # (IEDB column name, standardized name).  IEDB exports vary, so any
    # column that is not present is simply skipped.
    iedb_to_standard = (
        ("Chain 2 CDR3 Curated", "cdr3_beta"),
        ("Chain 1 CDR3 Curated", "cdr3_alpha"),
        ("Epitope - Name", "epitope"),
        ("Epitope - Source Molecule Name", "antigen_gene"),
        ("Epitope - Source Organism Name", "species"),
        ("MHC Allele Names", "mhc_allele"),
    )
    for source_name, standard_name in iedb_to_standard:
        if source_name not in df.columns:
            continue
        df[standard_name] = df[source_name]

    df["database"] = "IEDB"
    # Must run after the renaming above: the flag reads the `species` column.
    df["is_viral"] = _flag_viral(df)

    logger.info(f"Loaded {len(df)} IEDB entries ({df['is_viral'].sum()} viral)")
    return df
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def load_cedar(path: str | Path) -> pd.DataFrame:
    """
    Read a CEDAR TCR export and add standardized column names.

    Parameters
    ----------
    path : str or Path
        Location of the tab-separated CEDAR file.

    Returns
    -------
    pd.DataFrame
        All original columns, plus standardized copies of whichever known
        CEDAR columns are present, a ``database`` column set to ``"CEDAR"``
        and the boolean ``is_viral`` flag.
    """
    path = Path(path)
    logger.info(f"Loading CEDAR from {path}")

    df = pd.read_csv(path, sep="\t", low_memory=False)

    # (CEDAR column name, standardized name); absent columns are skipped.
    cedar_to_standard = (
        ("cdr3_b_aa", "cdr3_beta"),
        ("cdr3_a_aa", "cdr3_alpha"),
        ("epitope_sequence", "epitope"),
        ("antigen_name", "antigen_gene"),
        ("organism", "species"),
    )
    for source_name, standard_name in cedar_to_standard:
        if source_name not in df.columns:
            continue
        df[standard_name] = df[source_name]

    df["database"] = "CEDAR"
    # Must run after the renaming above: the flag reads the `species` column.
    df["is_viral"] = _flag_viral(df)

    logger.info(f"Loaded {len(df)} CEDAR entries ({df['is_viral'].sum()} viral)")
    return df
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def _flag_viral(df: pd.DataFrame) -> pd.Series:
    """
    Return a boolean Series (indexed like ``df``) marking viral entries.

    An entry is viral when its lower-cased ``species`` value contains any
    pattern from ``VIRAL_SPECIES_PATTERNS``.  Missing species values, and
    frames without a ``species`` column, yield ``False``.
    """
    if "species" in df.columns:
        normalized = df["species"].fillna("").str.lower()
        flags = pd.Series(False, index=df.index)
        for keyword in VIRAL_SPECIES_PATTERNS:
            flags = flags | normalized.str.contains(keyword, na=False)
        return flags

    return pd.Series(False, index=df.index)
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def load_databases(
|
|
220
|
+
vdjdb_path: str | Path | None = None,
|
|
221
|
+
iedb_path: str | Path | None = None,
|
|
222
|
+
cedar_path: str | Path | None = None,
|
|
223
|
+
) -> pd.DataFrame:
|
|
224
|
+
"""
|
|
225
|
+
Load and combine multiple TCR databases.
|
|
226
|
+
|
|
227
|
+
Parameters
|
|
228
|
+
----------
|
|
229
|
+
vdjdb_path : str or Path, optional
|
|
230
|
+
Path to VDJdb
|
|
231
|
+
iedb_path : str or Path, optional
|
|
232
|
+
Path to IEDB
|
|
233
|
+
cedar_path : str or Path, optional
|
|
234
|
+
Path to CEDAR
|
|
235
|
+
|
|
236
|
+
Returns
|
|
237
|
+
-------
|
|
238
|
+
pd.DataFrame
|
|
239
|
+
Combined database with standardized columns
|
|
240
|
+
"""
|
|
241
|
+
dfs = []
|
|
242
|
+
|
|
243
|
+
if vdjdb_path:
|
|
244
|
+
dfs.append(load_vdjdb(vdjdb_path))
|
|
245
|
+
if iedb_path:
|
|
246
|
+
dfs.append(load_iedb(iedb_path))
|
|
247
|
+
if cedar_path:
|
|
248
|
+
dfs.append(load_cedar(cedar_path))
|
|
249
|
+
|
|
250
|
+
if not dfs:
|
|
251
|
+
raise ValueError("At least one database path must be provided")
|
|
252
|
+
|
|
253
|
+
# Combine and deduplicate
|
|
254
|
+
combined = pd.concat(dfs, ignore_index=True)
|
|
255
|
+
|
|
256
|
+
# Keep only rows with at least a beta CDR3
|
|
257
|
+
combined = combined[combined["cdr3_beta"].notna() & (combined["cdr3_beta"] != "")]
|
|
258
|
+
|
|
259
|
+
logger.info(f"Combined database has {len(combined)} entries")
|
|
260
|
+
return combined
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def _clean_cdr3(value) -> str:
    """Return ``value`` when it is a string, else ``""`` (covers None and NaN)."""
    return value if isinstance(value, str) else ""


def match_clonotypes(
    clonotypes: pd.DataFrame,
    database: pd.DataFrame,
    match_by: str = "CDR3ab",
    verbose: bool = True,
    show_progress: bool = True,
) -> pd.DataFrame:
    """
    Match clonotypes against public database.

    Matching is by exact string equality of CDR3 sequences.  A clonotype
    without a beta CDR3 is never matched.  With ``match_by="CDR3ab"`` a
    clonotype first looks for database rows sharing both its alpha and
    beta CDR3; if there are none (or the clonotype has no alpha CDR3) it
    falls back to beta-only rows and the hit is marked partial.

    Parameters
    ----------
    clonotypes : pd.DataFrame
        Clonotype DataFrame; ``CDR3_alpha`` / ``CDR3_beta`` are read when
        present.
    database : pd.DataFrame
        Combined database from load_databases
    match_by : str
        Matching strategy: "CDR3ab" (both chains) or "CDR3b_only" (beta only)
    verbose : bool
        Log progress information
    show_progress : bool
        Show progress bar

    Returns
    -------
    pd.DataFrame
        Copy of the clonotypes with ``db_match``, ``db_epitope``,
        ``db_species``, ``db_database``, ``is_viral`` and
        ``db_match_partial`` columns added.

    Raises
    ------
    TCRsiftValidationError
        If ``match_by`` is not one of the valid options (input validation
        helpers may raise it as well).
    """
    # Validate inputs
    clonotypes = validate_clonotype_df(clonotypes, for_annotation=True)
    database = validate_dataframe(database, "database", min_rows=1)

    valid_match_by = ["CDR3ab", "CDR3b_only"]
    if match_by not in valid_match_by:
        raise TCRsiftValidationError(
            f"Invalid match_by: '{match_by}'",
            hint=f"Valid options are: {valid_match_by}",
        )

    if verbose:
        logger.info(f"Matching {len(clonotypes):,} clonotypes against {len(database):,} database entries by {match_by}")

    df = clonotypes.copy()

    # Initialize annotation columns
    df["db_match"] = False
    df["db_epitope"] = None
    df["db_species"] = None
    df["db_database"] = None
    df["is_viral"] = False
    # Created up front so the column always exists with a boolean value,
    # instead of appearing (NaN-padded) only after the first partial hit.
    df["db_match_partial"] = False

    paired = match_by == "CDR3ab"

    # Index the database once: CDR3 key -> row positions.  This replaces a
    # full-table scan per clonotype with a dictionary lookup.
    db_betas = database["cdr3_beta"].tolist()
    if paired and "cdr3_alpha" in database.columns:
        db_alphas = database["cdr3_alpha"].tolist()
    else:
        db_alphas = [None] * len(db_betas)

    beta_rows = {}
    pair_rows = {}
    for position, (raw_alpha, raw_beta) in enumerate(zip(db_alphas, db_betas)):
        db_beta = _clean_cdr3(raw_beta)
        if not db_beta:
            continue
        beta_rows.setdefault(db_beta, []).append(position)
        db_alpha = _clean_cdr3(raw_alpha)
        if db_alpha:
            pair_rows.setdefault((db_alpha, db_beta), []).append(position)

    # Snapshot the query sequences before annotating: the loop below writes
    # into `df`, which must not be mutated while it is being iterated.
    n_clones = len(df)
    clone_alphas = df["CDR3_alpha"].tolist() if "CDR3_alpha" in df.columns else [None] * n_clones
    clone_betas = df["CDR3_beta"].tolist() if "CDR3_beta" in df.columns else [None] * n_clones
    clones = zip(df.index, clone_alphas, clone_betas)
    if show_progress:
        clones = tqdm(
            clones,
            total=n_clones,
            desc="Matching clonotypes",
            unit="clone",
        )

    for idx, raw_alpha, raw_beta in clones:
        # None, NaN and "" all mean "chain not available".
        alpha = _clean_cdr3(raw_alpha)
        beta = _clean_cdr3(raw_beta)
        if not beta:
            continue

        if paired and alpha and (alpha, beta) in pair_rows:
            # Both chains agree with at least one database row.
            _annotate_match(df, idx, database.iloc[pair_rows[(alpha, beta)]])
        elif beta in beta_rows:
            # Beta-only hit: the requested strategy in CDR3b_only mode, a
            # flagged fallback in CDR3ab mode.
            _annotate_match(df, idx, database.iloc[beta_rows[beta]], partial=paired)

    n_matches = df["db_match"].sum()
    n_viral = df["is_viral"].sum()
    if verbose:
        logger.info(f"  Found {n_matches:,} matches ({n_viral:,} viral)")

    return df
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
def _annotate_match(
    df: pd.DataFrame,
    idx: int,
    matches: pd.DataFrame,
    partial: bool = False,
):
    """
    Annotate a single clonotype with match information.

    Writes into ``df`` in place and returns nothing.

    Parameters
    ----------
    df : pd.DataFrame
        Clonotype frame being annotated.
    idx : int
        Index label of the row in ``df`` to annotate.
    matches : pd.DataFrame
        Database rows that matched this clonotype.  The columns
        ``epitope``, ``species``, ``database`` and ``is_viral`` are used
        when present; absent columns leave the corresponding annotation
        untouched.
    partial : bool
        If True, also set ``db_match_partial`` for the row.
    """
    if len(matches) == 0:
        return

    df.loc[idx, "db_match"] = True

    # Most common epitope / species among the matching rows.  mode() may
    # return several tied values; the first one is taken.
    for source_col, target_col in (("epitope", "db_epitope"), ("species", "db_species")):
        if source_col not in matches.columns:
            continue
        values = matches[source_col].dropna()
        if len(values) > 0:
            df.loc[idx, target_col] = values.mode().iloc[0]

    # Record database sources in order of first appearance.  Missing labels
    # are dropped because str.join cannot handle NaN.
    if "database" in matches.columns:
        sources = matches["database"].dropna().astype(str).unique()
        if len(sources) > 0:
            df.loc[idx, "db_database"] = ";".join(sources)

    # Viral flag: viral if any matching row is viral
    if "is_viral" in matches.columns:
        df.loc[idx, "is_viral"] = bool(matches["is_viral"].any())

    # Partial match flag
    if partial:
        df.loc[idx, "db_match_partial"] = True
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
def annotate_clonotypes(
    clonotypes: pd.DataFrame,
    vdjdb_path: str | Path | None = None,
    iedb_path: str | Path | None = None,
    cedar_path: str | Path | None = None,
    match_by: str = "CDR3ab",
    exclude_viral: bool = False,
    flag_only: bool = False,
) -> pd.DataFrame:
    """
    Load the requested databases and annotate clonotypes against them.

    Parameters
    ----------
    clonotypes : pd.DataFrame
        Clonotype DataFrame
    vdjdb_path, iedb_path, cedar_path : str or Path, optional
        Paths to the databases to load; forwarded to ``load_databases``.
    match_by : str
        Matching strategy, forwarded to ``match_clonotypes``.
    exclude_viral : bool
        Drop clones flagged ``is_viral`` from the result.
    flag_only : bool
        Keep flagged clones even when ``exclude_viral`` is set.

    Returns
    -------
    pd.DataFrame
        Annotated clonotypes, with viral clones removed only when
        ``exclude_viral`` is True and ``flag_only`` is False.
    """
    reference = load_databases(
        vdjdb_path=vdjdb_path,
        iedb_path=iedb_path,
        cedar_path=cedar_path,
    )
    annotated = match_clonotypes(clonotypes, reference, match_by=match_by)

    # flag_only takes precedence over exclude_viral: nothing is removed.
    if flag_only or not exclude_viral:
        return annotated

    n_before = len(annotated)
    non_viral = annotated[~annotated["is_viral"]]
    logger.info(f"Excluded {n_before - len(non_viral)} viral clones")
    return non_viral
|
|
455
|
+
|
|
456
|
+
|
|
457
|
+
def get_annotation_summary(clonotypes: pd.DataFrame) -> dict:
    """
    Get summary of annotation results.

    Parameters
    ----------
    clonotypes : pd.DataFrame
        Clonotype frame; annotation columns are optional and only the ones
        present contribute to the summary.

    Returns
    -------
    dict
        Summary statistics with plain Python integers:

        - ``total``: number of rows
        - ``matched`` / ``viral``: sums of ``db_match`` / ``is_viral``
          (0 when the column is absent)
        - ``database_breakdown``: per-database count of rows whose
          ``db_database`` mentions that database (only if the column exists)
        - ``top_species``: the ten most frequent ``db_species`` values and
          their counts (only if the column exists)
    """
    columns = clonotypes.columns

    # Cast to int so every count has the same type whether or not the
    # column exists, and so the summary can be serialized (e.g. to JSON).
    summary = {
        "total": len(clonotypes),
        "matched": int(clonotypes["db_match"].sum()) if "db_match" in columns else 0,
        "viral": int(clonotypes["is_viral"].sum()) if "is_viral" in columns else 0,
    }

    if "db_database" in columns:
        sources = clonotypes["db_database"].fillna("").astype(str)
        # Database names are literal labels, not regular expressions.
        summary["database_breakdown"] = {
            db: int(sources.str.contains(db, regex=False).sum())
            for db in ("VDJdb", "IEDB", "CEDAR")
        }

    if "db_species" in columns:
        top = clonotypes["db_species"].value_counts().head(10)
        summary["top_species"] = {species: int(count) for species, count in top.items()}

    return summary
|