tcrsift 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tcrsift/__init__.py ADDED
@@ -0,0 +1,221 @@
1
+ # Licensed under the Apache License, Version 2.0 (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ """
14
+ TCRsift: TCR selection from antigen-specific culture and scRNA/VDJ sequencing data.
15
+
16
+ A tool for identifying antigen-specific T cell receptor clones from single-cell
17
+ sequencing data, with support for:
18
+
19
+ - Loading CellRanger VDJ and GEX outputs
20
+ - CD4/CD8 T cell phenotyping from gene expression
21
+ - Clonotype aggregation and frequency analysis
22
+ - Tiered filtering for antigen-specific clones
23
+ - Annotation with public TCR databases (VDJdb, IEDB, CEDAR)
24
+ - TIL (tumor-infiltrating lymphocyte) matching
25
+ - Full-length TCR sequence assembly
26
+
27
+ Example usage::
28
+
29
+ # Run complete pipeline
30
+ tcrsift run --sample-sheet samples.yaml --output-dir results/ --report
31
+
32
+ # Or run individual steps
33
+ tcrsift load --sample-sheet samples.yaml -o loaded.h5ad
34
+ tcrsift phenotype -i loaded.h5ad -o phenotyped.h5ad
35
+ tcrsift clonotype -i phenotyped.h5ad -o clonotypes.csv
36
+ tcrsift filter -i clonotypes.csv -o filtered/
37
+ tcrsift annotate -i filtered/tier4.csv -o annotated.csv --vdjdb /path/to/vdjdb
38
+ tcrsift assemble -i annotated.csv -o full_sequences.csv --include-constant
39
+
40
+ """
41
+
42
+ from .annotate import (
43
+ annotate_clonotypes,
44
+ get_annotation_summary,
45
+ load_cedar,
46
+ load_iedb,
47
+ load_vdjdb,
48
+ )
49
+ from .assemble import (
50
+ DEFAULT_LEADERS,
51
+ LINKERS,
52
+ assemble_full_sequences,
53
+ export_fasta,
54
+ translate_dna,
55
+ validate_sequences,
56
+ )
57
+ from .clonotype import (
58
+ aggregate_clonotypes,
59
+ export_clonotypes_airr,
60
+ get_clonotype_summary,
61
+ )
62
+ from .config import (
63
+ AssembleConfig,
64
+ GEXConfig,
65
+ LoadConfig,
66
+ SCTConfig,
67
+ TCRsiftConfig,
68
+ UnifyConfig,
69
+ )
70
+ from .filter import (
71
+ assign_tiers_threshold,
72
+ filter_clonotypes,
73
+ filter_clonotypes_threshold,
74
+ get_filter_summary,
75
+ split_by_tier,
76
+ )
77
+ from .gex import (
78
+ DEFAULT_GENE_GROUPS,
79
+ DEFAULT_GENE_LIST,
80
+ aggregate_gex_by_clonotype,
81
+ augment_with_gex,
82
+ compute_cd4_cd8_counts,
83
+ )
84
+ from .loader import (
85
+ load_cellranger_gex,
86
+ load_cellranger_vdj,
87
+ load_sample,
88
+ load_samples,
89
+ )
90
+ from .mnemonic import tcr_name
91
+ from .phenotype import (
92
+ classify_tcell_type,
93
+ filter_by_tcell_type,
94
+ get_phenotype_summary,
95
+ phenotype_cells,
96
+ )
97
+ from .plots import (
98
+ create_pipeline_funnel,
99
+ create_tcr_sequence_pdf,
100
+ plot_funnel,
101
+ )
102
+ from .qc import (
103
+ QCReport,
104
+ QCResult,
105
+ find_repeated_kmers,
106
+ get_qc_summary,
107
+ validate_clonotypes,
108
+ validate_sequence,
109
+ )
110
+
111
+ # Core modules
112
+ from .sample_sheet import (
113
+ Sample,
114
+ SampleSheet,
115
+ load_sample_sheet,
116
+ validate_sample_sheet,
117
+ )
118
+ from .sct import (
119
+ aggregate_sct,
120
+ get_sct_specificities,
121
+ load_sct,
122
+ )
123
+ from .til import (
124
+ get_til_summary,
125
+ identify_til_specific_clones,
126
+ match_til,
127
+ )
128
+ from .unify import (
129
+ add_phenotype_confidence,
130
+ compute_condition_statistics,
131
+ find_top_condition,
132
+ get_unify_summary,
133
+ merge_experiments,
134
+ )
135
+ from .validation import TCRsiftValidationError
136
+ from .version import __version__
137
+
138
# Public API of the package: every name listed here is imported above and is
# what ``from tcrsift import *`` exposes. Entries are grouped by pipeline stage.
__all__ = [
    # Version
    "__version__",
    # Configuration
    "TCRsiftConfig",
    "LoadConfig",
    "AssembleConfig",
    "SCTConfig",
    "GEXConfig",
    "UnifyConfig",
    # Sample sheet
    "Sample",
    "SampleSheet",
    "load_sample_sheet",
    "validate_sample_sheet",
    # Loading
    "load_cellranger_vdj",
    "load_cellranger_gex",
    "load_sample",
    "load_samples",
    # SCT (single-cell TCR platform)
    "load_sct",
    "aggregate_sct",
    "get_sct_specificities",
    # GEX
    "augment_with_gex",
    "aggregate_gex_by_clonotype",
    "compute_cd4_cd8_counts",
    "DEFAULT_GENE_LIST",
    "DEFAULT_GENE_GROUPS",
    # Phenotyping
    "phenotype_cells",
    "classify_tcell_type",
    "filter_by_tcell_type",
    "get_phenotype_summary",
    # Clonotyping
    "aggregate_clonotypes",
    "get_clonotype_summary",
    "export_clonotypes_airr",
    # Filtering
    "filter_clonotypes",
    "filter_clonotypes_threshold",
    "assign_tiers_threshold",
    "split_by_tier",
    "get_filter_summary",
    # Annotation
    "load_vdjdb",
    "load_iedb",
    "load_cedar",
    "annotate_clonotypes",
    "get_annotation_summary",
    # TIL
    "match_til",
    "get_til_summary",
    "identify_til_specific_clones",
    # Unify
    "merge_experiments",
    "add_phenotype_confidence",
    "compute_condition_statistics",
    "find_top_condition",
    "get_unify_summary",
    # Assembly
    "DEFAULT_LEADERS",
    "LINKERS",
    "assemble_full_sequences",
    "translate_dna",
    "validate_sequences",
    "export_fasta",
    # Plots
    "plot_funnel",
    "create_pipeline_funnel",
    "create_tcr_sequence_pdf",
    # QC
    "QCReport",
    "QCResult",
    "find_repeated_kmers",
    "validate_sequence",
    "validate_clonotypes",
    "get_qc_summary",
    # Utilities
    "tcr_name",
    # Exceptions
    "TCRsiftValidationError",
]
tcrsift/annotate.py ADDED
@@ -0,0 +1,482 @@
1
+ # Licensed under the Apache License, Version 2.0 (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ """
13
+ TCR annotation using public databases for TCRsift.
14
+
15
+ Matches TCRs against VDJdb, IEDB, and CEDAR to identify known specificities.
16
+ """
17
+ from __future__ import annotations
18
+
19
+ import logging
20
+ from pathlib import Path
21
+
22
+ import pandas as pd
23
+ from tqdm.auto import tqdm
24
+
25
+ from .validation import (
26
+ TCRsiftValidationError,
27
+ validate_clonotype_df,
28
+ validate_dataframe,
29
+ validate_file_exists,
30
+ )
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
# Known viral species patterns for flagging.
# Each entry is lowercase and is tested by ``_flag_viral`` with
# ``Series.str.contains`` against the lowercased ``species`` column, so an
# entry matches anywhere inside the species name (not only whole words).
VIRAL_SPECIES_PATTERNS = [
    "cmv", "cytomegalovirus",
    "ebv", "epstein-barr",
    "hiv", "human immunodeficiency",
    "flu", "influenza",
    "sars", "coronavirus",
    "herpes", "hsv",
    "hpv", "papilloma",
    "hepatitis", "hbv", "hcv",
    "dengue", "zika",
    "yellow fever",
]
48
+
49
+
50
def load_vdjdb(path: str | Path, verbose: bool = True) -> pd.DataFrame:
    """
    Load VDJdb database.

    Parameters
    ----------
    path : str or Path
        Path to VDJdb directory or file. For a directory the database file is
        picked deterministically: ``vdjdb.txt`` when present, otherwise the
        first ``vdjdb*.txt`` / ``vdjdb*.tsv`` file in sorted order. Column
        description files (``*.meta.*``) are never selected.
    verbose : bool
        Print progress information

    Returns
    -------
    pd.DataFrame
        VDJdb entries with standardized columns (``cdr3_beta``,
        ``cdr3_alpha``, ``epitope``, ``antigen_gene``, ``species``,
        ``mhc_allele``, ``mhc_class``, ``reference`` when the source columns
        exist) plus ``database`` and ``is_viral``.

    Raises
    ------
    TCRsiftValidationError
        If no database file is found, the file cannot be parsed, or it
        contains no rows.
    """
    path = Path(path)

    if path.is_dir():
        # Sort so the choice does not depend on filesystem listing order.
        candidates = sorted(
            list(path.glob("vdjdb*.txt")) + list(path.glob("vdjdb*.tsv"))
        )
        # VDJdb releases are known to ship "*.meta.txt" column descriptions
        # next to the data tables; those also match the glob and must not be
        # loaded as the database.
        data_files = [f for f in candidates if ".meta." not in f.name]
        if not data_files:
            available = [f.name for f in path.iterdir()][:15]
            raise TCRsiftValidationError(
                f"No VDJdb files found in directory: {path}",
                hint=f"Expected files matching 'vdjdb*.txt' or 'vdjdb*.tsv'. "
                     f"Available files: {available}",
            )
        preferred = [f for f in data_files if f.name == "vdjdb.txt"]
        db_file = (preferred or data_files)[0]
    else:
        db_file = validate_file_exists(path, "VDJdb database file")

    if verbose:
        logger.info(f"Loading VDJdb from {db_file}")

    try:
        df = pd.read_csv(db_file, sep="\t", low_memory=False)
    except Exception as e:
        # Chain the original error so the parser traceback is preserved.
        raise TCRsiftValidationError(
            f"Failed to read VDJdb file: {db_file}",
            hint=f"Error: {e}. Make sure the file is a valid TSV file.",
        ) from e

    if df.empty:
        raise TCRsiftValidationError(
            f"VDJdb file is empty: {db_file}",
            hint="Download a fresh copy from https://vdjdb.cdr3.net/",
        )

    # Standardize columns
    column_mapping = {
        "cdr3.beta": "cdr3_beta",
        "cdr3.alpha": "cdr3_alpha",
        "antigen.epitope": "epitope",
        "antigen.gene": "antigen_gene",
        "antigen.species": "species",
        "mhc.a": "mhc_allele",
        "mhc.class": "mhc_class",
        "reference.id": "reference",
    }

    for old, new in column_mapping.items():
        if old in df.columns:
            df[new] = df[old]

    # Single-chain layout: one row per chain, with "gene" saying which chain
    # the "cdr3" value belongs to. Route it to the matching column instead of
    # labelling every CDR3 (including alpha chains) as beta.
    if "cdr3" in df.columns:
        if "gene" in df.columns:
            chain = df["gene"].astype(str).str.upper()
            for target, gene_name in (("cdr3_beta", "TRB"), ("cdr3_alpha", "TRA")):
                routed = df["cdr3"].where(chain == gene_name)
                if target in df.columns:
                    df[target] = df[target].fillna(routed)
                else:
                    df[target] = routed
        elif "cdr3_beta" not in df.columns:
            # No chain information available: keep the historical behaviour.
            df["cdr3_beta"] = df["cdr3"]

    df["database"] = "VDJdb"

    # Flag viral entries
    df["is_viral"] = _flag_viral(df)

    if verbose:
        logger.info(f"  Loaded {len(df):,} VDJdb entries ({df['is_viral'].sum():,} viral)")
    return df
123
+
124
+
125
def load_iedb(path: str | Path) -> pd.DataFrame:
    """
    Read an IEDB TCR export and normalise its column names.

    Parameters
    ----------
    path : str or Path
        Location of the tab-separated IEDB file

    Returns
    -------
    pd.DataFrame
        The IEDB table with standardized columns added where the source
        columns exist, plus ``database`` ("IEDB") and ``is_viral``
    """
    iedb_file = Path(path)
    logger.info(f"Loading IEDB from {iedb_file}")

    table = pd.read_csv(iedb_file, sep="\t", low_memory=False)

    # IEDB header names differ between exports; only the ones present are
    # copied to their standardized counterparts.
    standard_names = {
        "Chain 2 CDR3 Curated": "cdr3_beta",
        "Chain 1 CDR3 Curated": "cdr3_alpha",
        "Epitope - Name": "epitope",
        "Epitope - Source Molecule Name": "antigen_gene",
        "Epitope - Source Organism Name": "species",
        "MHC Allele Names": "mhc_allele",
    }
    for source_name in standard_names:
        if source_name not in table.columns:
            continue
        table[standard_names[source_name]] = table[source_name]

    table["database"] = "IEDB"
    table["is_viral"] = _flag_viral(table)

    n_entries = len(table)
    n_viral = table["is_viral"].sum()
    logger.info(f"Loaded {n_entries} IEDB entries ({n_viral} viral)")
    return table
164
+
165
+
166
def load_cedar(path: str | Path) -> pd.DataFrame:
    """
    Read a CEDAR TCR export and normalise its column names.

    Parameters
    ----------
    path : str or Path
        Location of the tab-separated CEDAR file

    Returns
    -------
    pd.DataFrame
        The CEDAR table with standardized columns added where the source
        columns exist, plus ``database`` ("CEDAR") and ``is_viral``
    """
    cedar_file = Path(path)
    logger.info(f"Loading CEDAR from {cedar_file}")

    table = pd.read_csv(cedar_file, sep="\t", low_memory=False)

    # Copy each recognised source column to its standardized name.
    standard_names = {
        "cdr3_b_aa": "cdr3_beta",
        "cdr3_a_aa": "cdr3_alpha",
        "epitope_sequence": "epitope",
        "antigen_name": "antigen_gene",
        "organism": "species",
    }
    for source_name in standard_names:
        if source_name not in table.columns:
            continue
        table[standard_names[source_name]] = table[source_name]

    table["database"] = "CEDAR"
    table["is_viral"] = _flag_viral(table)

    n_entries = len(table)
    n_viral = table["is_viral"].sum()
    logger.info(f"Loaded {n_entries} CEDAR entries ({n_viral} viral)")
    return table
203
+
204
+
205
+ def _flag_viral(df: pd.DataFrame) -> pd.Series:
206
+ """Flag entries as viral based on species column."""
207
+ if "species" not in df.columns:
208
+ return pd.Series(False, index=df.index)
209
+
210
+ species_lower = df["species"].fillna("").str.lower()
211
+
212
+ is_viral = pd.Series(False, index=df.index)
213
+ for pattern in VIRAL_SPECIES_PATTERNS:
214
+ is_viral |= species_lower.str.contains(pattern, na=False)
215
+
216
+ return is_viral
217
+
218
+
219
+ def load_databases(
220
+ vdjdb_path: str | Path | None = None,
221
+ iedb_path: str | Path | None = None,
222
+ cedar_path: str | Path | None = None,
223
+ ) -> pd.DataFrame:
224
+ """
225
+ Load and combine multiple TCR databases.
226
+
227
+ Parameters
228
+ ----------
229
+ vdjdb_path : str or Path, optional
230
+ Path to VDJdb
231
+ iedb_path : str or Path, optional
232
+ Path to IEDB
233
+ cedar_path : str or Path, optional
234
+ Path to CEDAR
235
+
236
+ Returns
237
+ -------
238
+ pd.DataFrame
239
+ Combined database with standardized columns
240
+ """
241
+ dfs = []
242
+
243
+ if vdjdb_path:
244
+ dfs.append(load_vdjdb(vdjdb_path))
245
+ if iedb_path:
246
+ dfs.append(load_iedb(iedb_path))
247
+ if cedar_path:
248
+ dfs.append(load_cedar(cedar_path))
249
+
250
+ if not dfs:
251
+ raise ValueError("At least one database path must be provided")
252
+
253
+ # Combine and deduplicate
254
+ combined = pd.concat(dfs, ignore_index=True)
255
+
256
+ # Keep only rows with at least a beta CDR3
257
+ combined = combined[combined["cdr3_beta"].notna() & (combined["cdr3_beta"] != "")]
258
+
259
+ logger.info(f"Combined database has {len(combined)} entries")
260
+ return combined
261
+
262
+
263
def match_clonotypes(
    clonotypes: pd.DataFrame,
    database: pd.DataFrame,
    match_by: str = "CDR3ab",
    verbose: bool = True,
    show_progress: bool = True,
) -> pd.DataFrame:
    """
    Match clonotypes against public database.

    Parameters
    ----------
    clonotypes : pd.DataFrame
        Clonotype DataFrame (``CDR3_alpha`` / ``CDR3_beta`` columns are read)
    database : pd.DataFrame
        Combined database from load_databases; must contain ``cdr3_beta``.
        A missing ``cdr3_alpha`` column is treated as "no alpha chain known".
    match_by : str
        Matching strategy: "CDR3ab" (both chains, falling back to a beta-only
        match flagged as partial) or "CDR3b_only" (beta only)
    verbose : bool
        Print progress information
    show_progress : bool
        Show progress bar

    Returns
    -------
    pd.DataFrame
        Copy of ``clonotypes`` with ``db_match``, ``db_epitope``,
        ``db_species``, ``db_database``, ``is_viral`` and
        ``db_match_partial`` columns.

    Raises
    ------
    TCRsiftValidationError
        If ``match_by`` is not a supported strategy or the database has no
        ``cdr3_beta`` column.
    """
    # Validate inputs
    clonotypes = validate_clonotype_df(clonotypes, for_annotation=True)
    database = validate_dataframe(database, "database", min_rows=1)

    valid_match_by = ["CDR3ab", "CDR3b_only"]
    if match_by not in valid_match_by:
        raise TCRsiftValidationError(
            f"Invalid match_by: '{match_by}'",
            hint=f"Valid options are: {valid_match_by}",
        )

    if "cdr3_beta" not in database.columns:
        raise TCRsiftValidationError(
            "Database has no 'cdr3_beta' column",
            hint=f"Columns found: {list(database.columns)[:15]}",
        )

    if verbose:
        logger.info(f"Matching {len(clonotypes):,} clonotypes against {len(database):,} database entries by {match_by}")

    df = clonotypes.copy()

    # Initialize annotation columns. "db_match_partial" is created up front so
    # the output schema does not depend on whether a partial match occurred.
    df["db_match"] = False
    df["db_epitope"] = None
    df["db_species"] = None
    df["db_database"] = None
    df["is_viral"] = False
    df["db_match_partial"] = False

    def _as_sequence(value) -> str:
        # Missing values (None / NaN) and non-string cells count as "no chain".
        return value if isinstance(value, str) else ""

    paired = match_by == "CDR3ab"

    # Index the database once: sequence (or sequence pair) -> row positions.
    # The same normalised values are used for lookup and for row selection,
    # so a lookup hit always yields at least one database row.
    if "cdr3_alpha" in database.columns:
        db_alphas = database["cdr3_alpha"].tolist()
    else:
        db_alphas = [""] * len(database)

    beta_positions = {}
    pair_positions = {}
    for position, (db_alpha, db_beta) in enumerate(
        zip(db_alphas, database["cdr3_beta"].tolist())
    ):
        db_beta = _as_sequence(db_beta)
        if not db_beta:
            continue
        beta_positions.setdefault(db_beta, []).append(position)
        if paired:
            key = (_as_sequence(db_alpha), db_beta)
            pair_positions.setdefault(key, []).append(position)

    # Create iterator with optional progress bar (rows are streamed, not
    # materialised into a list first).
    row_iter = df.iterrows()
    if show_progress:
        row_iter = tqdm(
            row_iter,
            total=len(df),
            desc="Matching clonotypes",
            unit="clone",
        )

    for idx, row in row_iter:
        beta = _as_sequence(row.get("CDR3_beta", ""))
        if not beta:
            continue

        if paired:
            alpha = _as_sequence(row.get("CDR3_alpha", ""))
            exact = pair_positions.get((alpha, beta))
            if exact:
                _annotate_match(df, idx, database.iloc[exact])
                continue

        by_beta = beta_positions.get(beta)
        if by_beta:
            # In paired mode a beta-only hit is only a fallback.
            if paired:
                _annotate_match(df, idx, database.iloc[by_beta], partial=True)
            else:
                _annotate_match(df, idx, database.iloc[by_beta])

    n_matches = df["db_match"].sum()
    n_viral = df["is_viral"].sum()
    if verbose:
        logger.info(f"  Found {n_matches:,} matches ({n_viral:,} viral)")

    return df
373
+
374
+
375
+ def _annotate_match(
376
+ df: pd.DataFrame,
377
+ idx: int,
378
+ matches: pd.DataFrame,
379
+ partial: bool = False,
380
+ ):
381
+ """Annotate a single clonotype with match information."""
382
+ if len(matches) == 0:
383
+ return
384
+
385
+ df.loc[idx, "db_match"] = True
386
+
387
+ # Take most common epitope
388
+ epitopes = matches["epitope"].dropna()
389
+ if len(epitopes) > 0:
390
+ df.loc[idx, "db_epitope"] = epitopes.mode().iloc[0]
391
+
392
+ # Take most common species
393
+ species = matches["species"].dropna()
394
+ if len(species) > 0:
395
+ df.loc[idx, "db_species"] = species.mode().iloc[0]
396
+
397
+ # Record database sources
398
+ df.loc[idx, "db_database"] = ";".join(matches["database"].unique())
399
+
400
+ # Viral flag
401
+ df.loc[idx, "is_viral"] = matches["is_viral"].any()
402
+
403
+ # Partial match flag
404
+ if partial:
405
+ df.loc[idx, "db_match_partial"] = True
406
+
407
+
408
def annotate_clonotypes(
    clonotypes: pd.DataFrame,
    vdjdb_path: str | Path | None = None,
    iedb_path: str | Path | None = None,
    cedar_path: str | Path | None = None,
    match_by: str = "CDR3ab",
    exclude_viral: bool = False,
    flag_only: bool = False,
) -> pd.DataFrame:
    """
    Annotate clonotypes against the configured public databases.

    Parameters
    ----------
    clonotypes : pd.DataFrame
        Clonotype DataFrame
    vdjdb_path, iedb_path, cedar_path : str or Path, optional
        Paths to databases; forwarded to ``load_databases``
    match_by : str
        Matching strategy, forwarded to ``match_clonotypes``
    exclude_viral : bool
        Drop clones flagged as viral from the result
    flag_only : bool
        Keep viral clones in the result even when ``exclude_viral`` is set

    Returns
    -------
    pd.DataFrame
        Annotated clonotypes, without viral clones when exclusion applies
    """
    reference = load_databases(
        vdjdb_path=vdjdb_path,
        iedb_path=iedb_path,
        cedar_path=cedar_path,
    )
    annotated = match_clonotypes(clonotypes, reference, match_by=match_by)

    # Viral clones are removed only when exclusion is requested and the
    # caller has not asked for flag-only behaviour.
    if flag_only or not exclude_viral:
        return annotated

    n_before = len(annotated)
    kept = annotated[~annotated["is_viral"]]
    logger.info(f"Excluded {n_before - len(kept)} viral clones")
    return kept
455
+
456
+
457
def get_annotation_summary(clonotypes: pd.DataFrame) -> dict:
    """
    Summarise the annotation columns of a clonotype table.

    Returns
    -------
    dict
        ``total`` (row count), ``matched`` and ``viral`` (sums of the
        ``db_match`` / ``is_viral`` columns, 0 when a column is absent);
        ``database_breakdown`` (per-database hit counts) when ``db_database``
        exists; ``top_species`` (ten most frequent species) when
        ``db_species`` exists.
    """
    columns = clonotypes.columns

    def _flag_total(flag_column):
        # Sum of a boolean column, or 0 when the column was never added.
        if flag_column in columns:
            return clonotypes[flag_column].sum()
        return 0

    summary = {
        "total": len(clonotypes),
        "matched": _flag_total("db_match"),
        "viral": _flag_total("is_viral"),
    }

    if "db_database" in columns:
        # Sources are stored as ";"-joined names, so count by substring.
        sources = clonotypes["db_database"].fillna("")
        summary["database_breakdown"] = {
            name: sources.str.contains(name).sum()
            for name in ("VDJdb", "IEDB", "CEDAR")
        }

    if "db_species" in columns:
        ranked = clonotypes["db_species"].value_counts()
        summary["top_species"] = ranked.head(10).to_dict()

    return summary