autoDCR 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autodcr-0.3.0/PKG-INFO +33 -0
- autodcr-0.3.0/README.md +11 -0
- autodcr-0.3.0/pyproject.toml +29 -0
- autodcr-0.3.0/src/autoDCRdata/__init__.py +0 -0
- autodcr-0.3.0/src/autoDCRdata/regexes.json +1 -0
- autodcr-0.3.0/src/autoDCRscripts/__init__.py +7 -0
- autodcr-0.3.0/src/autoDCRscripts/annotate.py +132 -0
- autodcr-0.3.0/src/autoDCRscripts/autoDCRfunctions.py +1235 -0
- autodcr-0.3.0/src/autoDCRscripts/commandline.py +137 -0
- autodcr-0.3.0/src/autoDCRscripts/discover.py +192 -0
- autodcr-0.3.0/src/autoDCRscripts/genotype.py +54 -0
- autodcr-0.3.0/src/autoDCRscripts/main.py +182 -0
- autodcr-0.3.0/src/autoDCRscripts/referencedata.py +654 -0
autodcr-0.3.0/PKG-INFO
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: autoDCR
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Automatic DCR: a modified version of Decombinator for uncommon specific TCR annotation-related tasks
|
|
5
|
+
License: MIT
|
|
6
|
+
Author: Jamie Heather
|
|
7
|
+
Author-email: jheather@mgh.harvard.edu
|
|
8
|
+
Requires-Python: >=3.10
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Requires-Dist: acora (>=2.4)
|
|
16
|
+
Requires-Dist: imgtgenedl (>=0.6.1)
|
|
17
|
+
Requires-Dist: matplotlib (>=3.8.0)
|
|
18
|
+
Requires-Dist: pandas (>=2.0.0)
|
|
19
|
+
Requires-Dist: seaborn (>=0.13.0)
|
|
20
|
+
Requires-Dist: typer (>=0.15.0)
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
|
|
23
|
+
# autoDCR
|
|
24
|
+
#### Jamie Heather, MGH, 2025
|
|
25
|
+
|
|
26
|
+
[PyPI](https://pypi.org/project/autoDCR/) [License: MIT](https://opensource.org/licenses/MIT)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
`autoDCR` (short for **auto**matic **D**e**c**ombinato**r**) is a Python script to perform T cell receptor (TCR) gene annotation. It is inspired by, and in part built upon, the core functionality of [Decombinator](https://github.com/innate2adaptive/Decombinator), the TCR analysis software developed by the Chain lab at UCL. It uses a similar conceptual framework: fast Aho-Corasick tries search for the presence of 'tag' sequences in DNA reads, and these are used to identify V and J TCR genes. However, it applies that core concept in different ways, to perform several niche functions that are not well catered for in other TCR annotation pipelines.
|
|
30
|
+
|
|
31
|
+
**Note** that `autoDCR` is under development and should be considered experimental; it aims to cater to specific use cases. [The documentation can be found here: https://jamieheather.github.io/autoDCR/](https://jamieheather.github.io/autoDCR/).
|
|
32
|
+
|
|
33
|
+
The 0.2.7 version used in prior publications [can be accessed via the releases page](https://github.com/JamieHeather/autoDCR/releases/tag/v0.2.7).
|
autodcr-0.3.0/README.md
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# autoDCR
|
|
2
|
+
#### Jamie Heather, MGH, 2025
|
|
3
|
+
|
|
4
|
+
[PyPI](https://pypi.org/project/autoDCR/) [License: MIT](https://opensource.org/licenses/MIT)
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
`autoDCR` (short for **auto**matic **D**e**c**ombinato**r**) is a Python script to perform T cell receptor (TCR) gene annotation. It is inspired by, and in part built upon, the core functionality of [Decombinator](https://github.com/innate2adaptive/Decombinator), the TCR analysis software developed by the Chain lab at UCL. It uses a similar conceptual framework: fast Aho-Corasick tries search for the presence of 'tag' sequences in DNA reads, and these are used to identify V and J TCR genes. However, it applies that core concept in different ways, to perform several niche functions that are not well catered for in other TCR annotation pipelines.
|
|
8
|
+
|
|
9
|
+
**Note** that `autoDCR` is under development and should be considered experimental; it aims to cater to specific use cases. [The documentation can be found here: https://jamieheather.github.io/autoDCR/](https://jamieheather.github.io/autoDCR/).
|
|
10
|
+
|
|
11
|
+
The 0.2.7 version used in prior publications [can be accessed via the releases page](https://github.com/JamieHeather/autoDCR/releases/tag/v0.2.7).
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "autoDCR"
|
|
3
|
+
version = "0.3.0"
|
|
4
|
+
description = "Automatic DCR: a modified version of Decombinator for uncommon specific TCR annotation-related tasks"
|
|
5
|
+
authors = [
|
|
6
|
+
{name = "Jamie Heather",email = "jheather@mgh.harvard.edu"}
|
|
7
|
+
]
|
|
8
|
+
license = {text = "MIT"}
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"typer (>=0.15.0)",
|
|
13
|
+
"acora (>=2.4)",
|
|
14
|
+
"imgtgenedl (>=0.6.1)",
|
|
15
|
+
"pandas (>=2.0.0)",
|
|
16
|
+
"seaborn (>=0.13.0)",
|
|
17
|
+
"matplotlib (>=3.8.0)"
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
[tool.poetry]
|
|
21
|
+
packages = [{include = "autoDCRdata", from = "src"},
|
|
22
|
+
{include = "autoDCRscripts", from = "src"}]
|
|
23
|
+
|
|
24
|
+
[tool.poetry.scripts]
|
|
25
|
+
autoDCR = "autoDCRscripts.main:app"
|
|
26
|
+
|
|
27
|
+
[build-system]
|
|
28
|
+
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
|
29
|
+
build-backend = "poetry.core.masonry.api"
|
|
File without changes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"V": {"1": ["[AST][AGS].Y[FILY].", "[AGS].Y[FILY].", "[AST]..Y[FILY].", "[AST][AGS]..[FILY].", "[AST][AGS].Y..", "[AST][AGS].Y[FILY]."], "2": ["Y[FILY]C", "Y.C", "C"]}, "J": {"1": ["FG.GT.[LV]", ".G.GT.[LV]", "F..GT.[LV]", "FG..T.[LV]", "FG.G..[LV]", "FG.GT."], "2": ["FG.G", "WG.G", "CG.G", "F..G", "FG..", "LG.G"]}}
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
import collections as coll
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
from time import time
|
|
5
|
+
from . import autoDCRfunctions as fxn
|
|
6
|
+
|
|
7
|
+
def vjcdr3_annotate(mode, in_path, out_path, species, loci, orientation, protein, barcoding,
                    deletion_limit, cdr3_limit, dont_translate, dont_gzip, data_dir):
    """
    Annotate the TCR rearrangements found in a sequence file and write them to a TSV.

    Reads are streamed from ``in_path`` via ``fxn.readfq``, each is passed to ``fxn.dcr``,
    and every read that yields a rearrangement is written as one tab-separated row.
    A short summary (time taken, number of rearrangements/reads) is printed at the end.

    :param mode: annotation mode (case-insensitive); 'FULL' additionally loads the 'CL'
        reference regions and runs ``fxn.find_full_feats`` on each rearrangement
    :param in_path: path to the input read file (must exist)
    :param out_path: path to the output file; the placeholder '[input-file-name].tsv' means
        "derive the name from the input file"; '.tsv' is appended if missing
    :param species: species identifier, passed to ``fxn.import_tcr_info``
    :param loci: loci to search, validated with ``fxn.check_features``
    :param orientation: passed through to ``fxn.dcr`` in the parameter dictionary
    :param protein: if truthy, amino acid ('aa') reference data are used instead of 'nt'
    :param barcoding: currently unused (barcoding support is still a TODO)
    :param deletion_limit: passed through to ``fxn.dcr`` in the parameter dictionary
    :param cdr3_limit: passed through to ``fxn.dcr`` in the parameter dictionary
    :param dont_translate: currently unused (see TODO below)
    :param dont_gzip: if falsy, '.gz' is appended to the output path
    :param data_dir: reference data directory, passed to ``fxn.import_tcr_info``
    :return: None (results are written to ``out_path``)
    :raises IOError: if ``in_path`` does not exist
    """

    # Establish the necessary input data and parameters
    if not os.path.exists(in_path):
        raise IOError(f"Unable to find input file for TCR annotation ({in_path})!")
    # TODO sanity/presence check the input FQ (including a length check - give warning if too short)

    mode = mode.upper()
    full_mode = mode == 'FULL'

    loci = fxn.check_features(loci, 'loci')
    mol_type = 'aa' if protein else 'nt'

    reference_data = fxn.import_tcr_info(species, loci, 'JV', mol_type, data_dir)
    reference_data = fxn.import_translate_info(reference_data)

    # Work on a copy: inserting into fxn.out_headers itself would permanently alter the shared
    # module-level list, duplicating the extra columns on every subsequent FULL-mode call
    headers = list(fxn.out_headers)
    extra_refdat = None
    if full_mode:
        extra_refdat = fxn.import_tcr_info(species, loci, 'CL', 'nt', data_dir)
        for field in fxn.full_feat_headers:
            headers.insert(4, field)

    dcr_parameters = {'mode': mode,
                      'orientation': orientation,
                      'deletion_limit': deletion_limit,
                      'cdr3_limit': cdr3_limit,
                      'mol_type': mol_type}
    # TODO add in don't translate?

    # Determine where to save the results
    if out_path == '[input-file-name].tsv':
        # Strip the directory and the final extension only; this is safe for names without any
        # '.' (where slicing on rfind() == -1 would drop the last character) and for paths
        # using the platform's own separator
        out_path = os.path.splitext(os.path.basename(in_path))[0] + '.tsv'
    elif not out_path.endswith('.tsv'):
        out_path += '.tsv'
    if not dont_gzip:
        out_path += '.gz'

    counts = coll.Counter()
    start = time()

    # Then loop through the input file, analysing TCRs as we go
    with fxn.opener(in_path, 'r') as in_file, fxn.opener(out_path, 'w') as out_file:

        # Initialise the output file with the header
        out_file.write('\t'.join(headers) + '\n')
        out_str = []
        for read_id, seq, qual in fxn.readfq(in_file):

            # Pad empty quality scores for FASTA files
            if not qual:
                qual = ' ' * len(seq)

            counts['reads'] += 1

            # Figure out the relevant parts of the read for decombining, then search
            # TODO add barcoding (the barcode fields are currently discarded)
            read, read_qual, _bc, _bc_qual = fxn.sort_read_bits(seq, qual, '')

            # TODO break it down into different functions
            # TODO 1) find tags 2) call rearrangements 3) translate
            tcr_check = fxn.dcr(read, read_qual, reference_data, dcr_parameters, headers)
            if not tcr_check:
                continue

            counts['rearrangements'] += 1
            tcr_check['sequence_id'] = read_id

            # TODO full - l+c search
            if full_mode:
                tcr_check = fxn.find_full_feats(tcr_check, extra_refdat, dcr_parameters)

            # Remove in-process gene region labeling
            tcr_check = fxn.tidy_output(tcr_check, headers)
            out_str.append('\t'.join([str(tcr_check[x]) for x in headers]))

            # Bulk write the results out once there's a sufficient chunk (to prevent this getting
            # too big in memory); only checked after an append, so an empty buffer never flushes
            if len(out_str) >= 10000:
                out_file.write('\n'.join(out_str) + '\n')
                out_str = []

        # Then write out any leftover calls
        out_file.write('\n'.join(out_str))
        # TODO fix duplicate counts (if desired?)

    time_taken = time() - start

    # Guard the percentage against input files that contain no reads at all
    if counts['reads']:
        percent_rearranged = round(counts['rearrangements'] / counts['reads'] * 100)
    else:
        percent_rearranged = 0

    print("Took", str(round(time_taken, 2)), "seconds")
    print("Found", str(counts['rearrangements']), "rearranged TCRs in", str(counts['reads']), "reads "
          "(~" + str(percent_rearranged) + "%)")

    # TODO sort summary output (maybe into YAML or JSON?)
|