autoDCR 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
autodcr-0.3.0/PKG-INFO ADDED
@@ -0,0 +1,33 @@
1
+ Metadata-Version: 2.3
2
+ Name: autoDCR
3
+ Version: 0.3.0
4
+ Summary: Automatic DCR: a modified version of Decombinator for uncommon specific TCR annotation-related tasks
5
+ License: MIT
6
+ Author: Jamie Heather
7
+ Author-email: jheather@mgh.harvard.edu
8
+ Requires-Python: >=3.10
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Requires-Dist: acora (>=2.4)
16
+ Requires-Dist: imgtgenedl (>=0.6.1)
17
+ Requires-Dist: matplotlib (>=3.8.0)
18
+ Requires-Dist: pandas (>=2.0.0)
19
+ Requires-Dist: seaborn (>=0.13.0)
20
+ Requires-Dist: typer (>=0.15.0)
21
+ Description-Content-Type: text/markdown
22
+
23
+ # autoDCR
24
+ #### Jamie Heather, MGH, 2025
25
+
26
+ [![PyPI - Version](https://img.shields.io/pypi/v/autoDCR?color=%239467bd)](https://pypi.org/project/autoDCR/) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) ![Static Badge](https://img.shields.io/badge/experimental%20release-8A2BE2)
27
+
28
+
29
+ `autoDCR` (short for **auto**matic **D**e**c**ombinato**r**) is a Python script to perform T cell receptor (TCR) gene annotation. It is inspired by, and in part built upon, the core functionality of [Decombinator](https://github.com/innate2adaptive/Decombinator), the TCR analysis software developed by the Chain lab at UCL. It uses a similar conceptual framework: fast Aho-Corasick tries search for the presence of 'tag' sequences in DNA reads, and these tags are used to identify V and J TCR genes. However, it applies that core concept in different ways, to perform several niche functions that are not well catered to in other TCR annotation pipelines.
30
+
31
+ **Note** that `autoDCR` is under development and should be considered experimental; it aims to cater to specific use cases. [The documentation can be found here: https://jamieheather.github.io/autoDCR/](https://jamieheather.github.io/autoDCR/).
32
+
33
+ The 0.2.7 version used in prior publications [can be accessed via the releases page](https://github.com/JamieHeather/autoDCR/releases/tag/v0.2.7).
@@ -0,0 +1,11 @@
1
+ # autoDCR
2
+ #### Jamie Heather, MGH, 2025
3
+
4
+ [![PyPI - Version](https://img.shields.io/pypi/v/autoDCR?color=%239467bd)](https://pypi.org/project/autoDCR/) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) ![Static Badge](https://img.shields.io/badge/experimental%20release-8A2BE2)
5
+
6
+
7
+ `autoDCR` (short for **auto**matic **D**e**c**ombinato**r**) is a Python script to perform T cell receptor (TCR) gene annotation. It is inspired by, and in part built upon, the core functionality of [Decombinator](https://github.com/innate2adaptive/Decombinator), the TCR analysis software developed by the Chain lab at UCL. It uses a similar conceptual framework: fast Aho-Corasick tries search for the presence of 'tag' sequences in DNA reads, and these tags are used to identify V and J TCR genes. However, it applies that core concept in different ways, to perform several niche functions that are not well catered to in other TCR annotation pipelines.
8
+
9
+ **Note** that `autoDCR` is under development and should be considered experimental; it aims to cater to specific use cases. [The documentation can be found here: https://jamieheather.github.io/autoDCR/](https://jamieheather.github.io/autoDCR/).
10
+
11
+ The 0.2.7 version used in prior publications [can be accessed via the releases page](https://github.com/JamieHeather/autoDCR/releases/tag/v0.2.7).
@@ -0,0 +1,29 @@
1
+ [project]
2
+ name = "autoDCR"
3
+ version = "0.3.0"
4
+ description = "Automatic DCR: a modified version of Decombinator for uncommon specific TCR annotation-related tasks"
5
+ authors = [
6
+ {name = "Jamie Heather",email = "jheather@mgh.harvard.edu"}
7
+ ]
8
+ license = {text = "MIT"}
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ dependencies = [
12
+ "typer (>=0.15.0)",
13
+ "acora (>=2.4)",
14
+ "imgtgenedl (>=0.6.1)",
15
+ "pandas (>=2.0.0)",
16
+ "seaborn (>=0.13.0)",
17
+ "matplotlib (>=3.8.0)"
18
+ ]
19
+
20
+ [tool.poetry]
21
+ packages = [{include = "autoDCRdata", from = "src"},
22
+ {include = "autoDCRscripts", from = "src"}]
23
+
24
+ [tool.poetry.scripts]
25
+ autoDCR = "autoDCRscripts.main:app"
26
+
27
+ [build-system]
28
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
29
+ build-backend = "poetry.core.masonry.api"
File without changes
@@ -0,0 +1 @@
1
+ {"V": {"1": ["[AST][AGS].Y[FILY].", "[AGS].Y[FILY].", "[AST]..Y[FILY].", "[AST][AGS]..[FILY].", "[AST][AGS].Y..", "[AST][AGS].Y[FILY]."], "2": ["Y[FILY]C", "Y.C", "C"]}, "J": {"1": ["FG.GT.[LV]", ".G.GT.[LV]", "F..GT.[LV]", "FG..T.[LV]", "FG.G..[LV]", "FG.GT."], "2": ["FG.G", "WG.G", "CG.G", "F..G", "FG..", "LG.G"]}}
@@ -0,0 +1,7 @@
1
+ from importlib.metadata import version, PackageNotFoundError
2
+
3
# Whole-package versioning: start from the placeholder, then overwrite it with
# the installed distribution's version when the package metadata is available
# (it is not when running from an uninstalled source tree).
__version__ = "unknown"
try:
    __version__ = version('autoDCR')
except PackageNotFoundError:
    # Not installed as a distribution; keep the placeholder set above.
    pass
@@ -0,0 +1,132 @@
1
+ import collections as coll
2
+ import os
3
+ import sys
4
+ from time import time
5
+ from . import autoDCRfunctions as fxn
6
+
7
def vjcdr3_annotate(mode, in_path, out_path, species, loci, orientation, protein, barcoding,
                    deletion_limit, cdr3_limit, dont_translate, dont_gzip, data_dir):
    """
    Annotate the rearranged TCRs in a FASTA/FASTQ file and write them to a TSV.

    Each read is passed through ``fxn.dcr``; reads for which it returns a
    truthy result are written out as one tab-separated row, with the columns
    given by ``fxn.out_headers`` (plus ``fxn.full_feat_headers`` in FULL mode).
    A short timing/yield summary is printed once the input is exhausted.

    :param mode: annotation mode (case-insensitive); 'FULL' additionally runs
        ``fxn.find_full_feats`` on each detected rearrangement
    :param in_path: path to the input read file
    :param out_path: path to the output file; the literal default
        '[input-file-name].tsv' means "name it after the input file, in the
        current directory". '.tsv' is appended if missing
    :param species: species identifier, passed to ``fxn.import_tcr_info``
    :param loci: loci to search, validated via ``fxn.check_features``
    :param orientation: passed through to ``fxn.dcr`` in its parameter dict
    :param protein: if truthy, use the amino acid ('aa') reference data
        rather than the nucleotide ('nt') data
    :param barcoding: currently unused (see the barcoding TODOs below)
    :param deletion_limit: passed through to ``fxn.dcr`` in its parameter dict
    :param cdr3_limit: passed through to ``fxn.dcr`` in its parameter dict
    :param dont_translate: currently unused (see TODO below)
    :param dont_gzip: if falsy, '.gz' is appended to the output path
    :param data_dir: reference data location, passed to ``fxn.import_tcr_info``
    :return: None; results are written to the output file
    :raises IOError: if ``in_path`` does not exist
    """

    # Establish the necessary input data and parameters
    if not os.path.exists(in_path):
        raise IOError(f"Unable to find input file for TCR annotation ({in_path})!")
    # TODO sanity/presence check the input FQ (including a length check - give warning if too short)

    mode = mode.upper()

    loci = fxn.check_features(loci, 'loci')
    mol_type = 'aa' if protein else 'nt'

    reference_data = fxn.import_tcr_info(species, loci, 'JV', mol_type, data_dir)
    reference_data = fxn.import_translate_info(reference_data)

    # Work on a copy: inserting the FULL-mode fields straight into the
    # module-level list would leave them there for every later call.
    headers = list(fxn.out_headers)
    extra_refdat = None
    if mode == 'FULL':
        extra_refdat = fxn.import_tcr_info(species, loci, 'CL', 'nt', data_dir)
        for field in fxn.full_feat_headers:
            headers.insert(4, field)

    dcr_parameters = {'mode': mode,
                      'orientation': orientation,
                      'deletion_limit': deletion_limit,
                      'cdr3_limit': cdr3_limit,
                      'mol_type': mol_type}
    # TODO add in don't translate?

    # Determine where to save the results
    out_path = _resolve_out_path(in_path, out_path, dont_gzip)

    # Number of buffered rows at which results are flushed to disk
    chunk_size = 10000

    counts = coll.Counter()
    start = time()

    # Then loop through the input file, analysing TCRs as we go
    with fxn.opener(in_path, 'r') as in_file, fxn.opener(out_path, 'w') as out_file:

        # Initialise the output file with the header
        out_file.write('\t'.join(headers) + '\n')
        out_str = []
        for read_id, seq, qual in fxn.readfq(in_file):

            # Pad empty quality scores for FASTA files
            if not qual:
                qual = ' ' * len(seq)

            counts['reads'] += 1

            # Figure out the relevant parts of the read for decombining, then search
            # (bc/bc_qual are unused until barcoding is implemented)
            read, read_qual, bc, bc_qual = fxn.sort_read_bits(seq, qual, '')  # TODO add barcoding

            # TODO break it down into different functions
            # TODO 1) find tags 2) call rearrangements 3) translate
            tcr_check = fxn.dcr(read, read_qual, reference_data, dcr_parameters, headers)
            if not tcr_check:
                continue

            # TODO barcoding: record bc/bc_qual as 'umi_seq'/'umi_qual' when barcoding is requested

            counts['rearrangements'] += 1
            tcr_check['sequence_id'] = read_id

            # TODO full - l+c search
            if mode == 'FULL':
                tcr_check = fxn.find_full_feats(tcr_check, extra_refdat, dcr_parameters)

            # Remove in-process gene region labeling
            tcr_check = fxn.tidy_output(tcr_check, headers)

            # TODO move: count reads with v_mismatches/j_mismatches as 'mismatched_germlines' (discover mode)
            out_str.append('\t'.join([str(tcr_check[x]) for x in headers]))

            # Bulk write the results out once there's a sufficient chunk (to prevent this getting too big
            # in memory). Only tested after an append, so an empty buffer is never flushed as a blank line.
            if len(out_str) >= chunk_size:
                out_file.write('\n'.join(out_str) + '\n')
                out_str = []

        # Then write out any leftover calls
        out_file.write('\n'.join(out_str))
        # TODO fix duplicate counts (if desired?)

    time_taken = time() - start
    total_reads = counts['reads']
    found = counts['rearrangements']
    # An empty input file has no reads: report 0% rather than dividing by zero
    percent = round(found / total_reads * 100) if total_reads else 0
    print(f"Took {round(time_taken, 2)} seconds")
    print(f"Found {found} rearranged TCRs in {total_reads} reads (~{percent}%)")

    # TODO (discover mode) report how many reads showed discontinuous tag matches
    # TODO sort summary output (maybe into YAML or JSON?)


def _resolve_out_path(in_path, out_path, dont_gzip):
    """Return the output file name to use, adding '.tsv' / '.gz' suffixes as required."""
    if out_path == '[input-file-name].tsv':
        # Strip the directory first and only then the final extension, so a
        # dot in a directory name or an extension-less file name is handled.
        out_path = os.path.splitext(os.path.basename(in_path))[0] + '.tsv'
    elif not out_path.endswith('.tsv'):
        out_path += '.tsv'
    if not dont_gzip:
        out_path += '.gz'
    return out_path