jsrc 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
jsrc-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 JiaoYuan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
jsrc-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,76 @@
1
+ Metadata-Version: 2.4
2
+ Name: jsrc
3
+ Version: 0.1.0
4
+ Summary: Bioinformatics and phenotype analysis toolkit
5
+ Author-email: Jiaoyuan <your.email@example.com>
6
+ License: MIT
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: biopython>=1.80
11
+ Requires-Dist: matplotlib>=3.5
12
+ Requires-Dist: numpy>=1.20
13
+ Requires-Dist: opencv-python>=4.5
14
+ Requires-Dist: pandas>=1.3
15
+ Requires-Dist: pillow>=9.0
16
+ Provides-Extra: dev
17
+ Requires-Dist: pytest>=7.0; extra == "dev"
18
+ Requires-Dist: black>=22.0; extra == "dev"
19
+ Requires-Dist: ruff>=0.1; extra == "dev"
20
+ Dynamic: license-file
21
+
22
+ # jsrc
23
+
24
+ Bioinformatics and phenotype analysis toolkit
25
+
26
+ ## Installation
27
+
28
+ ```bash
29
+ pip install jsrc
30
+ ```
31
+
32
+ Or install from source:
33
+
34
+ ```bash
35
+ git clone https://github.com/imjiaoyuan/jsrc.git
36
+ cd jsrc
37
+ pip install -e .
38
+ ```
39
+
40
+ ## Commands
41
+
42
+ **Sequence Operations**
43
+
44
+ ```bash
45
+ jsrc seq extract -fa genome.fa -ids ids.txt -o output.fa
46
+ jsrc seq rename -fa input.fa -map mapping.csv -o output.fa
47
+ jsrc seq rename-by-gff -fa transcripts.fa -gff genes.gff -parent Parent -o output.fa
48
+ jsrc seq translate -fa genome.fa -gff genes.gff -id ID -o proteins.fa
49
+ ```
50
+
51
+ **Visualization**
52
+
53
+ ```bash
54
+ jsrc plot gene-structure -gff genes.gff -ids genes.txt -o gene_structure.png
55
+ jsrc plot exon-structure -gff genes.gff -ids genes.txt -o exon_structure.png
56
+ jsrc plot chromosome-map -gff genes.gff -o chromosome_map.png
57
+ jsrc plot protein-domain -tsv domains.tsv -o protein_domains.png
58
+ jsrc plot cis-element -bed elements.bed -o cis_elements.png
59
+ ```
60
+
61
+ **Analysis Tools**
62
+
63
+ ```bash
64
+ jsrc analyze phylo-tree -fa sequences.fa -o tree.nwk -a nj
65
+ jsrc analyze phylo-tree -fa sequences.fa -o tree.nwk -a ml
66
+ jsrc analyze motif -fa promoters.fa -o motif_output -nmotifs 5
67
+ ```
68
+
69
+ **Phenotype Image Analysis**
70
+
71
+ ```bash
72
+ jsrc pheno split-fruit -i fruit_image.jpg -o output_dir
73
+ jsrc pheno split-fruit-raw -i fruit_image.jpg -o output_dir
74
+ jsrc pheno split-leaf -i leaf_image.jpg -o output_dir
75
+ jsrc pheno split-leaf-edge -i leaf_image.jpg -o output_dir
76
+ ```
jsrc-0.1.0/README.md ADDED
@@ -0,0 +1,55 @@
1
+ # jsrc
2
+
3
+ Bioinformatics and phenotype analysis toolkit
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install jsrc
9
+ ```
10
+
11
+ Or install from source:
12
+
13
+ ```bash
14
+ git clone https://github.com/imjiaoyuan/jsrc.git
15
+ cd jsrc
16
+ pip install -e .
17
+ ```
18
+
19
+ ## Commands
20
+
21
+ **Sequence Operations**
22
+
23
+ ```bash
24
+ jsrc seq extract -fa genome.fa -ids ids.txt -o output.fa
25
+ jsrc seq rename -fa input.fa -map mapping.csv -o output.fa
26
+ jsrc seq rename-by-gff -fa transcripts.fa -gff genes.gff -parent Parent -o output.fa
27
+ jsrc seq translate -fa genome.fa -gff genes.gff -id ID -o proteins.fa
28
+ ```
29
+
30
+ **Visualization**
31
+
32
+ ```bash
33
+ jsrc plot gene-structure -gff genes.gff -ids genes.txt -o gene_structure.png
34
+ jsrc plot exon-structure -gff genes.gff -ids genes.txt -o exon_structure.png
35
+ jsrc plot chromosome-map -gff genes.gff -o chromosome_map.png
36
+ jsrc plot protein-domain -tsv domains.tsv -o protein_domains.png
37
+ jsrc plot cis-element -bed elements.bed -o cis_elements.png
38
+ ```
39
+
40
+ **Analysis Tools**
41
+
42
+ ```bash
43
+ jsrc analyze phylo-tree -fa sequences.fa -o tree.nwk -a nj
44
+ jsrc analyze phylo-tree -fa sequences.fa -o tree.nwk -a ml
45
+ jsrc analyze motif -fa promoters.fa -o motif_output -nmotifs 5
46
+ ```
47
+
48
+ **Phenotype Image Analysis**
49
+
50
+ ```bash
51
+ jsrc pheno split-fruit -i fruit_image.jpg -o output_dir
52
+ jsrc pheno split-fruit-raw -i fruit_image.jpg -o output_dir
53
+ jsrc pheno split-leaf -i leaf_image.jpg -o output_dir
54
+ jsrc pheno split-leaf-edge -i leaf_image.jpg -o output_dir
55
+ ```
@@ -0,0 +1,35 @@
1
+ [project]
2
+ name = "jsrc"
3
+ version = "0.1.0"
4
+ description = "Bioinformatics and phenotype analysis toolkit"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ license = {text = "MIT"}
8
+ authors = [
9
+ {name = "Jiaoyuan", email = "your.email@example.com"}
10
+ ]
11
+
12
+ dependencies = [
13
+ "biopython>=1.80",
14
+ "matplotlib>=3.5",
15
+ "numpy>=1.20",
16
+ "opencv-python>=4.5",
17
+ "pandas>=1.3",
18
+ "pillow>=9.0",
19
+ ]
20
+
21
+ [project.optional-dependencies]
22
+ dev = ["pytest>=7.0", "black>=22.0", "ruff>=0.1"]
23
+
24
+ [project.scripts]
25
+ jsrc = "cli:main"
26
+
27
+ [build-system]
28
+ requires = ["setuptools>=61.0", "wheel"]
29
+ build-backend = "setuptools.build_meta"
30
+
31
+ [tool.setuptools]
32
+ py-modules = ["cli", "common", "seq", "plot", "analyze", "pheno"]
33
+
34
+ [tool.setuptools.package-dir]
35
+ "" = "src"
jsrc-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,56 @@
1
+ import sys
2
+ import os
3
+ import tempfile
4
+ from Bio import SeqIO, AlignIO, Phylo
5
+ from common import sanitize_fasta_ids, check_external_tool
6
+
7
def cmd_phylo_tree(args):
    """Align the sequences in ``args.fa`` with MAFFT and write a Newick tree to ``args.o``.

    ``args.a`` selects the method: ``'nj'`` builds a neighbor-joining tree from
    an identity distance matrix with Biopython; any other value runs FastTree
    in nucleotide mode (``-nt``). Sequence IDs are sanitized before alignment
    and restored on the resulting tree.

    Exits with status 1 (message on stderr) if MAFFT or FastTree fails.
    Raises RuntimeError (from ``check_external_tool``) if a required
    executable is not on PATH.
    """
    # Local import: the module's top-level imports do not include subprocess.
    import subprocess

    check_external_tool('mafft', 'conda install -c bioconda mafft')
    if args.a == 'ml':
        check_external_tool('FastTree', 'conda install -c bioconda fasttree')

    def run_to_file(command, out_path, label):
        # Argument lists avoid shell quoting problems; the exit status is
        # checked so a failed tool does not surface later as a parse error.
        with open(out_path, 'w') as out_handle:
            status = subprocess.run(command, stdout=out_handle).returncode
        if status != 0:
            print(f"Error: {label} failed with exit code {status}", file=sys.stderr)
            sys.exit(1)

    with tempfile.TemporaryDirectory() as tmpdir:
        clean_fa = os.path.join(tmpdir, 'clean.fa')
        aln_fa = os.path.join(tmpdir, 'aligned.fa')

        # Maps sanitized ID -> original ID so names can be restored below.
        id_map = sanitize_fasta_ids(args.fa, clean_fa)

        run_to_file(['mafft', '--auto', '--quiet', clean_fa], aln_fa, 'MAFFT')

        if args.a == 'nj':
            from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor

            alignment = AlignIO.read(aln_fa, 'fasta')
            calculator = DistanceCalculator('identity')
            dm = calculator.get_distance(alignment)
            constructor = DistanceTreeConstructor(calculator)
            tree = constructor.nj(dm)
        else:
            tree_file = os.path.join(tmpdir, 'tree.nwk')
            run_to_file(['FastTree', '-nt', '-quiet', aln_fa], tree_file, 'FastTree')
            tree = Phylo.read(tree_file, 'newick')

        # Put the original sequence names back on the tree.
        for clade in tree.find_clades():
            if clade.name and clade.name in id_map:
                clade.name = id_map[clade.name]

        Phylo.write(tree, args.o, 'newick')

    print(f"Phylogenetic tree ({args.a}) saved to {args.o}")
42
+
43
def cmd_motif(args):
    """Run MEME motif discovery on ``args.fa`` and write the results into ``args.o``.

    Reads ``args.nmotifs``, ``args.minw`` and ``args.maxw`` for the motif count
    and width bounds; MEME is run in DNA mode with the ``zoops`` model.

    Exits with status 1 (message on stderr) if MEME returns a non-zero exit
    code. Raises RuntimeError (from ``check_external_tool``) if ``meme`` is
    not on PATH.
    """
    # Local import: the module's top-level imports do not include subprocess.
    import subprocess

    check_external_tool('meme', 'conda install -c bioconda meme')

    os.makedirs(args.o, exist_ok=True)

    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact and yields the tool's real exit code.
    cmd = [
        'meme', args.fa, '-dna',
        '-oc', args.o,
        '-nmotifs', str(args.nmotifs),
        '-minw', str(args.minw),
        '-maxw', str(args.maxw),
        '-mod', 'zoops',
    ]
    ret = subprocess.run(cmd).returncode

    if ret != 0:
        print(f"Error: MEME failed with exit code {ret}", file=sys.stderr)
        sys.exit(1)

    print(f"Motif analysis complete. Results in {args.o}")
jsrc-0.1.0/src/cli.py ADDED
@@ -0,0 +1,140 @@
1
+ import argparse
2
+ import sys
3
+ import seq, plot, analyze, pheno
4
+
5
def _req(help_text):
    """Return add_argument keyword options for a required string option."""
    return {'required': True, 'help': help_text}


def _opt_int(default, help_text):
    """Return add_argument keyword options for an optional integer option."""
    return {'type': int, 'default': default, 'help': help_text}


def _add_command(subparsers, name, help_text, handler, arguments):
    """Register one leaf sub-command with its options and handler function."""
    cmd = subparsers.add_parser(name, help=help_text)
    for flag, options in arguments:
        cmd.add_argument(flag, **options)
    cmd.set_defaults(func=handler)


def main():
    """Parse the ``jsrc`` command line and dispatch to the selected sub-command.

    Commands are organised in four groups (``seq``, ``plot``, ``analyze``,
    ``pheno``). When no command, or a group without a sub-command, is given,
    the relevant help text is printed and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(
        prog='jsrc',
        description='Bioinformatics and phenotype analysis toolkit'
    )
    # Matches the version declared in the package metadata (was '1.0.0').
    parser.add_argument('--version', action='version', version='0.1.0')

    subparsers = parser.add_subparsers(dest='command', help='Available commands')

    # Options shared by several commands.
    fasta_in = ('-fa', _req('Input FASTA file'))
    gff_in = ('-gff', _req('GFF annotation file'))
    fasta_out = ('-o', _req('Output FASTA file'))
    png_out = ('-o', _req('Output PNG file'))
    dir_out = ('-o', _req('Output directory'))
    image_in = ('-i', _req('Input image file'))
    dpi = ('-dpi', _opt_int(300, 'DPI (default: 300)'))
    size = ('-size', _opt_int(800, 'Target size (default: 800)'))

    seq_parser = subparsers.add_parser('seq', help='Sequence operations')
    seq_sub = seq_parser.add_subparsers(dest='seq_cmd')
    _add_command(seq_sub, 'extract', 'Extract sequences by ID list', seq.cmd_extract, [
        fasta_in,
        ('-ids', _req('ID list file (one per line)')),
        fasta_out,
    ])
    _add_command(seq_sub, 'rename', 'Rename sequences using mapping file', seq.cmd_rename, [
        fasta_in,
        ('-map', _req('ID mapping file (CSV: old,new)')),
        fasta_out,
    ])
    _add_command(seq_sub, 'rename-by-gff', 'Rename sequences based on GFF', seq.cmd_rename_by_gff, [
        fasta_in,
        gff_in,
        ('-parent', _req('Parent attribute field name')),
        fasta_out,
    ])
    _add_command(seq_sub, 'translate', 'Extract CDS and translate to protein', seq.cmd_translate, [
        ('-fa', _req('Genome FASTA file')),
        gff_in,
        ('-id', _req('Gene ID field in GFF')),
        ('-o', _req('Output protein FASTA')),
    ])

    plot_parser = subparsers.add_parser('plot', help='Visualization')
    plot_sub = plot_parser.add_subparsers(dest='plot_cmd')
    _add_command(plot_sub, 'gene-structure', 'Plot gene structure diagram', plot.cmd_gene_structure, [
        gff_in,
        ('-ids', _req('Gene ID list file')),
        png_out,
        dpi,
    ])
    _add_command(plot_sub, 'exon-structure', 'Plot exon structure', plot.cmd_exon_structure, [
        gff_in,
        ('-ids', _req('Gene ID list file')),
        png_out,
        dpi,
    ])
    _add_command(plot_sub, 'chromosome-map', 'Plot chromosome map', plot.cmd_chromosome_map, [
        gff_in,
        png_out,
        dpi,
    ])
    _add_command(plot_sub, 'protein-domain', 'Plot protein domain architecture', plot.cmd_protein_domain, [
        ('-tsv', _req('Domain TSV file')),
        png_out,
        dpi,
    ])
    _add_command(plot_sub, 'cis-element', 'Plot cis-regulatory elements', plot.cmd_cis_element, [
        ('-bed', _req('BED file with elements')),
        png_out,
        dpi,
    ])

    analyze_parser = subparsers.add_parser('analyze', help='Analysis tools')
    analyze_sub = analyze_parser.add_subparsers(dest='analyze_cmd')
    _add_command(analyze_sub, 'phylo-tree', 'Build phylogenetic tree', analyze.cmd_phylo_tree, [
        fasta_in,
        ('-o', _req('Output tree file')),
        ('-a', {'choices': ['nj', 'ml'], 'default': 'nj', 'help': 'Algorithm (default: nj)'}),
    ])
    _add_command(analyze_sub, 'motif', 'Find motifs using MEME', analyze.cmd_motif, [
        fasta_in,
        dir_out,
        ('-nmotifs', _opt_int(5, 'Number of motifs (default: 5)')),
        ('-minw', _opt_int(6, 'Min motif width (default: 6)')),
        ('-maxw', _opt_int(50, 'Max motif width (default: 50)')),
    ])

    pheno_parser = subparsers.add_parser('pheno', help='Phenotype image analysis')
    pheno_sub = pheno_parser.add_subparsers(dest='pheno_cmd')
    _add_command(pheno_sub, 'split-fruit', 'Segment fruit objects', pheno.cmd_split_fruit, [
        image_in, dir_out, size,
    ])
    _add_command(pheno_sub, 'split-fruit-raw', 'Segment fruit without resizing', pheno.cmd_split_fruit_raw, [
        image_in, dir_out,
    ])
    _add_command(pheno_sub, 'split-leaf', 'Segment leaf objects', pheno.cmd_split_leaf, [
        image_in, dir_out, size,
    ])
    _add_command(pheno_sub, 'split-leaf-edge', 'Extract leaf edges', pheno.cmd_split_leaf_edge, [
        image_in, dir_out,
    ])

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        sys.exit(1)

    if hasattr(args, 'func'):
        args.func(args)
        return

    # A group was named without a sub-command: show that group's help.
    group_parsers = {
        'seq': seq_parser,
        'plot': plot_parser,
        'analyze': analyze_parser,
        'pheno': pheno_parser,
    }
    group_parsers[args.command].print_help()
    sys.exit(1)


if __name__ == '__main__':
    main()
@@ -0,0 +1,135 @@
1
+ import re
2
+ import os
3
+ import shutil
4
+ from typing import Dict, List, Tuple, Optional
5
+ import cv2
6
+ import numpy as np
7
+
8
def parse_gff_attributes(attr_string: str) -> Dict[str, str]:
    """Parse a GFF/GTF attribute column into a dictionary.

    Fields are separated by ``;``. A field containing ``=`` is split at the
    first ``=`` (``key=value``); otherwise a field containing a space is split
    at the first run of whitespace (``key "value"``). Surrounding double
    quotes are removed from values. Fields matching neither form are ignored.
    """
    result = {}
    fields = attr_string.strip().strip(';').split(';')
    for raw in fields:
        field = raw.strip()
        if '=' in raw:
            name, _, val = field.partition('=')
            result[name] = val.strip('"')
            continue
        if ' ' not in raw:
            continue
        tokens = field.split(None, 1)
        if len(tokens) == 2:
            result[tokens[0]] = tokens[1].strip('"')
    return result
19
+
20
def get_gene_structure(gff_file: str, gene_ids: List[str],
                       feature_types: Optional[List[str]] = None) -> Dict:
    """Collect (start, end) coordinates of sub-features for the requested genes.

    Args:
        gff_file: Path to a tab-separated GFF file.
        gene_ids: Gene identifiers to collect features for.
        feature_types: Feature types (column 3) to collect; defaults to
            ``['CDS', 'exon']``.

    Returns:
        A dict mapping every ID in ``gene_ids`` to a list of ``(start, end)``
        integer tuples, in file order (empty when nothing matched).

    A feature is assigned to a gene when its ``Parent`` is either an mRNA
    whose own ``Parent`` is a requested gene, or a requested gene directly.
    ``Parent`` may list several comma-separated IDs; a feature is recorded at
    most once per gene. mRNA rows must precede their children in the file.
    """
    if feature_types is None:
        feature_types = ['CDS', 'exon']

    target_set = set(gene_ids)
    valid_mrna = {}  # mRNA ID -> requested gene ID
    coords = {tid: [] for tid in gene_ids}

    with open(gff_file, 'r') as f:
        for line in f:
            if line.startswith('#'):
                continue
            parts = line.strip().split('\t')
            if len(parts) < 9:
                continue

            ftype = parts[2]
            if ftype != 'mRNA' and ftype not in feature_types:
                continue

            attr = parse_gff_attributes(parts[8])
            # Parent may hold several comma-separated IDs.
            parents = [p for p in attr.get('Parent', '').split(',') if p]

            if ftype == 'mRNA':
                mid = attr.get('ID')
                for pid in parents:
                    if pid in target_set:
                        valid_mrna[mid] = pid
                        break
                continue

            owners = []
            for pid in parents:
                if pid in valid_mrna:
                    gid = valid_mrna[pid]
                elif pid in target_set:
                    gid = pid
                else:
                    continue
                if gid not in owners:
                    owners.append(gid)

            if owners:
                span = (int(parts[3]), int(parts[4]))
                for gid in owners:
                    coords[gid].append(span)

    return coords
55
+
56
def read_fasta_ids(fasta_file: str) -> List[str]:
    """Return the sequence IDs of a FASTA file in file order.

    The ID is the first whitespace-delimited token after ``>``. Header lines
    with no identifier (a bare ``>``) are skipped rather than raising
    IndexError.
    """
    ids = []
    with open(fasta_file, 'r') as f:
        for line in f:
            if not line.startswith('>'):
                continue
            tokens = line[1:].split()
            if tokens:
                ids.append(tokens[0])
    return ids
63
+
64
def sanitize_fasta_ids(input_file: str, output_file: str) -> Dict[str, str]:
    """Copy a FASTA file, replacing each header with a sanitized, unique ID.

    The ID is the first whitespace-delimited token of the header; every
    character outside ``[A-Za-z0-9_-]`` becomes ``_``. If the sanitized ID was
    already produced by an earlier record, a numeric suffix (``_2``, ``_3``,
    ...) is appended so that output IDs stay unique and no mapping entry is
    overwritten. A header with no identifier is treated as ``unnamed``.

    Returns:
        A dict mapping each sanitized ID written to ``output_file`` to the
        original ID it replaced.
    """
    mapping = {}
    with open(input_file, 'r') as fin, open(output_file, 'w') as fout:
        for line in fin:
            if not line.startswith('>'):
                fout.write(line)
                continue

            tokens = line[1:].split()
            old_id = tokens[0] if tokens else 'unnamed'
            base_id = re.sub(r'[^A-Za-z0-9_-]', '_', old_id)

            # Disambiguate collisions such as "a.1" and "a_1".
            new_id = base_id
            serial = 1
            while new_id in mapping:
                serial += 1
                new_id = f'{base_id}_{serial}'

            mapping[new_id] = old_id
            fout.write(f'>{new_id}\n')
    return mapping
76
+
77
def setup_matplotlib():
    """Select the non-interactive ``Agg`` backend and return ``matplotlib.pyplot``."""
    import matplotlib

    # The backend is chosen before pyplot is imported.
    matplotlib.use('Agg')

    from matplotlib import pyplot
    return pyplot
82
+
83
def natural_sort_key(s):
    """Return a sort key that orders embedded numbers numerically.

    The string is split into alternating text and digit runs; digit runs
    become ``int`` and text runs are lower-cased, so ``chr2`` sorts before
    ``chr10``.
    """
    # isdecimal() matches the same characters as the regex \d, all of which
    # int() accepts. isdigit() also accepts characters such as superscript
    # digits, which are never split out by \d+ and make int() raise.
    return [int(t) if t.isdecimal() else t.lower() for t in re.split(r'(\d+)', s)]
85
+
86
def check_external_tool(tool_name: str, install_hint: str = None):
    """Raise RuntimeError unless ``tool_name`` can be found on PATH.

    When ``install_hint`` is given, it is appended to the error message on a
    second line as an installation suggestion.
    """
    if shutil.which(tool_name):
        return

    message_lines = [f"Error: {tool_name} not found in PATH"]
    if install_hint:
        message_lines.append(f"Install with: {install_hint}")
    raise RuntimeError("\n".join(message_lines))
92
+
93
def apply_morphology(mask, operation='close', kernel_size=5):
    """Apply a morphological operation to ``mask`` with an elliptical kernel.

    ``operation`` is one of ``'close'``, ``'open'``, ``'dilate'`` or
    ``'erode'``; any other value returns ``mask`` unchanged. The kernel is
    ``kernel_size`` x ``kernel_size``.
    """
    element = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (kernel_size, kernel_size))

    # 'close' and 'open' both go through morphologyEx with a different op code.
    compound_ops = {'close': cv2.MORPH_CLOSE, 'open': cv2.MORPH_OPEN}
    if operation in compound_ops:
        return cv2.morphologyEx(mask, compound_ops[operation], element)

    if operation == 'dilate':
        return cv2.dilate(mask, element)
    if operation == 'erode':
        return cv2.erode(mask, element)
    return mask
104
+
105
def filter_contours(contours, min_area=100, max_area=None):
    """Return the contours whose area lies within the given bounds.

    A contour is kept when its ``cv2.contourArea`` is at least ``min_area``
    and, if ``max_area`` is truthy, not greater than ``max_area``. Input order
    is preserved.
    """
    def within_bounds(area):
        too_small = area < min_area
        too_large = bool(max_area) and area > max_area
        return not (too_small or too_large)

    return [shape for shape in contours if within_bounds(cv2.contourArea(shape))]
115
+
116
def extract_roi_with_alpha(image, contour, padding=10):
    """Crop the padded bounding box of ``contour`` and return it with an alpha channel.

    The bounding rectangle is grown by ``padding`` pixels on each side and
    clipped to the image. The returned four-channel array carries the three
    colour channels of the crop plus a mask that is 255 inside the filled
    contour and 0 elsewhere. A two-dimensional (single-channel) crop is
    converted with ``COLOR_GRAY2BGR`` first.
    """
    img_h, img_w = image.shape[0], image.shape[1]
    bx, by, bw, bh = cv2.boundingRect(contour)

    left = max(0, bx - padding)
    top = max(0, by - padding)
    right = min(img_w, bx + bw + padding)
    bottom = min(img_h, by + bh + padding)

    crop = image[top:bottom, left:right].copy()
    if crop.ndim == 2:
        crop = cv2.cvtColor(crop, cv2.COLOR_GRAY2BGR)

    # Draw the contour in crop coordinates, filled (thickness -1).
    alpha = np.zeros((bottom - top, right - left), dtype=np.uint8)
    local_contour = contour - np.array([[left, top]])
    cv2.drawContours(alpha, [local_contour], -1, 255, -1)

    blue, green, red = cv2.split(crop)
    return cv2.merge([blue, green, red, alpha])
@@ -0,0 +1,76 @@
1
+ Metadata-Version: 2.4
2
+ Name: jsrc
3
+ Version: 0.1.0
4
+ Summary: Bioinformatics and phenotype analysis toolkit
5
+ Author-email: Jiaoyuan <your.email@example.com>
6
+ License: MIT
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: biopython>=1.80
11
+ Requires-Dist: matplotlib>=3.5
12
+ Requires-Dist: numpy>=1.20
13
+ Requires-Dist: opencv-python>=4.5
14
+ Requires-Dist: pandas>=1.3
15
+ Requires-Dist: pillow>=9.0
16
+ Provides-Extra: dev
17
+ Requires-Dist: pytest>=7.0; extra == "dev"
18
+ Requires-Dist: black>=22.0; extra == "dev"
19
+ Requires-Dist: ruff>=0.1; extra == "dev"
20
+ Dynamic: license-file
21
+
22
+ # jsrc
23
+
24
+ Bioinformatics and phenotype analysis toolkit
25
+
26
+ ## Installation
27
+
28
+ ```bash
29
+ pip install jsrc
30
+ ```
31
+
32
+ Or install from source:
33
+
34
+ ```bash
35
+ git clone https://github.com/imjiaoyuan/jsrc.git
36
+ cd jsrc
37
+ pip install -e .
38
+ ```
39
+
40
+ ## Commands
41
+
42
+ **Sequence Operations**
43
+
44
+ ```bash
45
+ jsrc seq extract -fa genome.fa -ids ids.txt -o output.fa
46
+ jsrc seq rename -fa input.fa -map mapping.csv -o output.fa
47
+ jsrc seq rename-by-gff -fa transcripts.fa -gff genes.gff -parent Parent -o output.fa
48
+ jsrc seq translate -fa genome.fa -gff genes.gff -id ID -o proteins.fa
49
+ ```
50
+
51
+ **Visualization**
52
+
53
+ ```bash
54
+ jsrc plot gene-structure -gff genes.gff -ids genes.txt -o gene_structure.png
55
+ jsrc plot exon-structure -gff genes.gff -ids genes.txt -o exon_structure.png
56
+ jsrc plot chromosome-map -gff genes.gff -o chromosome_map.png
57
+ jsrc plot protein-domain -tsv domains.tsv -o protein_domains.png
58
+ jsrc plot cis-element -bed elements.bed -o cis_elements.png
59
+ ```
60
+
61
+ **Analysis Tools**
62
+
63
+ ```bash
64
+ jsrc analyze phylo-tree -fa sequences.fa -o tree.nwk -a nj
65
+ jsrc analyze phylo-tree -fa sequences.fa -o tree.nwk -a ml
66
+ jsrc analyze motif -fa promoters.fa -o motif_output -nmotifs 5
67
+ ```
68
+
69
+ **Phenotype Image Analysis**
70
+
71
+ ```bash
72
+ jsrc pheno split-fruit -i fruit_image.jpg -o output_dir
73
+ jsrc pheno split-fruit-raw -i fruit_image.jpg -o output_dir
74
+ jsrc pheno split-leaf -i leaf_image.jpg -o output_dir
75
+ jsrc pheno split-leaf-edge -i leaf_image.jpg -o output_dir
76
+ ```
@@ -0,0 +1,15 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/analyze.py
5
+ src/cli.py
6
+ src/common.py
7
+ src/pheno.py
8
+ src/plot.py
9
+ src/seq.py
10
+ src/jsrc.egg-info/PKG-INFO
11
+ src/jsrc.egg-info/SOURCES.txt
12
+ src/jsrc.egg-info/dependency_links.txt
13
+ src/jsrc.egg-info/entry_points.txt
14
+ src/jsrc.egg-info/requires.txt
15
+ src/jsrc.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ jsrc = cli:main
@@ -0,0 +1,11 @@
1
+ biopython>=1.80
2
+ matplotlib>=3.5
3
+ numpy>=1.20
4
+ opencv-python>=4.5
5
+ pandas>=1.3
6
+ pillow>=9.0
7
+
8
+ [dev]
9
+ pytest>=7.0
10
+ black>=22.0
11
+ ruff>=0.1
@@ -0,0 +1,6 @@
1
+ analyze
2
+ cli
3
+ common
4
+ pheno
5
+ plot
6
+ seq
@@ -0,0 +1,142 @@
1
+ import os
2
+ import cv2
3
+ import numpy as np
4
+ from PIL import Image
5
+ from common import apply_morphology, filter_contours, extract_roi_with_alpha
6
+
7
def cmd_split_fruit(args):
    """Segment fruit regions from ``args.i`` and save each as a transparent PNG.

    The image is resized so its longer side equals ``args.size``, thresholded
    in HSV (hue 0-180, saturation and value 30-255), cleaned with a
    morphological close (15) then open (10), and every external contour with
    area >= 500 is cropped with 20 px padding and written to ``args.o`` as
    ``fruit_NNN.png``.

    Exits with status 1 (message on stderr) if the image cannot be read or
    ``args.size`` is not positive.
    """
    # Local import: the module's top-level imports do not include sys.
    import sys

    img = cv2.imread(args.i)
    if img is None:
        print(f"Error: Cannot read image {args.i}", file=sys.stderr)
        sys.exit(1)
    if args.size <= 0:
        print(f"Error: -size must be positive, got {args.size}", file=sys.stderr)
        sys.exit(1)

    # Create the output directory only once the input is known to be usable.
    os.makedirs(args.o, exist_ok=True)

    h, w = img.shape[:2]
    scale = args.size / max(h, w)
    # Keep both sides at least 1 px so extreme aspect ratios do not break resize.
    new_h, new_w = max(1, int(h * scale)), max(1, int(w * scale))
    img_resized = cv2.resize(img, (new_w, new_h))

    hsv = cv2.cvtColor(img_resized, cv2.COLOR_BGR2HSV)

    lower = np.array([0, 30, 30])
    upper = np.array([180, 255, 255])
    mask = cv2.inRange(hsv, lower, upper)

    mask = apply_morphology(mask, 'close', 15)
    mask = apply_morphology(mask, 'open', 10)

    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours = filter_contours(contours, min_area=500)

    for i, cnt in enumerate(contours):
        roi = extract_roi_with_alpha(img_resized, cnt, padding=20)

        out_path = os.path.join(args.o, f'fruit_{i:03d}.png')
        Image.fromarray(cv2.cvtColor(roi, cv2.COLOR_BGRA2RGBA)).save(out_path)

    print(f"Extracted {len(contours)} fruits to {args.o}")
40
+
41
def cmd_split_fruit_raw(args):
    """Segment fruit regions from ``args.i`` at original resolution.

    The image is thresholded in HSV (hue 0-180, saturation and value 30-255),
    cleaned with a morphological close (25) then open (15), and every external
    contour with area >= 2000 is cropped with 30 px padding and written to
    ``args.o`` as ``fruit_raw_NNN.png`` with an alpha channel.

    Exits with status 1 (message on stderr) if the image cannot be read.
    """
    # Local import: the module's top-level imports do not include sys.
    import sys

    img = cv2.imread(args.i)
    if img is None:
        print(f"Error: Cannot read image {args.i}", file=sys.stderr)
        sys.exit(1)

    # Create the output directory only once the input is known to be usable.
    os.makedirs(args.o, exist_ok=True)

    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    lower = np.array([0, 30, 30])
    upper = np.array([180, 255, 255])
    mask = cv2.inRange(hsv, lower, upper)

    mask = apply_morphology(mask, 'close', 25)
    mask = apply_morphology(mask, 'open', 15)

    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours = filter_contours(contours, min_area=2000)

    for i, cnt in enumerate(contours):
        roi = extract_roi_with_alpha(img, cnt, padding=30)

        out_path = os.path.join(args.o, f'fruit_raw_{i:03d}.png')
        Image.fromarray(cv2.cvtColor(roi, cv2.COLOR_BGRA2RGBA)).save(out_path)

    print(f"Extracted {len(contours)} fruits (original size) to {args.o}")
69
+
70
def cmd_split_leaf(args):
    """Segment leaf regions from ``args.i`` and save each as a transparent PNG.

    The image is resized so its longer side equals ``args.size``, thresholded
    in HSV (hue 35-85, saturation and value 40-255), cleaned with a
    morphological close (12) then open (8), and every external contour with
    area >= 400 is cropped with 15 px padding and written to ``args.o`` as
    ``leaf_NNN.png``.

    Exits with status 1 (message on stderr) if the image cannot be read or
    ``args.size`` is not positive.
    """
    # Local import: the module's top-level imports do not include sys.
    import sys

    img = cv2.imread(args.i)
    if img is None:
        print(f"Error: Cannot read image {args.i}", file=sys.stderr)
        sys.exit(1)
    if args.size <= 0:
        print(f"Error: -size must be positive, got {args.size}", file=sys.stderr)
        sys.exit(1)

    # Create the output directory only once the input is known to be usable.
    os.makedirs(args.o, exist_ok=True)

    h, w = img.shape[:2]
    scale = args.size / max(h, w)
    # Keep both sides at least 1 px so extreme aspect ratios do not break resize.
    new_h, new_w = max(1, int(h * scale)), max(1, int(w * scale))
    img_resized = cv2.resize(img, (new_w, new_h))

    hsv = cv2.cvtColor(img_resized, cv2.COLOR_BGR2HSV)

    lower = np.array([35, 40, 40])
    upper = np.array([85, 255, 255])
    mask = cv2.inRange(hsv, lower, upper)

    mask = apply_morphology(mask, 'close', 12)
    mask = apply_morphology(mask, 'open', 8)

    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours = filter_contours(contours, min_area=400)

    for i, cnt in enumerate(contours):
        roi = extract_roi_with_alpha(img_resized, cnt, padding=15)

        out_path = os.path.join(args.o, f'leaf_{i:03d}.png')
        Image.fromarray(cv2.cvtColor(roi, cv2.COLOR_BGRA2RGBA)).save(out_path)

    print(f"Extracted {len(contours)} leaves to {args.o}")
103
+
104
def cmd_split_leaf_edge(args):
    """Extract outline-only crops of objects found by edge detection in ``args.i``.

    The image is converted to grayscale, blurred (5x5 Gaussian) and passed
    through Canny (thresholds 50/150); the edge map is dilated (3) and closed
    (5). Each external contour with area >= 500 is drawn as a 2 px outline
    into a mask, and the bounding box (20 px padding) is written to ``args.o``
    as ``leaf_edge_NNN.png`` with that outline mask as the alpha channel.

    Exits with status 1 (message on stderr) if the image cannot be read.
    """
    # Local import: the module's top-level imports do not include sys.
    import sys

    img = cv2.imread(args.i)
    if img is None:
        print(f"Error: Cannot read image {args.i}", file=sys.stderr)
        sys.exit(1)

    # Create the output directory only once the input is known to be usable.
    os.makedirs(args.o, exist_ok=True)

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
    edges = cv2.Canny(blurred, 50, 150)

    edges = apply_morphology(edges, 'dilate', 3)
    edges = apply_morphology(edges, 'close', 5)

    contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours = filter_contours(contours, min_area=500)

    img_h, img_w = img.shape[:2]
    for i, cnt in enumerate(contours):
        # Outline only (thickness 2), so just the edge pixels are opaque.
        mask = np.zeros((img_h, img_w), dtype=np.uint8)
        cv2.drawContours(mask, [cnt], -1, 255, 2)

        x, y, w, h = cv2.boundingRect(cnt)
        x1, y1 = max(0, x - 20), max(0, y - 20)
        x2, y2 = min(img_w, x + w + 20), min(img_h, y + h + 20)

        roi_edge = mask[y1:y2, x1:x2]
        roi_img = img[y1:y2, x1:x2]

        b, g, r = cv2.split(roi_img)
        rgba = cv2.merge([b, g, r, roi_edge])

        out_path = os.path.join(args.o, f'leaf_edge_{i:03d}.png')
        Image.fromarray(cv2.cvtColor(rgba, cv2.COLOR_BGRA2RGBA)).save(out_path)

    print(f"Extracted {len(contours)} leaf edges to {args.o}")
jsrc-0.1.0/src/plot.py ADDED
@@ -0,0 +1,210 @@
1
+ import sys
2
+ import pandas as pd
3
+ from common import setup_matplotlib, get_gene_structure, parse_gff_attributes, natural_sort_key
4
+ plt = setup_matplotlib()
5
+
6
def cmd_gene_structure(args):
    """Plot the CDS layout of the genes listed in ``args.ids`` and save it to ``args.o``.

    Each gene gets one row (naturally sorted, first gene at the top): a thin
    line spanning its outermost CDS coordinates with a box for every CDS
    segment read from ``args.gff``. Genes without CDS records keep an empty
    row. The figure is saved at ``args.dpi``.
    """
    with open(args.ids, 'r') as handle:
        stripped_lines = (raw.strip() for raw in handle)
        gene_ids = [entry for entry in stripped_lines if entry]

    coords = get_gene_structure(args.gff, gene_ids, feature_types=['CDS'])

    ordered = sorted(gene_ids, key=natural_sort_key)
    n_rows = len(ordered)

    fig, ax = plt.subplots(figsize=(12, max(6, n_rows * 0.5)))

    # Rows are numbered from the top down, so the first gene gets the highest y.
    for row, gid in zip(range(n_rows - 1, -1, -1), ordered):
        segments = sorted(coords.get(gid) or [])
        if not segments:
            continue

        span_start = min(seg[0] for seg in segments)
        span_end = max(seg[1] for seg in segments)
        ax.plot([span_start, span_end], [row, row], 'k-', linewidth=1)

        for seg_start, seg_end in segments:
            box = plt.Rectangle((seg_start, row - 0.15), seg_end - seg_start, 0.3,
                                facecolor='steelblue', edgecolor='black')
            ax.add_patch(box)

    ax.set_yticks(range(n_rows))
    ax.set_yticklabels(list(reversed(ordered)))
    ax.set_xlabel('Genomic Position')
    ax.set_title('Gene Structure')

    plt.tight_layout()
    plt.savefig(args.o, dpi=args.dpi, bbox_inches='tight')
    plt.close()
    print(f"Gene structure plot saved to {args.o}")
39
+
40
def cmd_exon_structure(args):
    """Plot the exon layout of the genes listed in ``args.ids`` and save it to ``args.o``.

    Each gene gets one row (naturally sorted, first gene at the top): a thin
    line spanning its outermost exon coordinates with a box for every exon
    read from ``args.gff``. Genes without exon records keep an empty row. The
    figure is saved at ``args.dpi``.
    """
    with open(args.ids, 'r') as handle:
        stripped_lines = (raw.strip() for raw in handle)
        gene_ids = [entry for entry in stripped_lines if entry]

    coords = get_gene_structure(args.gff, gene_ids, feature_types=['exon'])

    ordered = sorted(gene_ids, key=natural_sort_key)
    n_rows = len(ordered)

    fig, ax = plt.subplots(figsize=(12, max(6, n_rows * 0.5)))

    # Rows are numbered from the top down, so the first gene gets the highest y.
    for row, gid in zip(range(n_rows - 1, -1, -1), ordered):
        exons = sorted(coords.get(gid) or [])
        if not exons:
            continue

        span_start = min(exon[0] for exon in exons)
        span_end = max(exon[1] for exon in exons)
        ax.plot([span_start, span_end], [row, row], 'k-', linewidth=1)

        for exon_start, exon_end in exons:
            box = plt.Rectangle((exon_start, row - 0.2), exon_end - exon_start, 0.4,
                                facecolor='green', edgecolor='black')
            ax.add_patch(box)

    ax.set_yticks(range(n_rows))
    ax.set_yticklabels(list(reversed(ordered)))
    ax.set_xlabel('Genomic Position')
    ax.set_title('Exon Structure')

    plt.tight_layout()
    plt.savefig(args.o, dpi=args.dpi, bbox_inches='tight')
    plt.close()
    print(f"Exon structure plot saved to {args.o}")
73
+
74
def cmd_chromosome_map(args):
    """Plot gene positions along each chromosome of ``args.gff`` and save to ``args.o``.

    Chromosome lengths come from ``##sequence-region`` pragmas when present.
    For chromosomes without a pragma, the length is the largest end
    coordinate over all of that chromosome's ``gene`` records. Each
    chromosome is drawn as a grey bar (naturally sorted, first at the top)
    with a red tick at the midpoint of every gene. Saved at ``args.dpi``.
    """
    chr_lengths = {}
    declared = set()    # chromosomes whose length came from a pragma
    gene_mids = {}      # chromosome -> gene midpoints, in file order

    with open(args.gff, 'r') as f:
        for line in f:
            if line.startswith('##sequence-region'):
                parts = line.strip().split()
                if len(parts) >= 4:
                    chr_lengths[parts[1]] = int(parts[3])
                    declared.add(parts[1])
            elif not line.startswith('#'):
                parts = line.strip().split('\t')
                if len(parts) >= 9 and parts[2] == 'gene':
                    chrom = parts[0]
                    start = int(parts[3])
                    end = int(parts[4])

                    gene_mids.setdefault(chrom, []).append((start + end) / 2)

                    # Without a declared length, grow the inferred length
                    # with every gene, not just the first one seen.
                    if chrom not in declared:
                        chr_lengths[chrom] = max(chr_lengths.get(chrom, 0), end)

    chr_sorted = sorted(chr_lengths.keys(), key=natural_sort_key)

    fig, ax = plt.subplots(figsize=(12, max(6, len(chr_sorted) * 0.5)))

    for i, chrom in enumerate(chr_sorted):
        y = len(chr_sorted) - i - 1
        chr_len = chr_lengths[chrom]

        ax.add_patch(plt.Rectangle((0, y-0.2), chr_len, 0.4, facecolor='lightgray', edgecolor='black'))

        for mid in gene_mids.get(chrom, ()):
            ax.plot([mid, mid], [y-0.15, y+0.15], 'r-', linewidth=0.5, alpha=0.5)

    ax.set_yticks(range(len(chr_sorted)))
    ax.set_yticklabels(chr_sorted[::-1])
    ax.set_xlabel('Position (bp)')
    ax.set_title('Chromosome Map')

    plt.tight_layout()
    plt.savefig(args.o, dpi=args.dpi, bbox_inches='tight')
    plt.close()
    print(f"Chromosome map saved to {args.o}")
130
+
131
def cmd_protein_domain(args):
    """Plot domain architecture for every protein in ``args.tsv`` and save to ``args.o``.

    The tab-separated input must contain the columns ``protein``, ``domain``,
    ``start`` and ``end`` (lines starting with ``#`` are ignored). Each
    protein gets one row (naturally sorted, first at the top): a backbone
    line from 0 to its largest domain end, with a labelled box per domain.
    Saved at ``args.dpi``.

    Exits with status 1 (message on stderr) if a required column is missing.
    """
    df = pd.read_csv(args.tsv, sep='\t', comment='#')

    required_cols = ['protein', 'domain', 'start', 'end']
    if not all(col in df.columns for col in required_cols):
        print(f"Error: TSV must have columns: {', '.join(required_cols)}", file=sys.stderr)
        sys.exit(1)

    # pandas infers numeric dtypes for purely numeric identifiers, which the
    # regex-based natural sort key cannot handle; treat identifiers as text.
    df['protein'] = df['protein'].astype(str)

    proteins = sorted(df['protein'].unique(), key=natural_sort_key)

    fig, ax = plt.subplots(figsize=(12, max(6, len(proteins) * 0.5)))

    for i, prot in enumerate(proteins):
        y = len(proteins) - i - 1
        prot_domains = df[df['protein'] == prot]

        max_pos = prot_domains['end'].max()
        ax.plot([0, max_pos], [y, y], 'k-', linewidth=2)

        domain_rows = zip(prot_domains['start'], prot_domains['end'], prot_domains['domain'])
        for start, end, domain in domain_rows:
            ax.add_patch(plt.Rectangle((start, y-0.2), end-start, 0.4,
                                       facecolor='orange', edgecolor='black', alpha=0.7))
            ax.text((start+end)/2, y, domain, ha='center', va='center', fontsize=8)

    ax.set_yticks(range(len(proteins)))
    ax.set_yticklabels(proteins[::-1])
    ax.set_xlabel('Position (aa)')
    ax.set_title('Protein Domain Architecture')

    plt.tight_layout()
    plt.savefig(args.o, dpi=args.dpi, bbox_inches='tight')
    plt.close()
    print(f"Protein domain plot saved to {args.o}")
168
+
169
def cmd_cis_element(args):
    """Plot named elements from the BED file ``args.bed`` per sequence and save to ``args.o``.

    Tab-separated lines with at least four fields (sequence, start, end,
    name) are used; blank lines and lines starting with ``#`` are skipped.
    Each sequence gets one row (naturally sorted, first at the top) with a
    baseline from 0 to its largest element end and a labelled tick at the
    midpoint of every element. Saved at ``args.dpi``.
    """
    elements = []
    with open(args.bed, 'r') as handle:
        for raw in handle:
            if raw.startswith('#') or not raw.strip():
                continue
            fields = raw.strip().split('\t')
            if len(fields) < 4:
                continue
            elements.append({
                'chr': fields[0],
                'start': int(fields[1]),
                'end': int(fields[2]),
                'name': fields[3]
            })

    chromosomes = sorted(set(e['chr'] for e in elements), key=natural_sort_key)
    n_rows = len(chromosomes)

    # Group once so each row does not rescan the full element list.
    by_chrom = {}
    for elem in elements:
        by_chrom.setdefault(elem['chr'], []).append(elem)

    fig, ax = plt.subplots(figsize=(12, max(6, n_rows * 0.5)))

    for i, chrom in enumerate(chromosomes):
        y = n_rows - i - 1
        chr_elements = by_chrom.get(chrom, [])
        if not chr_elements:
            continue

        max_pos = max(e['end'] for e in chr_elements)
        ax.plot([0, max_pos], [y, y], 'k-', linewidth=1)

        for elem in chr_elements:
            mid = (elem['start'] + elem['end']) / 2
            ax.plot([mid, mid], [y-0.3, y+0.3], 'b-', linewidth=2)
            ax.text(mid, y+0.35, elem['name'], ha='center', fontsize=7, rotation=45)

    ax.set_yticks(range(n_rows))
    ax.set_yticklabels(chromosomes[::-1])
    ax.set_xlabel('Position (bp)')
    ax.set_title('Cis-regulatory Elements')

    plt.tight_layout()
    plt.savefig(args.o, dpi=args.dpi, bbox_inches='tight')
    plt.close()
    print(f"Cis-element plot saved to {args.o}")
jsrc-0.1.0/src/seq.py ADDED
@@ -0,0 +1,163 @@
1
+ import sys
2
+ import csv
3
+ import re
4
+ from Bio import SeqIO
5
+ from Bio.Seq import Seq
6
+ from Bio.SeqRecord import SeqRecord
7
+
8
def cmd_extract(args):
    """Extract the sequences named in an ID list from a FASTA file.

    IDs are compared case-insensitively and with everything from the first
    ``.`` onward removed, using only the first whitespace-delimited word of
    each FASTA header. When several records match the same ID, the longest
    sequence is kept (the first one wins on ties). Output records are written
    in ID-list order, each as a header line carrying the ID as spelled in the
    ID file followed by the sequence on a single line.

    Args:
        args: Namespace providing
            ``ids`` - path to a text file with one ID per line (blank lines
            are ignored),
            ``fa``  - path to the input FASTA file,
            ``o``   - path of the FASTA file to write.
    """
    def normalize(identifier):
        # Lower-case and cut at the first '.', e.g. "Gene1.2" -> "gene1".
        return identifier.lower().split('.')[0]

    with open(args.ids, 'r') as f:
        raw_ids = [line.strip() for line in f if line.strip()]
    # normalized ID -> spelling from the ID file (last spelling wins).
    target_ids = {normalize(rid): rid for rid in raw_ids}

    # normalized ID -> longest matching sequence seen so far.
    found_records = {}

    def save_record(clean_id, seq_lines):
        if clean_id is None:
            return
        seq = ''.join(seq_lines)
        if clean_id not in found_records or len(seq) > len(found_records[clean_id]):
            found_records[clean_id] = seq

    with open(args.fa, 'r') as infile:
        current_id = None  # normalized ID of the record being read, if wanted
        current_seq_lines = []

        for line in infile:
            line = line.strip()
            if not line:
                continue
            if line.startswith('>'):
                save_record(current_id, current_seq_lines)
                # A bare ">" header has no ID word; treat it as not wanted
                # instead of failing on an empty split().
                words = line[1:].split()
                clean_id = normalize(words[0]) if words else None
                current_id = clean_id if clean_id in target_ids else None
                current_seq_lines = []
            elif current_id is not None:
                # Only buffer sequence for records that can be written out.
                current_seq_lines.append(line)

        save_record(current_id, current_seq_lines)

    with open(args.o, 'w') as outfile:
        for clean_id, original_id in target_ids.items():
            if clean_id in found_records:
                outfile.write(f">{original_id}\n{found_records[clean_id]}\n")

    print(f"Extracted {len(found_records)}/{len(target_ids)} sequences to {args.o}")
55
+
56
def cmd_rename(args):
    """Rewrite FASTA headers according to an ``old_id,new_id`` CSV mapping.

    A header is replaced by ``>new_id`` when its first whitespace-delimited
    word is a key of the mapping; any description after the ID is dropped for
    renamed records. All other lines are copied through unchanged.

    Args:
        args: Namespace providing
            ``map`` - path to a CSV file; rows with fewer than two columns are
            ignored, and both columns are stripped of surrounding whitespace,
            ``fa``  - path to the input FASTA file,
            ``o``   - path of the FASTA file to write.
    """
    mapping = {}
    # newline='' lets the csv module handle line endings itself.
    with open(args.map, 'r', newline='') as f:
        for row in csv.reader(f):
            if len(row) >= 2:
                mapping[row[0].strip()] = row[1].strip()

    renamed = 0
    with open(args.fa, 'r') as fin, open(args.o, 'w') as fout:
        for line in fin:
            if line.startswith('>'):
                # A bare ">" header has no ID word; copy it through as-is.
                words = line[1:].split()
                new_id = mapping.get(words[0]) if words else None
                if new_id is not None:
                    fout.write(f">{new_id}\n")
                    renamed += 1
                    continue
            fout.write(line)

    # Report headers actually rewritten, not the size of the mapping table.
    print(f"Renamed {renamed} sequences to {args.o}")
76
+
77
def cmd_rename_by_gff(args):
    """Rewrite FASTA headers using an ID mapping taken from GFF ``mRNA`` rows.

    For every GFF line with at least nine tab-separated columns whose type
    column is ``mRNA``, the ``ID`` attribute is mapped to the attribute named
    by ``args.parent``. A FASTA header is replaced by ``>mapped_value`` when
    its first whitespace-delimited word is such an ID; any description after
    the ID is dropped for renamed records. All other lines are copied through
    unchanged.

    Args:
        args: Namespace providing
            ``gff``    - path to the GFF file,
            ``parent`` - attribute key whose value becomes the new name,
            ``fa``     - path to the input FASTA file,
            ``o``      - path of the FASTA file to write.
    """
    # NOTE(review): parse_gff_attributes is assumed to return a mapping that
    # supports .get(); confirm against the `common` module.
    from common import parse_gff_attributes

    mapping = {}
    with open(args.gff, 'r') as f:
        for line in f:
            if line.startswith('#'):
                continue
            parts = line.strip().split('\t')
            if len(parts) < 9 or parts[2] != 'mRNA':
                continue

            attr = parse_gff_attributes(parts[8])
            tid = attr.get('ID')
            pid = attr.get(args.parent)
            if tid and pid:
                mapping[tid] = pid

    renamed = 0
    with open(args.fa, 'r') as fin, open(args.o, 'w') as fout:
        for line in fin:
            if line.startswith('>'):
                # A bare ">" header has no ID word; copy it through as-is.
                words = line[1:].split()
                new_id = mapping.get(words[0]) if words else None
                if new_id is not None:
                    fout.write(f">{new_id}\n")
                    renamed += 1
                    continue
            fout.write(line)

    # Report headers actually rewritten, not the size of the mapping table.
    print(f"Renamed {renamed} transcripts to {args.o}")
109
+
110
def cmd_translate(args):
    """Translate the CDS features of a GFF file into protein sequences.

    CDS rows are grouped by the attribute named by ``args.id``. For each
    group the CDS intervals are sorted by coordinate, cut from the genome
    sequence and concatenated; groups on the ``-`` strand are
    reverse-complemented. The result is translated up to the first stop
    codon, and non-empty proteins are written as FASTA.

    Args:
        args: Namespace providing
            ``fa``  - path to the genome FASTA file,
            ``gff`` - path to the GFF file,
            ``id``  - attribute key used to group CDS rows,
            ``o``   - path of the protein FASTA file to write.
    """
    # NOTE(review): parse_gff_attributes is assumed to return a mapping that
    # supports .get(); confirm against the `common` module.
    from common import parse_gff_attributes

    genome = SeqIO.to_dict(SeqIO.parse(args.fa, 'fasta'))

    # group ID -> chromosome and strand of its first CDS row, plus all intervals.
    cds_dict = {}
    with open(args.gff, 'r') as gff_handle:
        for raw_line in gff_handle:
            if raw_line.startswith('#'):
                continue
            fields = raw_line.strip().split('\t')
            if not (len(fields) >= 9 and fields[2] == 'CDS'):
                continue

            chrom = fields[0]
            # Convert the GFF start/end columns to 0-based half-open slice bounds.
            begin = int(fields[3]) - 1
            stop = int(fields[4])
            strand = fields[6]

            gene_id = parse_gff_attributes(fields[8]).get(args.id)
            if gene_id and chrom in genome:
                entry = cds_dict.setdefault(
                    gene_id, {'chrom': chrom, 'strand': strand, 'regions': []}
                )
                entry['regions'].append((begin, stop))

    proteins = []
    for gene_id, info in cds_dict.items():
        chrom_seq = genome[info['chrom']].seq

        # Join the intervals in ascending coordinate order.
        cds_seq = Seq('')
        for begin, stop in sorted(info['regions']):
            cds_seq = cds_seq + chrom_seq[begin:stop]

        if info['strand'] == '-':
            cds_seq = cds_seq.reverse_complement()

        try:
            protein_seq = cds_seq.translate(to_stop=True)
            if len(protein_seq) > 0:
                record = SeqRecord(protein_seq, id=gene_id, description='')
                proteins.append(record)
        except Exception as e:
            # Best effort: report the failing group and carry on with the rest.
            print(f"Warning: Failed to translate {gene_id}: {e}", file=sys.stderr)

    SeqIO.write(proteins, args.o, 'fasta')
    print(f"Translated {len(proteins)} genes to {args.o}")