jsrc 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jsrc-0.1.0/LICENSE +21 -0
- jsrc-0.1.0/PKG-INFO +76 -0
- jsrc-0.1.0/README.md +55 -0
- jsrc-0.1.0/pyproject.toml +35 -0
- jsrc-0.1.0/setup.cfg +4 -0
- jsrc-0.1.0/src/analyze.py +56 -0
- jsrc-0.1.0/src/cli.py +140 -0
- jsrc-0.1.0/src/common.py +135 -0
- jsrc-0.1.0/src/jsrc.egg-info/PKG-INFO +76 -0
- jsrc-0.1.0/src/jsrc.egg-info/SOURCES.txt +15 -0
- jsrc-0.1.0/src/jsrc.egg-info/dependency_links.txt +1 -0
- jsrc-0.1.0/src/jsrc.egg-info/entry_points.txt +2 -0
- jsrc-0.1.0/src/jsrc.egg-info/requires.txt +11 -0
- jsrc-0.1.0/src/jsrc.egg-info/top_level.txt +6 -0
- jsrc-0.1.0/src/pheno.py +142 -0
- jsrc-0.1.0/src/plot.py +210 -0
- jsrc-0.1.0/src/seq.py +163 -0
jsrc-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 JiaoYuan
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
jsrc-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: jsrc
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Bioinformatics and phenotype analysis toolkit
|
|
5
|
+
Author-email: Jiaoyuan <your.email@example.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: biopython>=1.80
|
|
11
|
+
Requires-Dist: matplotlib>=3.5
|
|
12
|
+
Requires-Dist: numpy>=1.20
|
|
13
|
+
Requires-Dist: opencv-python>=4.5
|
|
14
|
+
Requires-Dist: pandas>=1.3
|
|
15
|
+
Requires-Dist: pillow>=9.0
|
|
16
|
+
Provides-Extra: dev
|
|
17
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
18
|
+
Requires-Dist: black>=22.0; extra == "dev"
|
|
19
|
+
Requires-Dist: ruff>=0.1; extra == "dev"
|
|
20
|
+
Dynamic: license-file
|
|
21
|
+
|
|
22
|
+
# jsrc
|
|
23
|
+
|
|
24
|
+
Bioinformatics and phenotype analysis toolkit
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install jsrc
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
Or install from source:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
git clone https://github.com/imjiaoyuan/jsrc.git
|
|
36
|
+
cd jsrc
|
|
37
|
+
pip install -e .
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Commands
|
|
41
|
+
|
|
42
|
+
**Sequence Operations**
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
jsrc seq extract -fa genome.fa -ids ids.txt -o output.fa
|
|
46
|
+
jsrc seq rename -fa input.fa -map mapping.csv -o output.fa
|
|
47
|
+
jsrc seq rename-by-gff -fa transcripts.fa -gff genes.gff -parent Parent -o output.fa
|
|
48
|
+
jsrc seq translate -fa genome.fa -gff genes.gff -id ID -o proteins.fa
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
**Visualization**
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
jsrc plot gene-structure -gff genes.gff -ids genes.txt -o gene_structure.png
|
|
55
|
+
jsrc plot exon-structure -gff genes.gff -ids genes.txt -o exon_structure.png
|
|
56
|
+
jsrc plot chromosome-map -gff genes.gff -o chromosome_map.png
|
|
57
|
+
jsrc plot protein-domain -tsv domains.tsv -o protein_domains.png
|
|
58
|
+
jsrc plot cis-element -bed elements.bed -o cis_elements.png
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
**Analysis Tools**
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
jsrc analyze phylo-tree -fa sequences.fa -o tree.nwk -a nj
|
|
65
|
+
jsrc analyze phylo-tree -fa sequences.fa -o tree.nwk -a ml
|
|
66
|
+
jsrc analyze motif -fa promoters.fa -o motif_output -nmotifs 5
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
**Phenotype Image Analysis**
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
jsrc pheno split-fruit -i fruit_image.jpg -o output_dir
|
|
73
|
+
jsrc pheno split-fruit-raw -i fruit_image.jpg -o output_dir
|
|
74
|
+
jsrc pheno split-leaf -i leaf_image.jpg -o output_dir
|
|
75
|
+
jsrc pheno split-leaf-edge -i leaf_image.jpg -o output_dir
|
|
76
|
+
```
|
jsrc-0.1.0/README.md
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# jsrc
|
|
2
|
+
|
|
3
|
+
Bioinformatics and phenotype analysis toolkit
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install jsrc
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Or install from source:
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
git clone https://github.com/imjiaoyuan/jsrc.git
|
|
15
|
+
cd jsrc
|
|
16
|
+
pip install -e .
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Commands
|
|
20
|
+
|
|
21
|
+
**Sequence Operations**
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
jsrc seq extract -fa genome.fa -ids ids.txt -o output.fa
|
|
25
|
+
jsrc seq rename -fa input.fa -map mapping.csv -o output.fa
|
|
26
|
+
jsrc seq rename-by-gff -fa transcripts.fa -gff genes.gff -parent Parent -o output.fa
|
|
27
|
+
jsrc seq translate -fa genome.fa -gff genes.gff -id ID -o proteins.fa
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
**Visualization**
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
jsrc plot gene-structure -gff genes.gff -ids genes.txt -o gene_structure.png
|
|
34
|
+
jsrc plot exon-structure -gff genes.gff -ids genes.txt -o exon_structure.png
|
|
35
|
+
jsrc plot chromosome-map -gff genes.gff -o chromosome_map.png
|
|
36
|
+
jsrc plot protein-domain -tsv domains.tsv -o protein_domains.png
|
|
37
|
+
jsrc plot cis-element -bed elements.bed -o cis_elements.png
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
**Analysis Tools**
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
jsrc analyze phylo-tree -fa sequences.fa -o tree.nwk -a nj
|
|
44
|
+
jsrc analyze phylo-tree -fa sequences.fa -o tree.nwk -a ml
|
|
45
|
+
jsrc analyze motif -fa promoters.fa -o motif_output -nmotifs 5
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
**Phenotype Image Analysis**
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
jsrc pheno split-fruit -i fruit_image.jpg -o output_dir
|
|
52
|
+
jsrc pheno split-fruit-raw -i fruit_image.jpg -o output_dir
|
|
53
|
+
jsrc pheno split-leaf -i leaf_image.jpg -o output_dir
|
|
54
|
+
jsrc pheno split-leaf-edge -i leaf_image.jpg -o output_dir
|
|
55
|
+
```
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "jsrc"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Bioinformatics and phenotype analysis toolkit"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.10"
|
|
7
|
+
license = {text = "MIT"}
|
|
8
|
+
authors = [
|
|
9
|
+
{name = "Jiaoyuan", email = "your.email@example.com"}
|
|
10
|
+
]
|
|
11
|
+
|
|
12
|
+
dependencies = [
|
|
13
|
+
"biopython>=1.80",
|
|
14
|
+
"matplotlib>=3.5",
|
|
15
|
+
"numpy>=1.20",
|
|
16
|
+
"opencv-python>=4.5",
|
|
17
|
+
"pandas>=1.3",
|
|
18
|
+
"pillow>=9.0",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
[project.optional-dependencies]
|
|
22
|
+
dev = ["pytest>=7.0", "black>=22.0", "ruff>=0.1"]
|
|
23
|
+
|
|
24
|
+
[project.scripts]
|
|
25
|
+
jsrc = "cli:main"
|
|
26
|
+
|
|
27
|
+
[build-system]
|
|
28
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
29
|
+
build-backend = "setuptools.build_meta"
|
|
30
|
+
|
|
31
|
+
[tool.setuptools]
|
|
32
|
+
py-modules = ["cli", "common", "seq", "plot", "analyze", "pheno"]
|
|
33
|
+
|
|
34
|
+
[tool.setuptools.package-dir]
|
|
35
|
+
"" = "src"
|
jsrc-0.1.0/setup.cfg
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import os
|
|
3
|
+
import tempfile
|
|
4
|
+
from Bio import SeqIO, AlignIO, Phylo
|
|
5
|
+
from common import sanitize_fasta_ids, check_external_tool
|
|
6
|
+
|
|
7
|
+
def cmd_phylo_tree(args):
    """Align the sequences in ``args.fa`` with MAFFT and write a Newick tree.

    ``args.a == 'nj'`` builds a neighbor-joining tree with Biopython from an
    identity distance matrix; any other value runs FastTree (``-nt``) on the
    alignment. Sequence IDs are sanitized before alignment and the original
    IDs are restored on the tree before it is written to ``args.o``.

    External tools are started with argument lists (no shell), so paths that
    contain spaces or shell metacharacters are passed through verbatim. If a
    tool exits with a non-zero status, an error is printed to stderr and the
    process exits with status 1 instead of continuing with an empty file.
    """
    import subprocess

    def _run_to_file(cmd, out_path, label):
        # Run *cmd* with stdout redirected to *out_path*; abort on failure.
        with open(out_path, 'w') as out:
            proc = subprocess.run(cmd, stdout=out)
        if proc.returncode != 0:
            print(f"Error: {label} failed with exit code {proc.returncode}",
                  file=sys.stderr)
            sys.exit(1)

    check_external_tool('mafft', 'conda install -c bioconda mafft')

    if args.a == 'ml':
        check_external_tool('FastTree', 'conda install -c bioconda fasttree')

    with tempfile.TemporaryDirectory() as tmpdir:
        clean_fa = os.path.join(tmpdir, 'clean.fa')
        aln_fa = os.path.join(tmpdir, 'aligned.fa')

        # id_map maps sanitized ID -> original ID.
        id_map = sanitize_fasta_ids(args.fa, clean_fa)

        _run_to_file(['mafft', '--auto', '--quiet', clean_fa], aln_fa, 'MAFFT')

        if args.a == 'nj':
            from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor

            alignment = AlignIO.read(aln_fa, 'fasta')
            calculator = DistanceCalculator('identity')
            dm = calculator.get_distance(alignment)
            constructor = DistanceTreeConstructor(calculator)
            tree = constructor.nj(dm)
        else:
            tree_file = os.path.join(tmpdir, 'tree.nwk')
            _run_to_file(['FastTree', '-nt', '-quiet', aln_fa], tree_file, 'FastTree')
            tree = Phylo.read(tree_file, 'newick')

    # Restore the original sequence IDs on every named clade.
    for clade in tree.find_clades():
        if clade.name and clade.name in id_map:
            clade.name = id_map[clade.name]

    Phylo.write(tree, args.o, 'newick')

    print(f"Phylogenetic tree ({args.a}) saved to {args.o}")
|
|
42
|
+
|
|
43
|
+
def cmd_motif(args):
    """Run MEME motif discovery on ``args.fa`` and store results in ``args.o``.

    The output directory is created if needed. MEME is invoked in DNA mode
    with the ``zoops`` model, using ``args.nmotifs``, ``args.minw`` and
    ``args.maxw``. The command is passed as an argument list (no shell), so
    paths containing spaces or shell metacharacters are handled correctly.

    On a non-zero MEME exit status the real exit code is reported on stderr
    and the process exits with status 1.
    """
    import subprocess

    check_external_tool('meme', 'conda install -c bioconda meme')

    os.makedirs(args.o, exist_ok=True)

    cmd = [
        'meme', args.fa,
        '-dna',
        '-oc', args.o,
        '-nmotifs', str(args.nmotifs),
        '-minw', str(args.minw),
        '-maxw', str(args.maxw),
        '-mod', 'zoops',
    ]

    # returncode is the child's actual exit status, unlike os.system()'s
    # encoded wait status.
    ret = subprocess.run(cmd).returncode

    if ret != 0:
        print(f"Error: MEME failed with exit code {ret}", file=sys.stderr)
        sys.exit(1)

    print(f"Motif analysis complete. Results in {args.o}")
|
jsrc-0.1.0/src/cli.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import sys
|
|
3
|
+
import seq, plot, analyze, pheno
|
|
4
|
+
|
|
5
|
+
def _add_command(sub, name, help_text, func, required, int_options=()):
    """Register one leaf sub-command on *sub* and return its parser.

    ``required`` is a sequence of ``(flag, help)`` pairs added as mandatory
    options; ``int_options`` is a sequence of ``(flag, default, help)``
    triples added as optional integer options. ``func`` is stored as the
    ``func`` default so ``main`` can dispatch to it.
    """
    p = sub.add_parser(name, help=help_text)
    for flag, flag_help in required:
        p.add_argument(flag, required=True, help=flag_help)
    for flag, default, flag_help in int_options:
        p.add_argument(flag, type=int, default=default, help=flag_help)
    p.set_defaults(func=func)
    return p


def _package_version():
    """Return the installed ``jsrc`` version, or '0.1.0' when not installed."""
    from importlib.metadata import PackageNotFoundError, version

    try:
        return version('jsrc')
    except PackageNotFoundError:
        # Running from a source checkout without installed metadata.
        return '0.1.0'


def main():
    """Parse the ``jsrc`` command line and dispatch to the selected command.

    With no command, or a command group given without a sub-command, the
    relevant help text is printed and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(
        prog='jsrc',
        description='Bioinformatics and phenotype analysis toolkit'
    )
    # Report the packaged version rather than a hard-coded string.
    parser.add_argument('--version', action='version', version=_package_version())

    subparsers = parser.add_subparsers(dest='command', help='Available commands')

    # Option descriptions shared by several sub-commands.
    fa_in = ('-fa', 'Input FASTA file')
    gff = ('-gff', 'GFF annotation file')
    ids = ('-ids', 'Gene ID list file')
    fa_out = ('-o', 'Output FASTA file')
    png_out = ('-o', 'Output PNG file')
    dir_out = ('-o', 'Output directory')
    image_in = ('-i', 'Input image file')
    dpi = ('-dpi', 300, 'DPI (default: 300)')
    size = ('-size', 800, 'Target size (default: 800)')

    seq_parser = subparsers.add_parser('seq', help='Sequence operations')
    seq_sub = seq_parser.add_subparsers(dest='seq_cmd')
    _add_command(seq_sub, 'extract', 'Extract sequences by ID list', seq.cmd_extract,
                 [fa_in, ('-ids', 'ID list file (one per line)'), fa_out])
    _add_command(seq_sub, 'rename', 'Rename sequences using mapping file', seq.cmd_rename,
                 [fa_in, ('-map', 'ID mapping file (CSV: old,new)'), fa_out])
    _add_command(seq_sub, 'rename-by-gff', 'Rename sequences based on GFF',
                 seq.cmd_rename_by_gff,
                 [fa_in, gff, ('-parent', 'Parent attribute field name'), fa_out])
    _add_command(seq_sub, 'translate', 'Extract CDS and translate to protein',
                 seq.cmd_translate,
                 [('-fa', 'Genome FASTA file'), gff, ('-id', 'Gene ID field in GFF'),
                  ('-o', 'Output protein FASTA')])

    plot_parser = subparsers.add_parser('plot', help='Visualization')
    plot_sub = plot_parser.add_subparsers(dest='plot_cmd')
    _add_command(plot_sub, 'gene-structure', 'Plot gene structure diagram',
                 plot.cmd_gene_structure, [gff, ids, png_out], [dpi])
    _add_command(plot_sub, 'exon-structure', 'Plot exon structure',
                 plot.cmd_exon_structure, [gff, ids, png_out], [dpi])
    _add_command(plot_sub, 'chromosome-map', 'Plot chromosome map',
                 plot.cmd_chromosome_map, [gff, png_out], [dpi])
    _add_command(plot_sub, 'protein-domain', 'Plot protein domain architecture',
                 plot.cmd_protein_domain, [('-tsv', 'Domain TSV file'), png_out], [dpi])
    _add_command(plot_sub, 'cis-element', 'Plot cis-regulatory elements',
                 plot.cmd_cis_element, [('-bed', 'BED file with elements'), png_out], [dpi])

    analyze_parser = subparsers.add_parser('analyze', help='Analysis tools')
    analyze_sub = analyze_parser.add_subparsers(dest='analyze_cmd')
    p = _add_command(analyze_sub, 'phylo-tree', 'Build phylogenetic tree',
                     analyze.cmd_phylo_tree, [fa_in, ('-o', 'Output tree file')])
    p.add_argument('-a', choices=['nj', 'ml'], default='nj', help='Algorithm (default: nj)')
    _add_command(analyze_sub, 'motif', 'Find motifs using MEME', analyze.cmd_motif,
                 [fa_in, dir_out],
                 [('-nmotifs', 5, 'Number of motifs (default: 5)'),
                  ('-minw', 6, 'Min motif width (default: 6)'),
                  ('-maxw', 50, 'Max motif width (default: 50)')])

    pheno_parser = subparsers.add_parser('pheno', help='Phenotype image analysis')
    pheno_sub = pheno_parser.add_subparsers(dest='pheno_cmd')
    _add_command(pheno_sub, 'split-fruit', 'Segment fruit objects',
                 pheno.cmd_split_fruit, [image_in, dir_out], [size])
    _add_command(pheno_sub, 'split-fruit-raw', 'Segment fruit without resizing',
                 pheno.cmd_split_fruit_raw, [image_in, dir_out])
    _add_command(pheno_sub, 'split-leaf', 'Segment leaf objects',
                 pheno.cmd_split_leaf, [image_in, dir_out], [size])
    _add_command(pheno_sub, 'split-leaf-edge', 'Extract leaf edges',
                 pheno.cmd_split_leaf_edge, [image_in, dir_out])

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        sys.exit(1)

    if hasattr(args, 'func'):
        args.func(args)
        return

    # A command group was named without a sub-command: show that group's help.
    group_parsers = {
        'seq': seq_parser,
        'plot': plot_parser,
        'analyze': analyze_parser,
        'pheno': pheno_parser,
    }
    group_parsers[args.command].print_help()
    sys.exit(1)


if __name__ == '__main__':
    main()
|
jsrc-0.1.0/src/common.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import os
|
|
3
|
+
import shutil
|
|
4
|
+
from typing import Dict, List, Tuple, Optional
|
|
5
|
+
import cv2
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
def parse_gff_attributes(attr_string: str) -> Dict[str, str]:
    """Parse a GFF/GTF attribute column into a ``{key: value}`` dict.

    Items are separated by ``;``. An item containing ``=`` is split on the
    first ``=`` (GFF3 style); otherwise it is split on the first run of
    whitespace (GTF style, e.g. ``gene_id "G1"``). Whitespace around keys
    and values is removed, as are double quotes around values. Items that
    yield no key/value pair are ignored.
    """
    attrs = {}
    for item in attr_string.strip().strip(';').split(';'):
        item = item.strip()
        if not item:
            continue
        if '=' in item:
            key, value = item.split('=', 1)
            # Strip both sides so "ID = g1" yields {'ID': 'g1'}, not {'ID ': ' g1'}.
            attrs[key.strip()] = value.strip().strip('"')
        else:
            # Any whitespace (space or tab) may separate a GTF key and value.
            parts = item.split(None, 1)
            if len(parts) == 2:
                attrs[parts[0]] = parts[1].strip('"')
    return attrs
|
|
19
|
+
|
|
20
|
+
def get_gene_structure(gff_file: str, gene_ids: List[str],
                       feature_types: Optional[List[str]] = None) -> Dict:
    """Collect feature coordinates for the requested genes from a GFF file.

    Args:
        gff_file: Path to a tab-separated GFF file.
        gene_ids: Gene IDs to collect features for.
        feature_types: Feature types (column 3) to collect; defaults to
            ``['CDS', 'exon']``.

    Returns:
        A dict mapping every ID in ``gene_ids`` to a list of ``(start, end)``
        integer tuples taken from columns 4 and 5, in file order.

    A feature is assigned to a gene when its ``Parent`` is either a
    previously seen ``mRNA`` whose own ``Parent`` is a requested gene, or a
    requested gene directly. ``Parent`` may hold several comma-separated IDs
    (as GFF3 allows); a feature is added at most once per gene even if
    several of its parents resolve to that gene.
    """
    if feature_types is None:
        feature_types = ['CDS', 'exon']

    target_set = set(gene_ids)
    valid_mrna = {}  # mRNA ID -> requested gene ID
    coords = {tid: [] for tid in gene_ids}

    with open(gff_file, 'r') as f:
        for line in f:
            if line.startswith('#'):
                continue
            parts = line.strip().split('\t')
            if len(parts) < 9:
                continue

            ftype = parts[2]
            # Skip attribute parsing for feature types nobody asked for.
            if ftype != 'mRNA' and ftype not in feature_types:
                continue

            attr = parse_gff_attributes(parts[8])
            parents = [p.strip() for p in attr.get('Parent', '').split(',') if p.strip()]

            if ftype == 'mRNA':
                mid = attr.get('ID')
                # An mRNA without an ID cannot be referenced by children;
                # registering it under None would capture parent-less features.
                if mid is None:
                    continue
                for pid in parents:
                    if pid in target_set:
                        valid_mrna[mid] = pid
                        break
            else:
                owners = []
                for pid in parents:
                    gid = valid_mrna.get(pid)
                    if gid is None and pid in target_set:
                        gid = pid
                    if gid is not None and gid not in owners:
                        owners.append(gid)
                if owners:
                    span = (int(parts[3]), int(parts[4]))
                    for gid in owners:
                        coords[gid].append(span)

    return coords
|
|
55
|
+
|
|
56
|
+
def read_fasta_ids(fasta_file: str) -> List[str]:
    """Return the sequence IDs of a FASTA file in file order.

    The ID is the first whitespace-delimited token after ``>`` on each
    header line.
    """
    with open(fasta_file, 'r') as handle:
        return [row[1:].split()[0] for row in handle if row.startswith('>')]
|
|
63
|
+
|
|
64
|
+
def sanitize_fasta_ids(input_file: str, output_file: str) -> Dict[str, str]:
    """Copy a FASTA file, rewriting each header to a tool-safe unique ID.

    The ID (first whitespace-delimited token of a header) has every character
    outside ``[A-Za-z0-9_-]`` replaced by ``_``; the rest of the header is
    dropped. Sequence lines are copied unchanged.

    If a sanitized ID was already produced for an earlier record, ``_2``,
    ``_3``, ... is appended until it is unique, so distinct records never
    share an output ID and no mapping entry is overwritten.

    Returns:
        A dict mapping each sanitized ID written to ``output_file`` to the
        original ID it replaced.
    """
    mapping = {}
    with open(input_file, 'r') as fin, open(output_file, 'w') as fout:
        for line in fin:
            if not line.startswith('>'):
                fout.write(line)
                continue

            old_id = line[1:].strip().split()[0]
            base_id = re.sub(r'[^A-Za-z0-9_-]', '_', old_id)

            # e.g. "a.1" and "a_1" both sanitize to "a_1"; disambiguate.
            new_id = base_id
            suffix = 1
            while new_id in mapping:
                suffix += 1
                new_id = f'{base_id}_{suffix}'

            mapping[new_id] = old_id
            fout.write(f'>{new_id}\n')
    return mapping
|
|
76
|
+
|
|
77
|
+
def setup_matplotlib():
    """Select matplotlib's 'Agg' backend and return the pyplot module."""
    import matplotlib as mpl
    # The backend is chosen before pyplot is imported.
    mpl.use('Agg')
    from matplotlib import pyplot
    return pyplot
|
|
82
|
+
|
|
83
|
+
def natural_sort_key(s):
    """Return a sort key that orders embedded numbers numerically.

    The string is split on runs of decimal digits; digit runs become ``int``
    and the text between them is lower-cased, so ``'g2'`` sorts before
    ``'g10'``.
    """
    # re.split with a capturing group alternates text, digits, text, ...,
    # so odd positions are exactly the matched digit runs. Using the
    # position (rather than str.isdigit) avoids calling int() on characters
    # such as superscript digits that isdigit() accepts but int() rejects.
    pieces = re.split(r'(\d+)', s)
    return [int(t) if i % 2 else t.lower() for i, t in enumerate(pieces)]
|
|
85
|
+
|
|
86
|
+
def check_external_tool(tool_name: str, install_hint: str = None):
    """Raise ``RuntimeError`` unless *tool_name* is found on PATH.

    When *install_hint* is given, it is appended to the error message on a
    second line.
    """
    if shutil.which(tool_name):
        return

    message_lines = [f"Error: {tool_name} not found in PATH"]
    if install_hint:
        message_lines.append(f"Install with: {install_hint}")
    raise RuntimeError("\n".join(message_lines))
|
|
92
|
+
|
|
93
|
+
def apply_morphology(mask, operation='close', kernel_size=5):
    """Apply one morphological operation to *mask* with an elliptical kernel.

    ``operation`` may be 'close', 'open', 'dilate' or 'erode'; any other
    value returns *mask* unchanged. The kernel is ``kernel_size`` x
    ``kernel_size``.
    """
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (kernel_size, kernel_size))

    # 'close' and 'open' go through morphologyEx with the matching op code.
    compound_ops = {'close': cv2.MORPH_CLOSE, 'open': cv2.MORPH_OPEN}
    if operation in compound_ops:
        return cv2.morphologyEx(mask, compound_ops[operation], kernel)

    if operation == 'dilate':
        return cv2.dilate(mask, kernel)
    if operation == 'erode':
        return cv2.erode(mask, kernel)
    return mask
|
|
104
|
+
|
|
105
|
+
def filter_contours(contours, min_area=100, max_area=None):
    """Return the contours whose area lies within the given bounds.

    Contours with ``cv2.contourArea`` below ``min_area`` are dropped. When
    ``max_area`` is truthy, contours with a larger area are dropped too.
    Input order is preserved.
    """
    def _in_range(area):
        if area < min_area:
            return False
        return not (max_area and area > max_area)

    return [c for c in contours if _in_range(cv2.contourArea(c))]
|
|
115
|
+
|
|
116
|
+
def extract_roi_with_alpha(image, contour, padding=10):
    """Crop the padded bounding box of *contour* and add a contour-shaped alpha.

    The bounding box is grown by ``padding`` pixels on each side and clipped
    to the image. The returned array has four channels: the three colour
    channels of the crop in their original order, followed by a mask that is
    255 inside the filled contour and 0 elsewhere. A 2-D (grayscale) crop is
    first expanded with ``COLOR_GRAY2BGR``.
    """
    bx, by, bw, bh = cv2.boundingRect(contour)
    img_h, img_w = image.shape[0], image.shape[1]

    left, top = max(0, bx - padding), max(0, by - padding)
    right, bottom = min(img_w, bx + bw + padding), min(img_h, by + bh + padding)

    # Fill the contour into a crop-sized mask, in crop-local coordinates.
    alpha = np.zeros((bottom - top, right - left), dtype=np.uint8)
    local_contour = contour - np.array([[left, top]])
    cv2.drawContours(alpha, [local_contour], -1, 255, -1)

    crop = image[top:bottom, left:right].copy()
    if len(crop.shape) == 2:
        crop = cv2.cvtColor(crop, cv2.COLOR_GRAY2BGR)

    first, second, third = cv2.split(crop)
    return cv2.merge([first, second, third, alpha])
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: jsrc
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Bioinformatics and phenotype analysis toolkit
|
|
5
|
+
Author-email: Jiaoyuan <your.email@example.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: biopython>=1.80
|
|
11
|
+
Requires-Dist: matplotlib>=3.5
|
|
12
|
+
Requires-Dist: numpy>=1.20
|
|
13
|
+
Requires-Dist: opencv-python>=4.5
|
|
14
|
+
Requires-Dist: pandas>=1.3
|
|
15
|
+
Requires-Dist: pillow>=9.0
|
|
16
|
+
Provides-Extra: dev
|
|
17
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
18
|
+
Requires-Dist: black>=22.0; extra == "dev"
|
|
19
|
+
Requires-Dist: ruff>=0.1; extra == "dev"
|
|
20
|
+
Dynamic: license-file
|
|
21
|
+
|
|
22
|
+
# jsrc
|
|
23
|
+
|
|
24
|
+
Bioinformatics and phenotype analysis toolkit
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install jsrc
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
Or install from source:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
git clone https://github.com/imjiaoyuan/jsrc.git
|
|
36
|
+
cd jsrc
|
|
37
|
+
pip install -e .
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Commands
|
|
41
|
+
|
|
42
|
+
**Sequence Operations**
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
jsrc seq extract -fa genome.fa -ids ids.txt -o output.fa
|
|
46
|
+
jsrc seq rename -fa input.fa -map mapping.csv -o output.fa
|
|
47
|
+
jsrc seq rename-by-gff -fa transcripts.fa -gff genes.gff -parent Parent -o output.fa
|
|
48
|
+
jsrc seq translate -fa genome.fa -gff genes.gff -id ID -o proteins.fa
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
**Visualization**
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
jsrc plot gene-structure -gff genes.gff -ids genes.txt -o gene_structure.png
|
|
55
|
+
jsrc plot exon-structure -gff genes.gff -ids genes.txt -o exon_structure.png
|
|
56
|
+
jsrc plot chromosome-map -gff genes.gff -o chromosome_map.png
|
|
57
|
+
jsrc plot protein-domain -tsv domains.tsv -o protein_domains.png
|
|
58
|
+
jsrc plot cis-element -bed elements.bed -o cis_elements.png
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
**Analysis Tools**
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
jsrc analyze phylo-tree -fa sequences.fa -o tree.nwk -a nj
|
|
65
|
+
jsrc analyze phylo-tree -fa sequences.fa -o tree.nwk -a ml
|
|
66
|
+
jsrc analyze motif -fa promoters.fa -o motif_output -nmotifs 5
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
**Phenotype Image Analysis**
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
jsrc pheno split-fruit -i fruit_image.jpg -o output_dir
|
|
73
|
+
jsrc pheno split-fruit-raw -i fruit_image.jpg -o output_dir
|
|
74
|
+
jsrc pheno split-leaf -i leaf_image.jpg -o output_dir
|
|
75
|
+
jsrc pheno split-leaf-edge -i leaf_image.jpg -o output_dir
|
|
76
|
+
```
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
src/analyze.py
|
|
5
|
+
src/cli.py
|
|
6
|
+
src/common.py
|
|
7
|
+
src/pheno.py
|
|
8
|
+
src/plot.py
|
|
9
|
+
src/seq.py
|
|
10
|
+
src/jsrc.egg-info/PKG-INFO
|
|
11
|
+
src/jsrc.egg-info/SOURCES.txt
|
|
12
|
+
src/jsrc.egg-info/dependency_links.txt
|
|
13
|
+
src/jsrc.egg-info/entry_points.txt
|
|
14
|
+
src/jsrc.egg-info/requires.txt
|
|
15
|
+
src/jsrc.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
jsrc-0.1.0/src/pheno.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import cv2
|
|
3
|
+
import numpy as np
|
|
4
|
+
from PIL import Image
|
|
5
|
+
from common import apply_morphology, filter_contours, extract_roi_with_alpha
|
|
6
|
+
|
|
7
|
+
def cmd_split_fruit(args):
    """Segment coloured objects from a resized image into RGBA PNG files.

    The image at ``args.i`` is scaled so its longer side equals
    ``args.size``, thresholded in HSV (saturation and value of at least 30,
    any hue), cleaned with a morphological close then open, and every
    external contour with an area of at least 500 pixels is saved to
    ``args.o`` as ``fruit_NNN.png`` with the contour as its alpha channel.

    If the image cannot be read or ``args.size`` is not positive, an error is
    printed to stderr and the process exits with status 1; the output
    directory is only created once the input has been validated.
    """
    import sys

    img = cv2.imread(args.i)
    if img is None:
        print(f"Error: Cannot read image {args.i}", file=sys.stderr)
        sys.exit(1)
    if args.size <= 0:
        print(f"Error: -size must be positive, got {args.size}", file=sys.stderr)
        sys.exit(1)

    os.makedirs(args.o, exist_ok=True)

    h, w = img.shape[:2]
    scale = args.size / max(h, w)
    # Clamp to 1 px so an extreme aspect ratio cannot produce a zero-sized
    # dimension, which cv2.resize rejects.
    new_h, new_w = max(1, int(h * scale)), max(1, int(w * scale))
    img_resized = cv2.resize(img, (new_w, new_h))

    hsv = cv2.cvtColor(img_resized, cv2.COLOR_BGR2HSV)

    lower = np.array([0, 30, 30])
    upper = np.array([180, 255, 255])
    mask = cv2.inRange(hsv, lower, upper)

    mask = apply_morphology(mask, 'close', 15)
    mask = apply_morphology(mask, 'open', 10)

    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    contours = filter_contours(contours, min_area=500)

    for i, cnt in enumerate(contours):
        roi = extract_roi_with_alpha(img_resized, cnt, padding=20)

        out_path = os.path.join(args.o, f'fruit_{i:03d}.png')
        # PIL expects RGBA channel order.
        Image.fromarray(cv2.cvtColor(roi, cv2.COLOR_BGRA2RGBA)).save(out_path)

    print(f"Extracted {len(contours)} fruits to {args.o}")
|
|
40
|
+
|
|
41
|
+
def cmd_split_fruit_raw(args):
    """Segment coloured objects from an image at its original resolution.

    The image at ``args.i`` is thresholded in HSV (saturation and value of at
    least 30, any hue), cleaned with a morphological close then open, and
    every external contour with an area of at least 2000 pixels is saved to
    ``args.o`` as ``fruit_raw_NNN.png`` with the contour as its alpha
    channel.

    If the image cannot be read, an error is printed to stderr and the
    process exits with status 1; the output directory is only created once
    the input has been read successfully.
    """
    import sys

    img = cv2.imread(args.i)
    if img is None:
        print(f"Error: Cannot read image {args.i}", file=sys.stderr)
        sys.exit(1)

    os.makedirs(args.o, exist_ok=True)

    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    lower = np.array([0, 30, 30])
    upper = np.array([180, 255, 255])
    mask = cv2.inRange(hsv, lower, upper)

    mask = apply_morphology(mask, 'close', 25)
    mask = apply_morphology(mask, 'open', 15)

    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    contours = filter_contours(contours, min_area=2000)

    for i, cnt in enumerate(contours):
        roi = extract_roi_with_alpha(img, cnt, padding=30)

        out_path = os.path.join(args.o, f'fruit_raw_{i:03d}.png')
        # PIL expects RGBA channel order.
        Image.fromarray(cv2.cvtColor(roi, cv2.COLOR_BGRA2RGBA)).save(out_path)

    print(f"Extracted {len(contours)} fruits (original size) to {args.o}")
|
|
69
|
+
|
|
70
|
+
def cmd_split_leaf(args):
    """Segment green objects from a resized image into RGBA PNG files.

    The image at ``args.i`` is scaled so its longer side equals
    ``args.size``, thresholded in HSV (hue 35-85, saturation and value of at
    least 40), cleaned with a morphological close then open, and every
    external contour with an area of at least 400 pixels is saved to
    ``args.o`` as ``leaf_NNN.png`` with the contour as its alpha channel.

    If the image cannot be read or ``args.size`` is not positive, an error is
    printed to stderr and the process exits with status 1; the output
    directory is only created once the input has been validated.
    """
    import sys

    img = cv2.imread(args.i)
    if img is None:
        print(f"Error: Cannot read image {args.i}", file=sys.stderr)
        sys.exit(1)
    if args.size <= 0:
        print(f"Error: -size must be positive, got {args.size}", file=sys.stderr)
        sys.exit(1)

    os.makedirs(args.o, exist_ok=True)

    h, w = img.shape[:2]
    scale = args.size / max(h, w)
    # Clamp to 1 px so an extreme aspect ratio cannot produce a zero-sized
    # dimension, which cv2.resize rejects.
    new_h, new_w = max(1, int(h * scale)), max(1, int(w * scale))
    img_resized = cv2.resize(img, (new_w, new_h))

    hsv = cv2.cvtColor(img_resized, cv2.COLOR_BGR2HSV)

    lower = np.array([35, 40, 40])
    upper = np.array([85, 255, 255])
    mask = cv2.inRange(hsv, lower, upper)

    mask = apply_morphology(mask, 'close', 12)
    mask = apply_morphology(mask, 'open', 8)

    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    contours = filter_contours(contours, min_area=400)

    for i, cnt in enumerate(contours):
        roi = extract_roi_with_alpha(img_resized, cnt, padding=15)

        out_path = os.path.join(args.o, f'leaf_{i:03d}.png')
        # PIL expects RGBA channel order.
        Image.fromarray(cv2.cvtColor(roi, cv2.COLOR_BGRA2RGBA)).save(out_path)

    print(f"Extracted {len(contours)} leaves to {args.o}")
|
|
103
|
+
|
|
104
|
+
def cmd_split_leaf_edge(args):
    """Extract object outlines from an image into RGBA PNG files.

    The image at ``args.i`` is converted to grayscale, blurred, run through
    Canny edge detection (thresholds 50/150), then dilated and closed. Every
    external contour with an area of at least 500 pixels is cropped with a
    20-pixel margin and saved to ``args.o`` as ``leaf_edge_NNN.png``; the
    alpha channel is the contour drawn as a 2-pixel-wide outline, so only the
    edge pixels are opaque.

    If the image cannot be read, an error is printed to stderr and the
    process exits with status 1; the output directory is only created once
    the input has been read successfully.
    """
    import sys

    img = cv2.imread(args.i)
    if img is None:
        print(f"Error: Cannot read image {args.i}", file=sys.stderr)
        sys.exit(1)

    os.makedirs(args.o, exist_ok=True)

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    blurred = cv2.GaussianBlur(gray, (5, 5), 0)

    edges = cv2.Canny(blurred, 50, 150)

    edges = apply_morphology(edges, 'dilate', 3)
    edges = apply_morphology(edges, 'close', 5)

    contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    contours = filter_contours(contours, min_area=500)

    for i, cnt in enumerate(contours):
        # Outline only (thickness 2), not a filled region.
        mask = np.zeros(img.shape[:2], dtype=np.uint8)
        cv2.drawContours(mask, [cnt], -1, 255, 2)

        x, y, w, h = cv2.boundingRect(cnt)
        x1, y1 = max(0, x-20), max(0, y-20)
        x2, y2 = min(img.shape[1], x+w+20), min(img.shape[0], y+h+20)

        roi_edge = mask[y1:y2, x1:x2]
        roi_img = img[y1:y2, x1:x2]

        b, g, r = cv2.split(roi_img)
        rgba = cv2.merge([b, g, r, roi_edge])

        out_path = os.path.join(args.o, f'leaf_edge_{i:03d}.png')
        # PIL expects RGBA channel order.
        Image.fromarray(cv2.cvtColor(rgba, cv2.COLOR_BGRA2RGBA)).save(out_path)

    print(f"Extracted {len(contours)} leaf edges to {args.o}")
|
jsrc-0.1.0/src/plot.py
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from common import setup_matplotlib, get_gene_structure, parse_gff_attributes, natural_sort_key
|
|
4
|
+
# Shared plotting handle for every command in this module; the functions below
# call subplots/Rectangle/tight_layout/savefig/close on it.
# NOTE(review): presumably this is matplotlib.pyplot configured by
# common.setup_matplotlib() - confirm against common.py.
plt = setup_matplotlib()
|
|
5
|
+
|
|
6
|
+
def cmd_gene_structure(args):
    """Draw the CDS layout of the genes listed in ``args.ids``.

    One row is drawn per gene ID (naturally sorted, first ID at the top):
    a thin backbone from the lowest CDS start to the highest CDS end, with a
    blue box for every CDS interval.  IDs without any CDS coordinates keep
    their row label but get no drawing.

    Args:
        args: namespace with ``ids`` (text file, one gene ID per line),
            ``gff`` (annotation passed to ``get_gene_structure``), ``o``
            (output image path) and ``dpi``.
    """
    with open(args.ids, 'r') as handle:
        gene_ids = [entry for entry in (raw.strip() for raw in handle) if entry]

    # coords is indexed by gene ID and yields (start, end) pairs, as used below.
    coords = get_gene_structure(args.gff, gene_ids, feature_types=['CDS'])

    ordered_ids = sorted(gene_ids, key=natural_sort_key)
    n_rows = len(ordered_ids)

    fig, ax = plt.subplots(figsize=(12, max(6, n_rows * 0.5)))

    for rank, gid in enumerate(ordered_ids):
        # Rows are numbered bottom-up so the first sorted ID lands on top.
        row = n_rows - rank - 1
        segments = coords[gid] if gid in coords else None
        if not segments:
            continue

        segments = sorted(segments)
        left_edge = min(seg[0] for seg in segments)
        right_edge = max(seg[1] for seg in segments)
        ax.plot([left_edge, right_edge], [row, row], 'k-', linewidth=1)

        for seg_start, seg_end in segments:
            box = plt.Rectangle((seg_start, row-0.15), seg_end-seg_start, 0.3,
                                facecolor='steelblue', edgecolor='black')
            ax.add_patch(box)

    ax.set_yticks(range(n_rows))
    ax.set_yticklabels(ordered_ids[::-1])
    ax.set_xlabel('Genomic Position')
    ax.set_title('Gene Structure')

    plt.tight_layout()
    plt.savefig(args.o, dpi=args.dpi, bbox_inches='tight')
    plt.close()
    print(f"Gene structure plot saved to {args.o}")
|
|
39
|
+
|
|
40
|
+
def cmd_exon_structure(args):
    """Draw the exon layout of the genes listed in ``args.ids``.

    Same layout as the CDS plot but built from ``exon`` features: one row per
    naturally sorted gene ID (first ID at the top), a backbone spanning the
    outermost exon coordinates and a green box per exon.  IDs with no exon
    coordinates are labelled but left empty.

    Args:
        args: namespace with ``ids`` (text file, one gene ID per line),
            ``gff`` (annotation passed to ``get_gene_structure``), ``o``
            (output image path) and ``dpi``.
    """
    gene_ids = []
    with open(args.ids, 'r') as handle:
        for raw in handle:
            entry = raw.strip()
            if entry:
                gene_ids.append(entry)

    coords = get_gene_structure(args.gff, gene_ids, feature_types=['exon'])

    ordered_ids = sorted(gene_ids, key=natural_sort_key)
    total = len(ordered_ids)

    fig, ax = plt.subplots(figsize=(12, max(6, total * 0.5)))

    for position, gid in enumerate(ordered_ids):
        track_y = total - position - 1  # first sorted ID is drawn on the top row
        if gid not in coords:
            continue
        if not coords[gid]:
            continue

        exons = sorted(coords[gid])
        span_start = min(exon[0] for exon in exons)
        span_end = max(exon[1] for exon in exons)

        ax.plot([span_start, span_end], [track_y, track_y], 'k-', linewidth=1)

        for exon_start, exon_end in exons:
            ax.add_patch(
                plt.Rectangle((exon_start, track_y-0.2), exon_end-exon_start, 0.4,
                              facecolor='green', edgecolor='black')
            )

    ax.set_yticks(range(total))
    ax.set_yticklabels(ordered_ids[::-1])
    ax.set_xlabel('Genomic Position')
    ax.set_title('Exon Structure')

    plt.tight_layout()
    plt.savefig(args.o, dpi=args.dpi, bbox_inches='tight')
    plt.close()
    print(f"Exon structure plot saved to {args.o}")
|
|
73
|
+
|
|
74
|
+
def cmd_chromosome_map(args):
    """Plot every chromosome as a bar with one red tick per ``gene`` feature.

    Chromosome lengths come from ``##sequence-region <name> <start> <end>``
    lines when the GFF has them.  For a chromosome without such a line the
    length falls back to the largest ``end`` coordinate of any gene on it.
    (Previously the fallback stored only the end of the first gene seen, so
    later genes could be drawn beyond the chromosome bar.)

    Args:
        args: namespace with ``gff`` (input annotation path), ``o`` (output
            image path) and ``dpi``.

    Raises:
        ValueError: if a coordinate column that is read is not an integer.
    """
    declared_lengths = {}  # chromosome -> end taken from ##sequence-region
    inferred_lengths = {}  # chromosome -> largest gene end seen so far
    gene_midpoints = {}    # chromosome -> midpoints of its genes, in file order

    with open(args.gff, 'r') as f:
        for line in f:
            if line.startswith('##sequence-region'):
                parts = line.split()
                if len(parts) >= 4:
                    declared_lengths[parts[1]] = int(parts[3])
            elif not line.startswith('#'):
                parts = line.strip().split('\t')
                if len(parts) >= 9 and parts[2] == 'gene':
                    chrom = parts[0]
                    start = int(parts[3])
                    end = int(parts[4])

                    gene_midpoints.setdefault(chrom, []).append((start + end) / 2)
                    # Running maximum, not "first gene wins".
                    if end > inferred_lengths.get(chrom, 0):
                        inferred_lengths[chrom] = end

    # A declared length always takes precedence over an inferred one.
    chr_lengths = {**inferred_lengths, **declared_lengths}
    chr_sorted = sorted(chr_lengths.keys(), key=natural_sort_key)

    fig, ax = plt.subplots(figsize=(12, max(6, len(chr_sorted) * 0.5)))

    for i, chrom in enumerate(chr_sorted):
        # Rows are numbered bottom-up so the first sorted chromosome is on top.
        y = len(chr_sorted) - i - 1

        ax.add_patch(plt.Rectangle((0, y-0.2), chr_lengths[chrom], 0.4,
                                   facecolor='lightgray', edgecolor='black'))

        for mid in gene_midpoints.get(chrom, ()):
            ax.plot([mid, mid], [y-0.15, y+0.15], 'r-', linewidth=0.5, alpha=0.5)

    ax.set_yticks(range(len(chr_sorted)))
    ax.set_yticklabels(chr_sorted[::-1])
    ax.set_xlabel('Position (bp)')
    ax.set_title('Chromosome Map')

    plt.tight_layout()
    plt.savefig(args.o, dpi=args.dpi, bbox_inches='tight')
    plt.close()
    print(f"Chromosome map saved to {args.o}")
|
|
130
|
+
|
|
131
|
+
def cmd_protein_domain(args):
    """Plot domain architecture for every protein in a tab-separated table.

    The table must provide the columns ``protein``, ``domain``, ``start`` and
    ``end`` (lines starting with ``#`` are ignored).  Each protein gets one
    row (naturally sorted, first protein at the top): a backbone from 0 to
    its largest domain end, and a labelled orange box per domain.

    Args:
        args: namespace with ``tsv`` (input table path), ``o`` (output image
            path) and ``dpi``.

    Exits with status 1 after printing to stderr when a required column is
    missing.
    """
    df = pd.read_csv(args.tsv, sep='\t', comment='#')

    required_cols = ['protein', 'domain', 'start', 'end']
    if any(col not in df.columns for col in required_cols):
        print(f"Error: TSV must have columns: {', '.join(required_cols)}", file=sys.stderr)
        sys.exit(1)

    proteins = sorted(df['protein'].unique(), key=natural_sort_key)
    n_rows = len(proteins)

    fig, ax = plt.subplots(figsize=(12, max(6, n_rows * 0.5)))

    for rank, prot in enumerate(proteins):
        row_y = n_rows - rank - 1  # first sorted protein is drawn on the top row
        hits = df[df['protein'] == prot]

        # Backbone length is the furthest domain end, not the protein length.
        backbone_end = hits['end'].max()
        ax.plot([0, backbone_end], [row_y, row_y], 'k-', linewidth=2)

        for _, hit in hits.iterrows():
            left, right, label = hit['start'], hit['end'], hit['domain']
            box = plt.Rectangle((left, row_y-0.2), right-left, 0.4,
                                facecolor='orange', edgecolor='black', alpha=0.7)
            ax.add_patch(box)
            ax.text((left+right)/2, row_y, label, ha='center', va='center', fontsize=8)

    ax.set_yticks(range(n_rows))
    ax.set_yticklabels(proteins[::-1])
    ax.set_xlabel('Position (aa)')
    ax.set_title('Protein Domain Architecture')

    plt.tight_layout()
    plt.savefig(args.o, dpi=args.dpi, bbox_inches='tight')
    plt.close()
    print(f"Protein domain plot saved to {args.o}")
|
|
168
|
+
|
|
169
|
+
def cmd_cis_element(args):
    """Plot named elements from a BED file, one row per chromosome.

    Lines that are blank, start with ``#`` or have fewer than four
    tab-separated fields are ignored.  Each chromosome (naturally sorted,
    first at the top) gets a backbone from 0 to its largest element end, and
    every element is drawn as a blue tick at its midpoint with its name
    above it.

    Args:
        args: namespace with ``bed`` (input path), ``o`` (output image path)
            and ``dpi``.
    """
    elements = []

    with open(args.bed, 'r') as bed:
        for raw in bed:
            if raw.startswith('#') or not raw.strip():
                continue
            fields = raw.strip().split('\t')
            if len(fields) < 4:
                continue
            elements.append({
                'chr': fields[0],
                'start': int(fields[1]),
                'end': int(fields[2]),
                'name': fields[3]
            })

    # Bucket the elements per chromosome once, keeping file order in each bucket.
    by_chrom = {}
    for element in elements:
        by_chrom.setdefault(element['chr'], []).append(element)

    chromosomes = sorted(set(by_chrom), key=natural_sort_key)
    n_rows = len(chromosomes)

    fig, ax = plt.subplots(figsize=(12, max(6, n_rows * 0.5)))

    for rank, chrom in enumerate(chromosomes):
        row_y = n_rows - rank - 1  # first sorted chromosome is drawn on the top row
        members = by_chrom[chrom]  # never empty: chromosomes are derived from elements

        furthest = max(member['end'] for member in members)
        ax.plot([0, furthest], [row_y, row_y], 'k-', linewidth=1)

        for member in members:
            centre = (member['start'] + member['end']) / 2
            ax.plot([centre, centre], [row_y-0.3, row_y+0.3], 'b-', linewidth=2)
            ax.text(centre, row_y+0.35, member['name'], ha='center', fontsize=7, rotation=45)

    ax.set_yticks(range(n_rows))
    ax.set_yticklabels(chromosomes[::-1])
    ax.set_xlabel('Position (bp)')
    ax.set_title('Cis-regulatory Elements')

    plt.tight_layout()
    plt.savefig(args.o, dpi=args.dpi, bbox_inches='tight')
    plt.close()
    print(f"Cis-element plot saved to {args.o}")
|
jsrc-0.1.0/src/seq.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import csv
|
|
3
|
+
import re
|
|
4
|
+
from Bio import SeqIO
|
|
5
|
+
from Bio.Seq import Seq
|
|
6
|
+
from Bio.SeqRecord import SeqRecord
|
|
7
|
+
|
|
8
|
+
def cmd_extract(args):
    """Copy the FASTA records whose IDs are listed in ``args.ids``.

    IDs are compared case-insensitively and only up to the first ``.``, so
    ``GeneA.1`` in the ID list matches ``>genea.2 ...`` in the FASTA.  When
    several records match the same ID, the longest sequence is kept.  Output
    records are written in ID-list order, each as a single sequence line,
    with the ID exactly as spelled in the ID list as the header.

    A header line with no identifier (a bare ``>``) is skipped instead of
    aborting the run with an IndexError.

    Args:
        args: namespace with ``ids`` (text file, one ID per line), ``fa``
            (input FASTA path) and ``o`` (output FASTA path).
    """
    with open(args.ids, 'r') as f:
        raw_ids = [line.strip() for line in f if line.strip()]
    # normalised key -> ID as written by the user (last spelling wins)
    target_ids = {rid.lower().split('.')[0]: rid for rid in raw_ids}

    found_records = {}

    def save_record(header, seq_lines):
        """Store one parsed record if it is wanted and the longest so far."""
        if not header:
            return
        words = header[1:].split()
        if not words:
            # Bare '>' line: there is no identifier to match against.
            return

        clean_id = words[0].lower().split('.')[0]
        if clean_id not in target_ids:
            return

        seq = ''.join(seq_lines)
        best = found_records.get(clean_id)
        if best is None or len(seq) > len(best['seq']):
            found_records[clean_id] = {
                'header': target_ids[clean_id],
                'seq': seq
            }

    with open(args.fa, 'r') as infile:
        current_header = None
        current_seq_lines = []

        for line in infile:
            line = line.strip()
            if not line:
                continue
            if line.startswith('>'):
                save_record(current_header, current_seq_lines)
                current_header = line
                current_seq_lines = []
            else:
                current_seq_lines.append(line)

        # Flush the final record, which has no following header to trigger it.
        save_record(current_header, current_seq_lines)

    with open(args.o, 'w') as outfile:
        for clean_id in target_ids:
            if clean_id in found_records:
                rec = found_records[clean_id]
                outfile.write(f">{rec['header']}\n{rec['seq']}\n")

    print(f"Extracted {len(found_records)}/{len(target_ids)} sequences to {args.o}")
|
|
55
|
+
|
|
56
|
+
def cmd_rename(args):
    """Rename FASTA headers according to a two-column CSV mapping.

    Each CSV row supplies ``old_id,new_id`` (extra columns are ignored, rows
    with fewer than two columns are skipped).  A header whose first word is a
    mapped old ID is replaced by ``>new_id`` (any description is dropped);
    every other line is copied unchanged.

    The summary reports how many headers were actually renamed rather than
    the size of the mapping table, and a header with no identifier (a bare
    ``>``) is copied through instead of raising IndexError.

    Args:
        args: namespace with ``map`` (CSV path), ``fa`` (input FASTA path)
            and ``o`` (output FASTA path).
    """
    mapping = {}
    # newline='' lets the csv module handle line endings itself.
    with open(args.map, 'r', newline='') as f:
        for row in csv.reader(f):
            if len(row) >= 2:
                mapping[row[0].strip()] = row[1].strip()

    renamed = 0
    with open(args.fa, 'r') as fin, open(args.o, 'w') as fout:
        for line in fin:
            if line.startswith('>'):
                words = line[1:].split()
                if words and words[0] in mapping:
                    fout.write(f">{mapping[words[0]]}\n")
                    renamed += 1
                    continue
            fout.write(line)

    print(f"Renamed {renamed} sequences to {args.o}")
|
|
76
|
+
|
|
77
|
+
def cmd_rename_by_gff(args):
    """Rename FASTA headers from transcript IDs to an attribute of their mRNA.

    Every ``mRNA`` feature in the GFF contributes a mapping from its ``ID``
    attribute to the attribute named by ``args.parent``; features missing
    either value are ignored.  A FASTA header whose first word is a mapped
    transcript ID is replaced by ``>value`` (any description is dropped);
    every other line is copied unchanged.

    The summary reports how many headers were actually renamed rather than
    the number of mRNA features in the GFF, and a header with no identifier
    (a bare ``>``) is copied through instead of raising IndexError.

    Args:
        args: namespace with ``gff`` (annotation path), ``parent`` (attribute
            key to rename to), ``fa`` (input FASTA path) and ``o`` (output
            FASTA path).
    """
    from common import parse_gff_attributes

    mapping = {}

    with open(args.gff, 'r') as f:
        for line in f:
            if line.startswith('#'):
                continue
            parts = line.strip().split('\t')
            if len(parts) < 9 or parts[2] != 'mRNA':
                continue

            attr = parse_gff_attributes(parts[8])
            tid = attr.get('ID')
            pid = attr.get(args.parent)
            if tid and pid:
                mapping[tid] = pid

    renamed = 0
    with open(args.fa, 'r') as fin, open(args.o, 'w') as fout:
        for line in fin:
            if line.startswith('>'):
                words = line[1:].split()
                if words and words[0] in mapping:
                    fout.write(f">{mapping[words[0]]}\n")
                    renamed += 1
                    continue
            fout.write(line)

    print(f"Renamed {renamed} transcripts to {args.o}")
|
|
109
|
+
|
|
110
|
+
def cmd_translate(args):
    """Translate the CDS of every gene in a GFF into a protein FASTA.

    CDS features are grouped by the attribute named ``args.id``.  For each
    group the CDS intervals are sorted by coordinate, cut out of the
    chromosome sequence and concatenated; minus-strand groups are then
    reverse-complemented.  Translation stops at the first stop codon, and
    empty translations are not written.  CDS rows without the grouping
    attribute, or on a sequence absent from the FASTA, are ignored.  The
    chromosome and strand of a group are taken from its first CDS row.

    Args:
        args: namespace with ``fa`` (genome FASTA path), ``gff`` (annotation
            path), ``id`` (attribute key used to group CDS rows) and ``o``
            (output protein FASTA path).
    """
    from common import parse_gff_attributes

    genome = SeqIO.to_dict(SeqIO.parse(args.fa, 'fasta'))

    cds_dict = {}

    with open(args.gff, 'r') as gff:
        for raw in gff:
            if raw.startswith('#'):
                continue
            cols = raw.strip().split('\t')
            if len(cols) < 9:
                continue
            if cols[2] != 'CDS':
                continue

            chrom = cols[0]
            # GFF is 1-based inclusive; shift the start to get a Python slice.
            start = int(cols[3]) - 1
            end = int(cols[4])
            strand = cols[6]

            gene_id = parse_gff_attributes(cols[8]).get(args.id)
            if not gene_id or chrom not in genome:
                continue

            entry = cds_dict.get(gene_id)
            if entry is None:
                entry = {'chrom': chrom, 'strand': strand, 'regions': []}
                cds_dict[gene_id] = entry
            entry['regions'].append((start, end))

    proteins = []

    for gene_id, data in cds_dict.items():
        chrom_seq = genome[data['chrom']].seq

        # Splice in ascending genomic order; strand is handled afterwards.
        cds_seq = Seq('')
        for lo, hi in sorted(data['regions']):
            cds_seq += chrom_seq[lo:hi]

        if data['strand'] == '-':
            cds_seq = cds_seq.reverse_complement()

        try:
            protein_seq = cds_seq.translate(to_stop=True)
            if len(protein_seq) > 0:
                proteins.append(SeqRecord(protein_seq, id=gene_id, description=''))
        except Exception as e:
            print(f"Warning: Failed to translate {gene_id}: {e}", file=sys.stderr)
            continue

    SeqIO.write(proteins, args.o, 'fasta')
    print(f"Translated {len(proteins)} genes to {args.o}")
|