satellome 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- satellome-1.0.0/.gitignore +139 -0
- satellome-1.0.0/LICENSE +21 -0
- satellome-1.0.0/MANIFEST.in +10 -0
- satellome-1.0.0/PKG-INFO +297 -0
- satellome-1.0.0/README.md +252 -0
- satellome-1.0.0/distances/distances.hpp +34 -0
- satellome-1.0.0/distances/embeding.cpp +1323 -0
- satellome-1.0.0/distances/thread_wrappers.hpp +121 -0
- satellome-1.0.0/distances/trf_file.hpp +47 -0
- satellome-1.0.0/pyproject.toml +61 -0
- satellome-1.0.0/requirements.txt +13 -0
- satellome-1.0.0/scripts/check_telomeres.py +92 -0
- satellome-1.0.0/scripts/plots.py +61 -0
- satellome-1.0.0/scripts/run_satellome_batch.sh +83 -0
- satellome-1.0.0/scripts/run_satellome_parallel.py +269 -0
- satellome-1.0.0/scripts/tandems.py +46 -0
- satellome-1.0.0/scripts/to_fasta.py +45 -0
- satellome-1.0.0/scripts/trf_get_large.py +41 -0
- satellome-1.0.0/scripts/trf_get_micro_stat.py +55 -0
- satellome-1.0.0/scripts/trf_refine.py +67 -0
- satellome-1.0.0/scripts/trf_to_coordinates.py +32 -0
- satellome-1.0.0/scripts/trf_to_fasta.py +33 -0
- satellome-1.0.0/scripts/trf_to_gff3.py +33 -0
- satellome-1.0.0/scripts/trs_set_chrm_names_for_fasta.py +56 -0
- satellome-1.0.0/scripts/yaml_parser.py +77 -0
- satellome-1.0.0/setup.cfg +4 -0
- satellome-1.0.0/setup.py +50 -0
- satellome-1.0.0/src/satellome/__init__.py +1 -0
- satellome-1.0.0/src/satellome/core_functions/__init__.py +0 -0
- satellome-1.0.0/src/satellome/core_functions/classification_micro.py +852 -0
- satellome-1.0.0/src/satellome/core_functions/io/__init__.py +0 -0
- satellome-1.0.0/src/satellome/core_functions/io/block_file.py +157 -0
- satellome-1.0.0/src/satellome/core_functions/io/fasta_file.py +45 -0
- satellome-1.0.0/src/satellome/core_functions/io/file_system.py +27 -0
- satellome-1.0.0/src/satellome/core_functions/io/gff_file.py +16 -0
- satellome-1.0.0/src/satellome/core_functions/io/tab_file.py +200 -0
- satellome-1.0.0/src/satellome/core_functions/io/tr_file.py +147 -0
- satellome-1.0.0/src/satellome/core_functions/io/trf_file.py +469 -0
- satellome-1.0.0/src/satellome/core_functions/models/__init__.py +0 -0
- satellome-1.0.0/src/satellome/core_functions/models/gff3_model.py +195 -0
- satellome-1.0.0/src/satellome/core_functions/models/trf_model.py +535 -0
- satellome-1.0.0/src/satellome/core_functions/settings.py +49 -0
- satellome-1.0.0/src/satellome/core_functions/tools/__init__.py +0 -0
- satellome-1.0.0/src/satellome/core_functions/tools/clusterization.py +87 -0
- satellome-1.0.0/src/satellome/core_functions/tools/distances.py +105 -0
- satellome-1.0.0/src/satellome/core_functions/tools/gene_intersect.py +193 -0
- satellome-1.0.0/src/satellome/core_functions/tools/ncbi.py +49 -0
- satellome-1.0.0/src/satellome/core_functions/tools/parsers.py +230 -0
- satellome-1.0.0/src/satellome/core_functions/tools/processing.py +58 -0
- satellome-1.0.0/src/satellome/core_functions/tools/reports.py +57 -0
- satellome-1.0.0/src/satellome/core_functions/tools/statistics.py +117 -0
- satellome-1.0.0/src/satellome/core_functions/tools/trf_tools.py +290 -0
- satellome-1.0.0/src/satellome/core_functions/trf_clusters.py +1066 -0
- satellome-1.0.0/src/satellome/core_functions/trf_drawing.py +193 -0
- satellome-1.0.0/src/satellome/core_functions/trf_embedings.py +106 -0
- satellome-1.0.0/src/satellome/main.py +299 -0
- satellome-1.0.0/src/satellome/redraw.py +128 -0
- satellome-1.0.0/src/satellome/settings.yaml +45 -0
- satellome-1.0.0/src/satellome/steps/__init__.py +0 -0
- satellome-1.0.0/src/satellome/steps/trf_classify.py +124 -0
- satellome-1.0.0/src/satellome/steps/trf_draw.py +69 -0
- satellome-1.0.0/src/satellome/steps/trf_parse_raw.py +29 -0
- satellome-1.0.0/src/satellome/steps/trf_search.py +79 -0
- satellome-1.0.0/src/satellome/trc_set_tr_names_color.py +104 -0
- satellome-1.0.0/src/satellome/trs_raw_reads.py +124 -0
- satellome-1.0.0/src/satellome.egg-info/PKG-INFO +297 -0
- satellome-1.0.0/src/satellome.egg-info/SOURCES.txt +70 -0
- satellome-1.0.0/src/satellome.egg-info/dependency_links.txt +1 -0
- satellome-1.0.0/src/satellome.egg-info/entry_points.txt +2 -0
- satellome-1.0.0/src/satellome.egg-info/requires.txt +13 -0
- satellome-1.0.0/src/satellome.egg-info/top_level.txt +1 -0
- satellome-1.0.0/tests/test_overlapping.py +31 -0
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
pip-wheel-metadata/
|
|
24
|
+
share/python-wheels/
|
|
25
|
+
*.egg-info/
|
|
26
|
+
.installed.cfg
|
|
27
|
+
*.egg
|
|
28
|
+
MANIFEST
|
|
29
|
+
TRF/
|
|
30
|
+
bpe.exe
|
|
31
|
+
# PyInstaller
|
|
32
|
+
# Usually these files are written by a python script from a template
|
|
33
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
34
|
+
*.manifest
|
|
35
|
+
*.spec
|
|
36
|
+
|
|
37
|
+
# Installer logs
|
|
38
|
+
pip-log.txt
|
|
39
|
+
pip-delete-this-directory.txt
|
|
40
|
+
|
|
41
|
+
# Unit test / coverage reports
|
|
42
|
+
htmlcov/
|
|
43
|
+
.tox/
|
|
44
|
+
.nox/
|
|
45
|
+
.coverage
|
|
46
|
+
.coverage.*
|
|
47
|
+
.cache
|
|
48
|
+
nosetests.xml
|
|
49
|
+
coverage.xml
|
|
50
|
+
*.cover
|
|
51
|
+
*.py,cover
|
|
52
|
+
.hypothesis/
|
|
53
|
+
.pytest_cache/
|
|
54
|
+
|
|
55
|
+
# Translations
|
|
56
|
+
*.mo
|
|
57
|
+
*.pot
|
|
58
|
+
|
|
59
|
+
# Django stuff:
|
|
60
|
+
*.log
|
|
61
|
+
local_settings.py
|
|
62
|
+
db.sqlite3
|
|
63
|
+
db.sqlite3-journal
|
|
64
|
+
|
|
65
|
+
# Flask stuff:
|
|
66
|
+
instance/
|
|
67
|
+
.webassets-cache
|
|
68
|
+
|
|
69
|
+
# Scrapy stuff:
|
|
70
|
+
.scrapy
|
|
71
|
+
|
|
72
|
+
# Sphinx documentation
|
|
73
|
+
docs/_build/
|
|
74
|
+
|
|
75
|
+
# PyBuilder
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
.python-version
|
|
87
|
+
|
|
88
|
+
# pipenv
|
|
89
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
90
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
91
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
92
|
+
# install all needed dependencies.
|
|
93
|
+
#Pipfile.lock
|
|
94
|
+
|
|
95
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
|
96
|
+
__pypackages__/
|
|
97
|
+
|
|
98
|
+
# Celery stuff
|
|
99
|
+
celerybeat-schedule
|
|
100
|
+
celerybeat.pid
|
|
101
|
+
|
|
102
|
+
# SageMath parsed files
|
|
103
|
+
*.sage.py
|
|
104
|
+
|
|
105
|
+
# Environments
|
|
106
|
+
.env
|
|
107
|
+
.venv
|
|
108
|
+
env/
|
|
109
|
+
venv/
|
|
110
|
+
ENV/
|
|
111
|
+
env.bak/
|
|
112
|
+
venv.bak/
|
|
113
|
+
|
|
114
|
+
# Spyder project settings
|
|
115
|
+
.spyderproject
|
|
116
|
+
.spyproject
|
|
117
|
+
|
|
118
|
+
# Rope project settings
|
|
119
|
+
.ropeproject
|
|
120
|
+
|
|
121
|
+
# mkdocs documentation
|
|
122
|
+
/site
|
|
123
|
+
|
|
124
|
+
# mypy
|
|
125
|
+
.mypy_cache/
|
|
126
|
+
.dmypy.json
|
|
127
|
+
dmypy.json
|
|
128
|
+
|
|
129
|
+
# Pyre type checker
|
|
130
|
+
.pyre/
|
|
131
|
+
.vscode/settings.json
|
|
132
|
+
|
|
133
|
+
TRF
|
|
134
|
+
test_dataset
|
|
135
|
+
examples_primates.sh
|
|
136
|
+
.claude
|
|
137
|
+
.vscode
|
|
138
|
+
CLAUDE.md
|
|
139
|
+
examples_primates.sh
|
satellome-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Aleksey Komissarov
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
include README.md
|
|
2
|
+
include LICENSE
|
|
3
|
+
include requirements.txt
|
|
4
|
+
include src/satellome/settings.yaml
|
|
5
|
+
recursive-include src/satellome *.yaml *.yml
|
|
6
|
+
recursive-include scripts *.py *.sh
|
|
7
|
+
recursive-include tests *.py
|
|
8
|
+
recursive-exclude * __pycache__
|
|
9
|
+
recursive-exclude * *.py[co]
|
|
10
|
+
recursive-exclude * .DS_Store
|
satellome-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: satellome
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: A comprehensive tool for satellite DNA analysis in T2T genome assemblies
|
|
5
|
+
Home-page: https://github.com/aglabx/Satellome
|
|
6
|
+
Author: Aleksey Komissarov
|
|
7
|
+
Author-email: Aleksey Komissarov <ad3002@gmail.com>
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
Project-URL: Homepage, https://github.com/aglabx/satellome
|
|
10
|
+
Project-URL: Documentation, https://github.com/aglabx/satellome/wiki
|
|
11
|
+
Project-URL: Repository, https://github.com/aglabx/satellome
|
|
12
|
+
Project-URL: Issues, https://github.com/aglabx/satellome/issues
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Environment :: Console
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.6
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.7
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
25
|
+
Requires-Python: >=3.6
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
License-File: LICENSE
|
|
28
|
+
Requires-Dist: PyExp
|
|
29
|
+
Requires-Dist: pandas
|
|
30
|
+
Requires-Dist: numpy
|
|
31
|
+
Requires-Dist: scikit-learn
|
|
32
|
+
Requires-Dist: pyyaml
|
|
33
|
+
Requires-Dist: simplejson
|
|
34
|
+
Requires-Dist: intervaltree
|
|
35
|
+
Requires-Dist: tqdm
|
|
36
|
+
Requires-Dist: requests
|
|
37
|
+
Requires-Dist: plotly
|
|
38
|
+
Requires-Dist: kaleido
|
|
39
|
+
Requires-Dist: editdistance
|
|
40
|
+
Requires-Dist: networkx
|
|
41
|
+
Dynamic: author
|
|
42
|
+
Dynamic: home-page
|
|
43
|
+
Dynamic: license-file
|
|
44
|
+
Dynamic: requires-python
|
|
45
|
+
|
|
46
|
+
# Satellome
|
|
47
|
+
|
|
48
|
+
A comprehensive bioinformatics tool for analyzing satellite DNA (tandem repeats) in telomere-to-telomere (T2T) genome assemblies.
|
|
49
|
+
|
|
50
|
+
## Overview
|
|
51
|
+
|
|
52
|
+
Satellome integrates Tandem Repeat Finder (TRF) to identify, classify, and visualize repetitive DNA sequences, with a particular focus on centromeric and telomeric regions. It provides a complete pipeline from raw genome sequences to detailed visualizations and reports of tandem repeat patterns.
|
|
53
|
+
|
|
54
|
+
The tool is designed to work with various genome assembly projects including:
|
|
55
|
+
- T2T (Telomere-to-Telomere) Consortium assemblies
|
|
56
|
+
- DNA Zoo chromosome-length assemblies
|
|
57
|
+
- VGP (Vertebrate Genome Project) assemblies
|
|
58
|
+
- NCBI RefSeq and GenBank assemblies
|
|
59
|
+
|
|
60
|
+
## Features
|
|
61
|
+
|
|
62
|
+
- **Tandem Repeat Detection**: Automated detection using TRF with optimized parameters
|
|
63
|
+
- **Smart Classification**: Categorizes repeats into microsatellites, complex repeats, and other types
|
|
64
|
+
- **Rich Visualizations**: Generates karyotype plots, 3D visualizations, and distance matrices
|
|
65
|
+
- **Annotation Integration**: Supports GFF3 and RepeatMasker annotations
|
|
66
|
+
- **Parallel Processing**: Efficient handling of multiple genomes
|
|
67
|
+
- **Smart Pipeline**: Automatically skips completed steps (override with `--force`)
|
|
68
|
+
|
|
69
|
+
## Installation
|
|
70
|
+
|
|
71
|
+
### Prerequisites
|
|
72
|
+
|
|
73
|
+
- Python 3.9 or higher
|
|
74
|
+
- Conda (recommended) or pip
|
|
75
|
+
- TRF (Tandem Repeat Finder) binary
|
|
76
|
+
|
|
77
|
+
### Quick Setup
|
|
78
|
+
|
|
79
|
+
1. **Clone the repository**
|
|
80
|
+
```bash
|
|
81
|
+
git clone https://github.com/aglabx/satellome.git
|
|
82
|
+
cd satellome
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
2. **Create conda environment**
|
|
86
|
+
```bash
|
|
87
|
+
conda create -n satellome python=3.9
|
|
88
|
+
conda activate satellome
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
3. **Install dependencies**
|
|
92
|
+
```bash
|
|
93
|
+
pip install -r requirements.txt
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
4. **Install satellome**
|
|
97
|
+
```bash
|
|
98
|
+
pip install -e . # Development mode
|
|
99
|
+
# or
|
|
100
|
+
pip install . # Production mode
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
5. **Download TRF binary**
|
|
104
|
+
```bash
|
|
105
|
+
# Linux
|
|
106
|
+
wget https://github.com/Benson-Genomics-Lab/TRF/releases/download/v4.09.1/trf409.linux64
|
|
107
|
+
chmod +x trf409.linux64
|
|
108
|
+
mv trf409.linux64 trf
|
|
109
|
+
|
|
110
|
+
# macOS
|
|
111
|
+
wget https://github.com/Benson-Genomics-Lab/TRF/releases/download/v4.09.1/trf409.macosx
|
|
112
|
+
chmod +x trf409.macosx
|
|
113
|
+
mv trf409.macosx trf
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## Usage
|
|
117
|
+
|
|
118
|
+
### Basic Command
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
satellome -i genome.fasta -o output_dir -p project_name -t 8
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### Advanced Options
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
# With GFF3 annotations
|
|
128
|
+
satellome -i genome.fasta -o output_dir -p project_name -t 8 --gff annotations.gff3
|
|
129
|
+
|
|
130
|
+
# With RepeatMasker annotations
|
|
131
|
+
satellome -i genome.fasta -o output_dir -p project_name -t 8 --rm repeatmasker.out
|
|
132
|
+
|
|
133
|
+
# Force rerun all steps
|
|
134
|
+
satellome -i genome.fasta -o output_dir -p project_name -t 8 --force
|
|
135
|
+
|
|
136
|
+
# Custom TRF binary path
|
|
137
|
+
satellome -i genome.fasta -o output_dir -p project_name -t 8 --trf /path/to/trf
|
|
138
|
+
|
|
139
|
+
# Parallel processing of multiple genomes
|
|
140
|
+
python scripts/run_satellome_parallel.py -i genomes_list.txt -o results_dir -t 32
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
### Parameters
|
|
144
|
+
|
|
145
|
+
- `-i, --input`: Input FASTA file (required)
|
|
146
|
+
- `-o, --output`: Output directory (required)
|
|
147
|
+
- `-p, --project`: Project name (required)
|
|
148
|
+
- `-t, --threads`: Number of threads (default: 1)
|
|
149
|
+
- `--gff`: GFF3 annotation file (optional)
|
|
150
|
+
- `--rm`: RepeatMasker output file (optional)
|
|
151
|
+
- `--trf`: Path to TRF binary (default: "trf")
|
|
152
|
+
- `--force`: Force rerun all steps
|
|
153
|
+
|
|
154
|
+
## Output Structure
|
|
155
|
+
|
|
156
|
+
```
|
|
157
|
+
output_dir/
|
|
158
|
+
├── genome_name.trf # Main TRF output file
|
|
159
|
+
├── genome_name.1kb.trf # Repeats >1kb
|
|
160
|
+
├── genome_name.3kb.trf # Repeats >3kb
|
|
161
|
+
├── genome_name.10kb.trf # Repeats >10kb
|
|
162
|
+
├── genome_name.micro.trf # Microsatellites (1-9 bp monomers)
|
|
163
|
+
├── genome_name.complex.trf # Complex repeats (>9 bp monomers)
|
|
164
|
+
├── genome_name.pmicro.trf # Potential microsatellites
|
|
165
|
+
├── genome_name.tssr.trf # Tandem simple sequence repeats
|
|
166
|
+
├── genome_name.*.gff3 # GFF3 format files for each category
|
|
167
|
+
├── genome_name.*.fa # FASTA files with repeat sequences
|
|
168
|
+
├── distances.tsv.* # Distance matrices with various extensions
|
|
169
|
+
├── images/
|
|
170
|
+
│ ├── *.png # Karyotype and other visualizations
|
|
171
|
+
│ └── *.svg # Vector graphics versions
|
|
172
|
+
└── reports/
|
|
173
|
+
├── satellome_report.html # Comprehensive HTML report
|
|
174
|
+
└── annotation_report.txt # Annotation intersection report (if GFF provided)
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
## Classification System
|
|
178
|
+
|
|
179
|
+
Satellome classifies tandem repeats into four categories:
|
|
180
|
+
|
|
181
|
+
1. **micro**: Microsatellites (monomer length 1-9 bp)
|
|
182
|
+
2. **complex**: Complex repeats (monomer length >9 bp)
|
|
183
|
+
3. **pmicro**: Potential microsatellites
|
|
184
|
+
4. **tssr**: Tandem simple sequence repeats
|
|
185
|
+
|
|
186
|
+
## Utility Scripts
|
|
187
|
+
|
|
188
|
+
### Format Conversion
|
|
189
|
+
```bash
|
|
190
|
+
# Convert TRF to FASTA
|
|
191
|
+
python scripts/trf_to_fasta.py -i repeats.trf -o repeats.fasta
|
|
192
|
+
|
|
193
|
+
# Convert TRF to GFF3
|
|
194
|
+
python scripts/trf_to_gff3.py -i repeats.trf -o repeats.gff3
|
|
195
|
+
|
|
196
|
+
# Extract coordinates
|
|
197
|
+
python scripts/trf_to_coordinates.py -i repeats.trf -o coordinates.txt
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
### Analysis Tools
|
|
201
|
+
```bash
|
|
202
|
+
# Extract large tandem repeats
|
|
203
|
+
python scripts/trf_get_large.py -i repeats.trf -m 1000 -o large_repeats.trf
|
|
204
|
+
|
|
205
|
+
# Get microsatellite statistics
|
|
206
|
+
python scripts/trf_get_micro_stat.py -i repeats.trf -o micro_stats.txt
|
|
207
|
+
|
|
208
|
+
# Check telomeric repeats
|
|
209
|
+
python scripts/check_telomeres.py -i genome.fasta -t repeats.trf
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
## Example Workflow
|
|
213
|
+
|
|
214
|
+
### 1. Download Test Dataset
|
|
215
|
+
```bash
|
|
216
|
+
# Download S. cerevisiae genome
|
|
217
|
+
curl -OJX GET "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_000146045.2/download?include_annotation_type=GENOME_FASTA,GENOME_GFF&filename=GCF_000146045.2.zip" -H "Accept: application/zip"
|
|
218
|
+
unzip GCF_000146045.2.zip
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
### 2. Run Analysis
|
|
222
|
+
```bash
|
|
223
|
+
# Run satellome pipeline
|
|
224
|
+
satellome -i ncbi_dataset/data/GCF_000146045.2/GCF_000146045.2_R64_genomic.fna \
|
|
225
|
+
-o results \
|
|
226
|
+
-p scerevisiae \
|
|
227
|
+
-t 8 \
|
|
228
|
+
--gff ncbi_dataset/data/GCF_000146045.2/genomic.gff
|
|
229
|
+
|
|
230
|
+
# View results
|
|
231
|
+
open results/scerevisiae_report.html
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
### 3. Analyzing DNA Zoo Assemblies
|
|
235
|
+
```bash
|
|
236
|
+
# Download a DNA Zoo assembly (example: Cheetah, Acinonyx jubatus)
|
|
237
|
+
wget https://dnazoo.s3.wasabisys.com/Acinonyx_jubatus/aciJub1_HiC.fasta.gz
|
|
238
|
+
gzip -d aciJub1_HiC.fasta.gz
|
|
239
|
+
|
|
240
|
+
# Run satellome on DNA Zoo assembly
|
|
241
|
+
satellome -i aciJub1_HiC.fasta \
|
|
242
|
+
-o dnazoo_results \
|
|
243
|
+
-p cheetah \
|
|
244
|
+
-t 8
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
## Configuration
|
|
248
|
+
|
|
249
|
+
The pipeline uses `settings.yaml` for tool parameters. Key settings include:
|
|
250
|
+
|
|
251
|
+
- TRF parameters (match/mismatch scores, indel penalties)
|
|
252
|
+
- Minimum/maximum repeat lengths
|
|
253
|
+
- Classification thresholds
|
|
254
|
+
- Visualization parameters
|
|
255
|
+
|
|
256
|
+
## Testing
|
|
257
|
+
|
|
258
|
+
Run the test suite:
|
|
259
|
+
```bash
|
|
260
|
+
python tests/test_overlapping.py
|
|
261
|
+
python test_standalone.py
|
|
262
|
+
python test_chromosome_sorting.py
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
## Contributing
|
|
266
|
+
|
|
267
|
+
1. Fork the repository
|
|
268
|
+
2. Create a feature branch (`git checkout -b feature/amazing-feature`)
|
|
269
|
+
3. Commit your changes (`git commit -m 'Add amazing feature'`)
|
|
270
|
+
4. Push to the branch (`git push origin feature/amazing-feature`)
|
|
271
|
+
5. Open a Pull Request
|
|
272
|
+
|
|
273
|
+
## Citation
|
|
274
|
+
|
|
275
|
+
If you use Satellome in your research, please cite:
|
|
276
|
+
|
|
277
|
+
```
|
|
278
|
+
Komissarov A. et al. (2024). Satellome: A comprehensive tool for satellite DNA
|
|
279
|
+
analysis in T2T genome assemblies. [Publication details]
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
## License
|
|
283
|
+
|
|
284
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
285
|
+
|
|
286
|
+
## Support
|
|
287
|
+
|
|
288
|
+
- **Issues**: [GitHub Issues](https://github.com/aglabx/satellome/issues)
|
|
289
|
+
- **Documentation**: [Wiki](https://github.com/aglabx/satellome/wiki)
|
|
290
|
+
- **Email**: ad3002@gmail.com
|
|
291
|
+
|
|
292
|
+
## Acknowledgments
|
|
293
|
+
|
|
294
|
+
- [Tandem Repeat Finder](https://github.com/Benson-Genomics-Lab/TRF) by Gary Benson
|
|
295
|
+
- [T2T Consortium](https://www.genome.gov/about-nhgri/telomere-to-telomere) for inspiring this work
|
|
296
|
+
- [DNA Zoo](https://www.dnazoo.org/) for providing chromosome-length assemblies
|
|
297
|
+
- [Vertebrate Genome Project](https://vertebrategenomesproject.org/) for high-quality reference genomes
|