satellome 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. satellome-1.0.0/.gitignore +139 -0
  2. satellome-1.0.0/LICENSE +21 -0
  3. satellome-1.0.0/MANIFEST.in +10 -0
  4. satellome-1.0.0/PKG-INFO +297 -0
  5. satellome-1.0.0/README.md +252 -0
  6. satellome-1.0.0/distances/distances.hpp +34 -0
  7. satellome-1.0.0/distances/embeding.cpp +1323 -0
  8. satellome-1.0.0/distances/thread_wrappers.hpp +121 -0
  9. satellome-1.0.0/distances/trf_file.hpp +47 -0
  10. satellome-1.0.0/pyproject.toml +61 -0
  11. satellome-1.0.0/requirements.txt +13 -0
  12. satellome-1.0.0/scripts/check_telomeres.py +92 -0
  13. satellome-1.0.0/scripts/plots.py +61 -0
  14. satellome-1.0.0/scripts/run_satellome_batch.sh +83 -0
  15. satellome-1.0.0/scripts/run_satellome_parallel.py +269 -0
  16. satellome-1.0.0/scripts/tandems.py +46 -0
  17. satellome-1.0.0/scripts/to_fasta.py +45 -0
  18. satellome-1.0.0/scripts/trf_get_large.py +41 -0
  19. satellome-1.0.0/scripts/trf_get_micro_stat.py +55 -0
  20. satellome-1.0.0/scripts/trf_refine.py +67 -0
  21. satellome-1.0.0/scripts/trf_to_coordinates.py +32 -0
  22. satellome-1.0.0/scripts/trf_to_fasta.py +33 -0
  23. satellome-1.0.0/scripts/trf_to_gff3.py +33 -0
  24. satellome-1.0.0/scripts/trs_set_chrm_names_for_fasta.py +56 -0
  25. satellome-1.0.0/scripts/yaml_parser.py +77 -0
  26. satellome-1.0.0/setup.cfg +4 -0
  27. satellome-1.0.0/setup.py +50 -0
  28. satellome-1.0.0/src/satellome/__init__.py +1 -0
  29. satellome-1.0.0/src/satellome/core_functions/__init__.py +0 -0
  30. satellome-1.0.0/src/satellome/core_functions/classification_micro.py +852 -0
  31. satellome-1.0.0/src/satellome/core_functions/io/__init__.py +0 -0
  32. satellome-1.0.0/src/satellome/core_functions/io/block_file.py +157 -0
  33. satellome-1.0.0/src/satellome/core_functions/io/fasta_file.py +45 -0
  34. satellome-1.0.0/src/satellome/core_functions/io/file_system.py +27 -0
  35. satellome-1.0.0/src/satellome/core_functions/io/gff_file.py +16 -0
  36. satellome-1.0.0/src/satellome/core_functions/io/tab_file.py +200 -0
  37. satellome-1.0.0/src/satellome/core_functions/io/tr_file.py +147 -0
  38. satellome-1.0.0/src/satellome/core_functions/io/trf_file.py +469 -0
  39. satellome-1.0.0/src/satellome/core_functions/models/__init__.py +0 -0
  40. satellome-1.0.0/src/satellome/core_functions/models/gff3_model.py +195 -0
  41. satellome-1.0.0/src/satellome/core_functions/models/trf_model.py +535 -0
  42. satellome-1.0.0/src/satellome/core_functions/settings.py +49 -0
  43. satellome-1.0.0/src/satellome/core_functions/tools/__init__.py +0 -0
  44. satellome-1.0.0/src/satellome/core_functions/tools/clusterization.py +87 -0
  45. satellome-1.0.0/src/satellome/core_functions/tools/distances.py +105 -0
  46. satellome-1.0.0/src/satellome/core_functions/tools/gene_intersect.py +193 -0
  47. satellome-1.0.0/src/satellome/core_functions/tools/ncbi.py +49 -0
  48. satellome-1.0.0/src/satellome/core_functions/tools/parsers.py +230 -0
  49. satellome-1.0.0/src/satellome/core_functions/tools/processing.py +58 -0
  50. satellome-1.0.0/src/satellome/core_functions/tools/reports.py +57 -0
  51. satellome-1.0.0/src/satellome/core_functions/tools/statistics.py +117 -0
  52. satellome-1.0.0/src/satellome/core_functions/tools/trf_tools.py +290 -0
  53. satellome-1.0.0/src/satellome/core_functions/trf_clusters.py +1066 -0
  54. satellome-1.0.0/src/satellome/core_functions/trf_drawing.py +193 -0
  55. satellome-1.0.0/src/satellome/core_functions/trf_embedings.py +106 -0
  56. satellome-1.0.0/src/satellome/main.py +299 -0
  57. satellome-1.0.0/src/satellome/redraw.py +128 -0
  58. satellome-1.0.0/src/satellome/settings.yaml +45 -0
  59. satellome-1.0.0/src/satellome/steps/__init__.py +0 -0
  60. satellome-1.0.0/src/satellome/steps/trf_classify.py +124 -0
  61. satellome-1.0.0/src/satellome/steps/trf_draw.py +69 -0
  62. satellome-1.0.0/src/satellome/steps/trf_parse_raw.py +29 -0
  63. satellome-1.0.0/src/satellome/steps/trf_search.py +79 -0
  64. satellome-1.0.0/src/satellome/trc_set_tr_names_color.py +104 -0
  65. satellome-1.0.0/src/satellome/trs_raw_reads.py +124 -0
  66. satellome-1.0.0/src/satellome.egg-info/PKG-INFO +297 -0
  67. satellome-1.0.0/src/satellome.egg-info/SOURCES.txt +70 -0
  68. satellome-1.0.0/src/satellome.egg-info/dependency_links.txt +1 -0
  69. satellome-1.0.0/src/satellome.egg-info/entry_points.txt +2 -0
  70. satellome-1.0.0/src/satellome.egg-info/requires.txt +13 -0
  71. satellome-1.0.0/src/satellome.egg-info/top_level.txt +1 -0
  72. satellome-1.0.0/tests/test_overlapping.py +31 -0
@@ -0,0 +1,139 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+ TRF/
30
+ bpe.exe
31
+ # PyInstaller
32
+ # Usually these files are written by a python script from a template
33
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
34
+ *.manifest
35
+ *.spec
36
+
37
+ # Installer logs
38
+ pip-log.txt
39
+ pip-delete-this-directory.txt
40
+
41
+ # Unit test / coverage reports
42
+ htmlcov/
43
+ .tox/
44
+ .nox/
45
+ .coverage
46
+ .coverage.*
47
+ .cache
48
+ nosetests.xml
49
+ coverage.xml
50
+ *.cover
51
+ *.py,cover
52
+ .hypothesis/
53
+ .pytest_cache/
54
+
55
+ # Translations
56
+ *.mo
57
+ *.pot
58
+
59
+ # Django stuff:
60
+ *.log
61
+ local_settings.py
62
+ db.sqlite3
63
+ db.sqlite3-journal
64
+
65
+ # Flask stuff:
66
+ instance/
67
+ .webassets-cache
68
+
69
+ # Scrapy stuff:
70
+ .scrapy
71
+
72
+ # Sphinx documentation
73
+ docs/_build/
74
+
75
+ # PyBuilder
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ .python-version
87
+
88
+ # pipenv
89
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
90
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
91
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
92
+ # install all needed dependencies.
93
+ #Pipfile.lock
94
+
95
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
96
+ __pypackages__/
97
+
98
+ # Celery stuff
99
+ celerybeat-schedule
100
+ celerybeat.pid
101
+
102
+ # SageMath parsed files
103
+ *.sage.py
104
+
105
+ # Environments
106
+ .env
107
+ .venv
108
+ env/
109
+ venv/
110
+ ENV/
111
+ env.bak/
112
+ venv.bak/
113
+
114
+ # Spyder project settings
115
+ .spyderproject
116
+ .spyproject
117
+
118
+ # Rope project settings
119
+ .ropeproject
120
+
121
+ # mkdocs documentation
122
+ /site
123
+
124
+ # mypy
125
+ .mypy_cache/
126
+ .dmypy.json
127
+ dmypy.json
128
+
129
+ # Pyre type checker
130
+ .pyre/
131
+ .vscode/settings.json
132
+
133
+ TRF
134
+ test_dataset
135
+ examples_primates.sh
136
+ .claude
137
+ .vscode
138
+ CLAUDE.md
139
+ examples_primates.sh
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Aleksey Komissarov
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,10 @@
1
+ include README.md
2
+ include LICENSE
3
+ include requirements.txt
4
+ include src/satellome/settings.yaml
5
+ recursive-include src/satellome *.yaml *.yml
6
+ recursive-include scripts *.py *.sh
7
+ recursive-include tests *.py
8
+ recursive-exclude * __pycache__
9
+ recursive-exclude * *.py[co]
10
+ recursive-exclude * .DS_Store
@@ -0,0 +1,297 @@
1
+ Metadata-Version: 2.4
2
+ Name: satellome
3
+ Version: 1.0.0
4
+ Summary: A comprehensive tool for satellite DNA analysis in T2T genome assemblies
5
+ Home-page: https://github.com/aglabx/Satellome
6
+ Author: Aleksey Komissarov
7
+ Author-email: Aleksey Komissarov <ad3002@gmail.com>
8
+ License-Expression: MIT
9
+ Project-URL: Homepage, https://github.com/aglabx/satellome
10
+ Project-URL: Documentation, https://github.com/aglabx/satellome/wiki
11
+ Project-URL: Repository, https://github.com/aglabx/satellome
12
+ Project-URL: Issues, https://github.com/aglabx/satellome/issues
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Environment :: Console
15
+ Classifier: Intended Audience :: Science/Research
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.6
19
+ Classifier: Programming Language :: Python :: 3.7
20
+ Classifier: Programming Language :: Python :: 3.8
21
+ Classifier: Programming Language :: Python :: 3.9
22
+ Classifier: Programming Language :: Python :: 3.10
23
+ Classifier: Programming Language :: Python :: 3.11
24
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
25
+ Requires-Python: >=3.6
26
+ Description-Content-Type: text/markdown
27
+ License-File: LICENSE
28
+ Requires-Dist: PyExp
29
+ Requires-Dist: pandas
30
+ Requires-Dist: numpy
31
+ Requires-Dist: scikit-learn
32
+ Requires-Dist: pyyaml
33
+ Requires-Dist: simplejson
34
+ Requires-Dist: intervaltree
35
+ Requires-Dist: tqdm
36
+ Requires-Dist: requests
37
+ Requires-Dist: plotly
38
+ Requires-Dist: kaleido
39
+ Requires-Dist: editdistance
40
+ Requires-Dist: networkx
41
+ Dynamic: author
42
+ Dynamic: home-page
43
+ Dynamic: license-file
44
+ Dynamic: requires-python
45
+
46
+ # Satellome
47
+
48
+ A comprehensive bioinformatics tool for analyzing satellite DNA (tandem repeats) in telomere-to-telomere (T2T) genome assemblies.
49
+
50
+ ## Overview
51
+
52
+ Satellome integrates Tandem Repeats Finder (TRF) to identify, classify, and visualize repetitive DNA sequences, with a particular focus on centromeric and telomeric regions. It provides a complete pipeline from raw genome sequences to detailed visualizations and reports of tandem repeat patterns.
53
+
54
+ The tool is designed to work with various genome assembly projects including:
55
+ - T2T (Telomere-to-Telomere) Consortium assemblies
56
+ - DNA Zoo chromosome-length assemblies
57
+ - VGP (Vertebrate Genomes Project) assemblies
58
+ - NCBI RefSeq and GenBank assemblies
59
+
60
+ ## Features
61
+
62
+ - **Tandem Repeat Detection**: Automated detection using TRF with optimized parameters
63
+ - **Smart Classification**: Categorizes repeats into microsatellites, complex repeats, and other types
64
+ - **Rich Visualizations**: Generates karyotype plots, 3D visualizations, and distance matrices
65
+ - **Annotation Integration**: Supports GFF3 and RepeatMasker annotations
66
+ - **Parallel Processing**: Efficient handling of multiple genomes
67
+ - **Smart Pipeline**: Automatically skips completed steps (override with `--force`)
68
+
69
+ ## Installation
70
+
71
+ ### Prerequisites
72
+
73
+ - Python 3.9 or higher
74
+ - Conda (recommended) or pip
75
+ - TRF (Tandem Repeats Finder) binary
76
+
77
+ ### Quick Setup
78
+
79
+ 1. **Clone the repository**
80
+ ```bash
81
+ git clone https://github.com/aglabx/satellome.git
82
+ cd satellome
83
+ ```
84
+
85
+ 2. **Create conda environment**
86
+ ```bash
87
+ conda create -n satellome python=3.9
88
+ conda activate satellome
89
+ ```
90
+
91
+ 3. **Install dependencies**
92
+ ```bash
93
+ pip install -r requirements.txt
94
+ ```
95
+
96
+ 4. **Install satellome**
97
+ ```bash
98
+ pip install -e . # Development mode
99
+ # or
100
+ pip install . # Production mode
101
+ ```
102
+
103
+ 5. **Download TRF binary**
104
+ ```bash
105
+ # Linux
106
+ wget https://github.com/Benson-Genomics-Lab/TRF/releases/download/v4.09.1/trf409.linux64
107
+ chmod +x trf409.linux64
108
+ mv trf409.linux64 trf
109
+
110
+ # macOS
111
+ wget https://github.com/Benson-Genomics-Lab/TRF/releases/download/v4.09.1/trf409.macosx
112
+ chmod +x trf409.macosx
113
+ mv trf409.macosx trf
114
+ ```
115
+
116
+ ## Usage
117
+
118
+ ### Basic Command
119
+
120
+ ```bash
121
+ satellome -i genome.fasta -o output_dir -p project_name -t 8
122
+ ```
123
+
124
+ ### Advanced Options
125
+
126
+ ```bash
127
+ # With GFF3 annotations
128
+ satellome -i genome.fasta -o output_dir -p project_name -t 8 --gff annotations.gff3
129
+
130
+ # With RepeatMasker annotations
131
+ satellome -i genome.fasta -o output_dir -p project_name -t 8 --rm repeatmasker.out
132
+
133
+ # Force rerun all steps
134
+ satellome -i genome.fasta -o output_dir -p project_name -t 8 --force
135
+
136
+ # Custom TRF binary path
137
+ satellome -i genome.fasta -o output_dir -p project_name -t 8 --trf /path/to/trf
138
+
139
+ # Parallel processing of multiple genomes
140
+ python scripts/run_satellome_parallel.py -i genomes_list.txt -o results_dir -t 32
141
+ ```
142
+
143
+ ### Parameters
144
+
145
+ - `-i, --input`: Input FASTA file (required)
146
+ - `-o, --output`: Output directory (required)
147
+ - `-p, --project`: Project name (required)
148
+ - `-t, --threads`: Number of threads (default: 1)
149
+ - `--gff`: GFF3 annotation file (optional)
150
+ - `--rm`: RepeatMasker output file (optional)
151
+ - `--trf`: Path to TRF binary (default: "trf")
152
+ - `--force`: Force rerun all steps
153
+
154
+ ## Output Structure
155
+
156
+ ```
157
+ output_dir/
158
+ ├── genome_name.trf # Main TRF output file
159
+ ├── genome_name.1kb.trf # Repeats >1kb
160
+ ├── genome_name.3kb.trf # Repeats >3kb
161
+ ├── genome_name.10kb.trf # Repeats >10kb
162
+ ├── genome_name.micro.trf # Microsatellites (1-9 bp monomers)
163
+ ├── genome_name.complex.trf # Complex repeats (>9 bp monomers)
164
+ ├── genome_name.pmicro.trf # Potential microsatellites
165
+ ├── genome_name.tssr.trf # Tandem simple sequence repeats
166
+ ├── genome_name.*.gff3 # GFF3 format files for each category
167
+ ├── genome_name.*.fa # FASTA files with repeat sequences
168
+ ├── distances.tsv.* # Distance matrices with various extensions
169
+ ├── images/
170
+ │ ├── *.png # Karyotype and other visualizations
171
+ │ └── *.svg # Vector graphics versions
172
+ └── reports/
173
+ ├── satellome_report.html # Comprehensive HTML report
174
+ └── annotation_report.txt # Annotation intersection report (if GFF provided)
175
+ ```
176
+
177
+ ## Classification System
178
+
179
+ Satellome classifies tandem repeats into four categories:
180
+
181
+ 1. **micro**: Microsatellites (monomer length 1-9 bp)
182
+ 2. **complex**: Complex repeats (monomer length >9 bp)
183
+ 3. **pmicro**: Potential microsatellites
184
+ 4. **tssr**: Tandem simple sequence repeats
185
+
186
+ ## Utility Scripts
187
+
188
+ ### Format Conversion
189
+ ```bash
190
+ # Convert TRF to FASTA
191
+ python scripts/trf_to_fasta.py -i repeats.trf -o repeats.fasta
192
+
193
+ # Convert TRF to GFF3
194
+ python scripts/trf_to_gff3.py -i repeats.trf -o repeats.gff3
195
+
196
+ # Extract coordinates
197
+ python scripts/trf_to_coordinates.py -i repeats.trf -o coordinates.txt
198
+ ```
199
+
200
+ ### Analysis Tools
201
+ ```bash
202
+ # Extract large tandem repeats
203
+ python scripts/trf_get_large.py -i repeats.trf -m 1000 -o large_repeats.trf
204
+
205
+ # Get microsatellite statistics
206
+ python scripts/trf_get_micro_stat.py -i repeats.trf -o micro_stats.txt
207
+
208
+ # Check telomeric repeats
209
+ python scripts/check_telomeres.py -i genome.fasta -t repeats.trf
210
+ ```
211
+
212
+ ## Example Workflow
213
+
214
+ ### 1. Download Test Dataset
215
+ ```bash
216
+ # Download S. cerevisiae genome
217
+ curl -OJX GET "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_000146045.2/download?include_annotation_type=GENOME_FASTA,GENOME_GFF&filename=GCF_000146045.2.zip" -H "Accept: application/zip"
218
+ unzip GCF_000146045.2.zip
219
+ ```
220
+
221
+ ### 2. Run Analysis
222
+ ```bash
223
+ # Run satellome pipeline
224
+ satellome -i ncbi_dataset/data/GCF_000146045.2/GCF_000146045.2_R64_genomic.fna \
225
+ -o results \
226
+ -p scerevisiae \
227
+ -t 8 \
228
+ --gff ncbi_dataset/data/GCF_000146045.2/genomic.gff
229
+
230
+ # View results
231
+ open results/scerevisiae_report.html
232
+ ```
233
+
234
+ ### 3. Analyzing DNA Zoo Assemblies
235
+ ```bash
236
+ # Download a DNA Zoo assembly (example: Cheetah, Acinonyx jubatus)
237
+ wget https://dnazoo.s3.wasabisys.com/Acinonyx_jubatus/aciJub1_HiC.fasta.gz
238
+ gzip -d aciJub1_HiC.fasta.gz
239
+
240
+ # Run satellome on DNA Zoo assembly
241
+ satellome -i aciJub1_HiC.fasta \
242
+ -o dnazoo_results \
243
+ -p cheetah \
244
+ -t 8
245
+ ```
246
+
247
+ ## Configuration
248
+
249
+ The pipeline uses `settings.yaml` for tool parameters. Key settings include:
250
+
251
+ - TRF parameters (match/mismatch scores, indel penalties)
252
+ - Minimum/maximum repeat lengths
253
+ - Classification thresholds
254
+ - Visualization parameters
255
+
256
+ ## Testing
257
+
258
+ Run the test suite:
259
+ ```bash
260
+ python tests/test_overlapping.py
261
+ python test_standalone.py
262
+ python test_chromosome_sorting.py
263
+ ```
264
+
265
+ ## Contributing
266
+
267
+ 1. Fork the repository
268
+ 2. Create a feature branch (`git checkout -b feature/amazing-feature`)
269
+ 3. Commit your changes (`git commit -m 'Add amazing feature'`)
270
+ 4. Push to the branch (`git push origin feature/amazing-feature`)
271
+ 5. Open a Pull Request
272
+
273
+ ## Citation
274
+
275
+ If you use Satellome in your research, please cite:
276
+
277
+ ```
278
+ Komissarov A. et al. (2024). Satellome: A comprehensive tool for satellite DNA
279
+ analysis in T2T genome assemblies. [Publication details]
280
+ ```
281
+
282
+ ## License
283
+
284
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
285
+
286
+ ## Support
287
+
288
+ - **Issues**: [GitHub Issues](https://github.com/aglabx/satellome/issues)
289
+ - **Documentation**: [Wiki](https://github.com/aglabx/satellome/wiki)
290
+ - **Email**: ad3002@gmail.com
291
+
292
+ ## Acknowledgments
293
+
294
+ - [Tandem Repeats Finder](https://github.com/Benson-Genomics-Lab/TRF) by Gary Benson
295
+ - [T2T Consortium](https://www.genome.gov/about-nhgri/telomere-to-telomere) for inspiring this work
296
+ - [DNA Zoo](https://www.dnazoo.org/) for providing chromosome-length assemblies
297
+ - [Vertebrate Genomes Project](https://vertebrategenomesproject.org/) for high-quality reference genomes