datasmryzr 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. datasmryzr-0.0.1/.github/workflows/ci.yml +29 -0
  2. datasmryzr-0.0.1/.github/workflows/python-package.yml +41 -0
  3. datasmryzr-0.0.1/.gitignore +12 -0
  4. datasmryzr-0.0.1/LICENSE.txt +9 -0
  5. datasmryzr-0.0.1/PKG-INFO +54 -0
  6. datasmryzr-0.0.1/README.md +28 -0
  7. datasmryzr-0.0.1/docs/index.md +153 -0
  8. datasmryzr-0.0.1/mkdocs.yml +31 -0
  9. datasmryzr-0.0.1/pyproject.toml +79 -0
  10. datasmryzr-0.0.1/src/datasmryzr/__about__.py +4 -0
  11. datasmryzr-0.0.1/src/datasmryzr/__init__.py +3 -0
  12. datasmryzr-0.0.1/src/datasmryzr/annotate.py +270 -0
  13. datasmryzr-0.0.1/src/datasmryzr/clusters.py +315 -0
  14. datasmryzr-0.0.1/src/datasmryzr/core_genome.py +297 -0
  15. datasmryzr-0.0.1/src/datasmryzr/datasmryzr.py +97 -0
  16. datasmryzr-0.0.1/src/datasmryzr/distances.py +106 -0
  17. datasmryzr-0.0.1/src/datasmryzr/pangenome.py +211 -0
  18. datasmryzr-0.0.1/src/datasmryzr/smryz.py +472 -0
  19. datasmryzr-0.0.1/src/datasmryzr/summary.py +110 -0
  20. datasmryzr-0.0.1/src/datasmryzr/tables.py +243 -0
  21. datasmryzr-0.0.1/src/datasmryzr/templates/base_config.json +28 -0
  22. datasmryzr-0.0.1/src/datasmryzr/templates/report.html.j2 +899 -0
  23. datasmryzr-0.0.1/src/datasmryzr/tree.py +27 -0
  24. datasmryzr-0.0.1/src/datasmryzr/utils.py +94 -0
  25. datasmryzr-0.0.1/tests/__init__.py +3 -0
  26. datasmryzr-0.0.1/tests/test_annotate.py +78 -0
  27. datasmryzr-0.0.1/tests/test_core_genome.py +103 -0
  28. datasmryzr-0.0.1/tests/test_distances.py +59 -0
  29. datasmryzr-0.0.1/tests/test_tables.py +88 -0
  30. datasmryzr-0.0.1/tests/test_tree.py +26 -0
  31. datasmryzr-0.0.1/tests/test_utils.py +21 -0
@@ -0,0 +1,29 @@
1
+ name: ci
2
+ on:
3
+ push:
4
+ branches:
5
+ - master
6
+ - main
7
+ permissions:
8
+ contents: write
9
+ jobs:
10
+ deploy:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+ - name: Configure Git Credentials
15
+ run: |
16
+ git config user.name github-actions[bot]
17
+ git config user.email 41898282+github-actions[bot]@users.noreply.github.com
18
+ - uses: actions/setup-python@v5
19
+ with:
20
+ python-version: 3.x
21
+ - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
22
+ - uses: actions/cache@v4
23
+ with:
24
+ key: mkdocs-material-${{ env.cache_id }}
25
+ path: .cache
26
+ restore-keys: |
27
+ mkdocs-material-
28
+ - run: pip install mkdocs-material
29
+ - run: mkdocs gh-deploy --force
@@ -0,0 +1,41 @@
1
+ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2
+ # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
3
+
4
+ name: Python package
5
+
6
+ on:
7
+ push:
8
+ branches: [ "master" ]
9
+ pull_request:
10
+ branches: [ "master" ]
11
+
12
+ jobs:
13
+ build:
14
+
15
+ runs-on: ubuntu-latest
16
+ strategy:
17
+ fail-fast: false
18
+ matrix:
19
+ python-version: ["3.10", "3.11"]
20
+
21
+ steps:
22
+ - uses: actions/checkout@v4
23
+ - name: Set up Python ${{ matrix.python-version }}
24
+ uses: actions/setup-python@v3
25
+ with:
26
+ python-version: ${{ matrix.python-version }}
27
+ - name: Install dependencies
28
+ run: |
29
+ python -m pip install --upgrade pip
30
+ python -m pip install flake8 pytest
31
+ python -m pip install .
32
+ if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
33
+ - name: Lint with flake8
34
+ run: |
35
+ # stop the build if there are Python syntax errors or undefined names
36
+ flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
37
+ # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
38
+ flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
39
+ - name: Test with pytest
40
+ run: |
41
+ pytest
@@ -0,0 +1,12 @@
1
+ .vscode
2
+ .pytest_cache
3
+ __pycache__
4
+ .pypirc
5
+ token.json
6
+ *.egg-info
7
+ *.egg
8
+ *.whl
9
+ *.tar.gz
10
+ *.zip
11
+ *.pyc
12
+ deploy.py
@@ -0,0 +1,9 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025-present Kristy <kristyhoran15@gmail.com>
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6
+
7
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8
+
9
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,54 @@
1
+ Metadata-Version: 2.4
2
+ Name: datasmryzr
3
+ Version: 0.0.1
4
+ Project-URL: Documentation, https://github.com/kristyhoran/datasmryzr#readme
5
+ Project-URL: Issues, https://github.com/kristyhoran/datasmryzr/issues
6
+ Project-URL: Source, https://github.com/kristyhoran/datasmryzr
7
+ Author-email: Kristy Horan <kristyhoran15@gmail.com>
8
+ License-Expression: MIT
9
+ License-File: LICENSE.txt
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Programming Language :: Python
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: Implementation :: CPython
16
+ Classifier: Programming Language :: Python :: Implementation :: PyPy
17
+ Requires-Python: >=3.10
18
+ Requires-Dist: altair
19
+ Requires-Dist: biopython
20
+ Requires-Dist: click
21
+ Requires-Dist: jinja2
22
+ Requires-Dist: mycolorpy
23
+ Requires-Dist: numpy
24
+ Requires-Dist: pandas
25
+ Description-Content-Type: text/markdown
26
+
27
+ # datasmryzr
28
+
29
+ <!-- [![PyPI - Version](https://img.shields.io/pypi/v/datasmryzr.svg)](https://pypi.org/project/datasmryzr)
30
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/datasmryzr.svg)](https://pypi.org/project/datasmryzr) -->
31
+
32
+ ![Python package](https://github.com/kristyhoran/datasmryzr/actions/workflows/python-package.yml/badge.svg)
33
+
34
+ -----
35
+
36
+ Datasmryzr is a small tool that is designed to collate and render pathogen genomics analysis results, such as trees, tables and matrices as a `.html` for sharing and interrogation.
37
+
38
+ ## Table of Contents
39
+
40
+ - [Installation](#installation)
41
+ - [Usage](#usage)
42
+ - [License](#license)
43
+
44
+ ## Installation
45
+
46
+ ```console
47
+ pip install datasmryzr
48
+ ```
49
+ ## Usage
50
+
51
+
52
+ ## License
53
+
54
+ `datasmryzr` is distributed under the terms of the [MIT](https://spdx.org/licenses/MIT.html) license.
@@ -0,0 +1,28 @@
1
+ # datasmryzr
2
+
3
+ <!-- [![PyPI - Version](https://img.shields.io/pypi/v/datasmryzr.svg)](https://pypi.org/project/datasmryzr)
4
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/datasmryzr.svg)](https://pypi.org/project/datasmryzr) -->
5
+
6
+ ![Python package](https://github.com/kristyhoran/datasmryzr/actions/workflows/python-package.yml/badge.svg)
7
+
8
+ -----
9
+
10
+ Datasmryzr is a small tool that is designed to collate and render pathogen genomics analysis results, such as trees, tables and matrices as a `.html` for sharing and interrogation.
11
+
12
+ ## Table of Contents
13
+
14
+ - [Installation](#installation)
15
+ - [Usage](#usage)
16
+ - [License](#license)
17
+
18
+ ## Installation
19
+
20
+ ```console
21
+ pip install datasmryzr
22
+ ```
23
+ ## Usage
24
+
25
+
26
+ ## License
27
+
28
+ `datasmryzr` is distributed under the terms of the [MIT](https://spdx.org/licenses/MIT.html) license.
@@ -0,0 +1,153 @@
1
+ # Welcome to datasmryzr documentation
2
+
3
+ ![Python package](https://github.com/kristyhoran/datasmryzr/actions/workflows/python-package.yml/badge.svg)
4
+
5
+
6
+ Datasmryzr is a small command-line tool that is designed to collate and render pathogen genomics analysis results, such as trees, tables and matrices as a `.html` for sharing and interactive interrogation.
7
+
8
+ ## Installation
9
+
10
+ `datasmryzr` is a Python package with simple dependencies. It is recommended to use a virtual or conda environment for installation.
11
+
12
+ ```
13
+ <activate environment>
14
+ pip3 install git+https://github.com/kristyhoran/datasmryzr
15
+ ```
16
+
17
+ ## Usage
18
+
19
+ `datasmryzr` allows users to generate standalone `.html` files for visualisation, searching and sharing of genomic analysis results. It has been designed for pathogen genomics needs, but could also be used for other types of tabular and newick based data.
20
+
21
+ ### Inputs
22
+
23
+ Any combination of tables, newick or vcf can be supplied for generation of the html report. However, there are some things to be aware of in order to get a good result.
24
+
25
+ * Tabular data (comma or tab-delimited files) can be used as input.
26
+ * All columns in input tables will be rendered - in the order that they are supplied.
27
+ * File names will be used as menu labels in the `.html` file - so use sensible ones :wink:
28
+
29
+ * Pairwise matrix data (comma or tab-delimited files) can be supplied to generate distributions and heatmaps of distances.
30
+
31
+ * Newick file - a single newick file can be supplied per report.
32
+
33
+ * An annotation file can also be supplied if you want to be able to add colored annotations to the tree. This file can be one that has been already specified as tabular data or it can be an additional file.
34
+ * The first column in the annotation file MUST contain names that match exactly to the tiplabels in the newick file.
35
+ * If you only want a subset of values from the annotation file, you can supply these as a comma separated list in the command.
36
+ * Only categorical variables can be visualised on the tree at the moment. If you want to display numerical values as categorical, you can provide this information in a configuration file (see below for details).
37
+
38
+ * If you would like to visualise the distribution of variants across the reference genome, you will need to also supply the vcf, with all samples in it (for example the core.vcf output from snippy), a reference genome and a mask file if one has been used.
39
+
40
+ #### Config
41
+
42
+ You can supply a configuration file, with comments and categorical columns (for tree annotation) using `--config` ([see below](#trees))
43
+
44
+ #### Utility options
45
+
46
+ As with most tools, you can supply various options to customise your report
47
+
48
+ * `--output` - this is the path to where you want your report saved. Defaults to current working directory.
49
+ * `--title` - you can give your report a title - this will appear at the top left-hand side of your document.
50
+ * `--description` - you can also add in a brief sentence to describe your report - this will appear as text below the title
51
+ * `--filename` - you can supply a custom name for the output file - defaults to `datasmryzr`
52
+ * `--background_color` - defaults to `#546d78` (blue grey). This is the color used in header, titles and bar graphs.
53
+ * `--font_color` - defaults to `#ffffff`.
54
+
55
+ ## Cookbook
56
+
57
+ ### Simple tables
58
+
59
+ You can generate an html with just tabular data. Any tabular data can be used, csv or tab-delimited files (support for .xlsx coming soon).
60
+
61
+ **Example command**
62
+
63
+ ``` bash
64
+ datasmryzr --title 'A new report' --filename filename1.txt --filename filename2.csv --filename filename3.tsv
65
+ ```
66
+
67
+ ### Trees
68
+
69
+ Commonly pathogen genomics analyses involve the generation of a tree of some sort, which can be challenging to visualise and contextualise with other types of data. In order to generate a report with a tree and annotation, you will need a newick file and file with the data you wish to display on the tree.
70
+
71
+ **Pro-tips**
72
+
73
+ * The first column of the annotation file must contain the tiplabels of the tree you wish to annotate. If a tiplabel is not present in the file, no annotation will be assigned to those tips.
74
+ * Only categorical values will be annotated on the tree. If you have numeric values that should be annotated as categorical on your tree you can supply a configuration file
75
+
76
+ ```json
77
+ {
78
+ "comments":{
79
+ "some_file_stub": "some comment"},
80
+ "datatype": {
81
+ "MLST":"input",
82
+ "ST":"input"
83
+ },
84
+ "categorical_columns": [
85
+ "MLST",
86
+ "ST"
87
+ ]
88
+
89
+ }
90
+ ```
91
+
92
+ * Colours will be randomly assigned - we do not yet support custom colour schemes.
93
+
94
+ **Example command**
95
+
96
+ ```bash
97
+ datasmryzr --tree tree.newick --annotate annotation_file.csv --annotate_cols cat1,cat4,cat5
98
+ ```
99
+
100
+ ### Core genome
101
+
102
+ Sometimes it is useful to see where SNPs are concentrated when you use core genome alignment to a reference genome. If you supply `datasmryzr` with
103
+ 1. vcf file, for example the `core.vcf` output from [snippy](https://www.google.com/search?client=firefox-b-d&q=snippy+github)
104
+ 2. reference genome (fasta format or genbank) used for alignment
105
+ 3. Alignment statistics, for example the `core.txt` from [snippy](https://www.google.com/search?client=firefox-b-d&q=snippy+github).
106
+ 4. A mask file in bed format (optional - advised if used in analysis as the masked regions will be colored in grey).
107
+
108
+ **Example command**
109
+
110
+ ```bash
111
+ datasmryzr --filename core.txt --core-genome core.vcf -r ref.fa -m mask.bed
112
+ ```
113
+
114
+ ### Distance matrix
115
+
116
+ A very common question that is asked in microbial genomics is 'how far apart these things are', where things can be distances which represent SNPs, alleles or some other feature. If you supply `datasmryzr` with a distance matrix, you can generate a heatmap and pairwise distribution plots in your report.
117
+
118
+ **Example command**
119
+
120
+ ```bash
121
+ datasmryzr --distance-matrix distances.txt
122
+ ```
123
+
124
+ ## Exploring the html
125
+
126
+ ### The Tree
127
+
128
+ NEED TO ADD IMAGE/VIDEO
129
+
130
+ By default, if you have supplied a tree, this will be the first view that is loaded.
131
+
132
+ * You can select which columns to annotate onto the tree by selecting the `annotate` dropdown menu item and see the legend by using the `toggle-legend` menu.
133
+ * On large trees you can select internal nodes to zoom to that subtree.
134
+ * Trees can be exported as newick files or the current view exported as a png.
135
+
136
+ ### Distances
137
+
138
+ NEED TO ADD IMAGE/VIDEO
139
+
140
+ If you have supplied a distance matrix, you will see a `Distance` menu; if you select this, you can choose a graph to display.
141
+ * Graph images can be saved by clicking on the three dots to the right of the image.
142
+ * Rolling over blocks on the heatmap will display a tooltip with the names of pairs and the distances.
143
+
144
+ ### Core genome graph
145
+
146
+ The distribution of SNPs across the reference genome supplied is displayed in bins of SNPs per 5MB. You can use scroll to zoom in on a section of the genome and roll over the bars to see the number of SNPs at each position.
147
+
148
+ ### Other tables
149
+
150
+ * Tabular data can be sorted and searched using the input boxes at the top of each column.
151
+ * Adjust the width of a column by hovering over the edge of the column and dragging it left or right
152
+ * Download the tables for each table. As the creator - obviously this is not that useful to you - but can be useful for collaborators or others that you may share the data with.
153
+
@@ -0,0 +1,31 @@
1
+ site_name: datasmryzr
2
+ site_url: https://kristyhoran.github.io/datasmryzr/
3
+
4
+ theme:
5
+ name: material
6
+ palette:
7
+ primary: blue grey
8
+ icon:
9
+ repo: fontawesome/brands/github-alt
10
+
11
+ features:
12
+ - content.code.copy
13
+ - content.code.select
14
+
15
+ repo_url: https://github.com/kristyhoran/datasmryzr
16
+ repo_name: kristyhoran/datasmryzr
17
+ extra:
18
+ social:
19
+ - icon: fontawesome/brands/github
20
+ link: https://github.com/kristyhoran/datasmryzr
21
+
22
+ markdown_extensions:
23
+ - attr_list
24
+ - pymdownx.emoji:
25
+ emoji_index: !!python/name:material.extensions.emoji.twemoji
26
+ emoji_generator: !!python/name:material.extensions.emoji.to_svg
27
+ - pymdownx.superfences:
28
+ custom_fences:
29
+ - name: mermaid
30
+ class: mermaid
31
+ format: !!python/name:pymdownx.superfences.fence_code_format
@@ -0,0 +1,79 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "datasmryzr"
7
+ dynamic = ["version"]
8
+ description = ''
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = "MIT"
12
+ keywords = []
13
+ authors = [
14
+ { name = "Kristy Horan", email = "kristyhoran15@gmail.com" },
15
+ ]
16
+ classifiers = [
17
+ "Development Status :: 4 - Beta",
18
+ "Programming Language :: Python",
19
+ "Programming Language :: Python :: 3.10",
20
+ "Programming Language :: Python :: 3.11",
21
+ "Programming Language :: Python :: 3.12",
22
+ "Programming Language :: Python :: Implementation :: CPython",
23
+ "Programming Language :: Python :: Implementation :: PyPy",
24
+ ]
25
+ dependencies = [
26
+ "click",
27
+ "pandas",
28
+ "jinja2",
29
+ "altair",
30
+ "mycolorpy",
31
+ "numpy",
32
+ "biopython"
33
+ ]
34
+
35
+ [project.urls]
36
+ Documentation = "https://github.com/kristyhoran/datasmryzr#readme"
37
+ Issues = "https://github.com/kristyhoran/datasmryzr/issues"
38
+ Source = "https://github.com/kristyhoran/datasmryzr"
39
+
40
+
41
+ [project.scripts]
42
+ datasmryzr = "datasmryzr.datasmryzr:smryzr"
43
+
44
+
45
+ [tool.hatch.version]
46
+ path = "src/datasmryzr/__about__.py"
47
+
48
+ [tool.hatch.envs.types]
49
+ extra-dependencies = [
50
+ "mypy>=1.0.0",
51
+ "click",
52
+ "pandas",
53
+ "jinja2",
54
+ "altair",
55
+ "mycolorpy",
56
+ "numpy",
57
+ "biopython"
58
+ ]
59
+ [tool.hatch.envs.types.scripts]
60
+ check = "mypy --install-types --non-interactive {args:src/datasmryzr tests}"
61
+
62
+ [tool.coverage.run]
63
+ source_pkgs = ["datasmryzr", "tests"]
64
+ branch = true
65
+ parallel = true
66
+ omit = [
67
+ "src/datasmryzr/__about__.py",
68
+ ]
69
+
70
+ [tool.coverage.paths]
71
+ datasmryzr = ["src/datasmryzr", "*/datasmryzr/src/datasmryzr","src/datasmryzr/templates/*"]
72
+ tests = ["tests", "*/datasmryzr/tests"]
73
+
74
+ [tool.coverage.report]
75
+ exclude_lines = [
76
+ "no cov",
77
+ "if __name__ == .__main__.:",
78
+ "if TYPE_CHECKING:",
79
+ ]
@@ -0,0 +1,4 @@
1
+ # SPDX-FileCopyrightText: 2025-present Kristy <kristyhoran15@gmail.com>
2
+ #
3
+ # SPDX-License-Identifier: MIT
4
+ __version__ = "0.0.1"
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: 2025-present Kristy <kristyhoran15@gmail.com>
2
+ #
3
+ # SPDX-License-Identifier: MIT