paraphrase 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- paraphrase-0.1.0/.github/CODEOWNERS +5 -0
- paraphrase-0.1.0/.github/pull_request_template.md +13 -0
- paraphrase-0.1.0/.github/workflows/keep_a_changelog.yml +15 -0
- paraphrase-0.1.0/.github/workflows/linting.yml +13 -0
- paraphrase-0.1.0/.github/workflows/release.yml +21 -0
- paraphrase-0.1.0/.github/workflows/test.yml +38 -0
- paraphrase-0.1.0/.gitignore +11 -0
- paraphrase-0.1.0/.pre-commit-config.yaml +18 -0
- paraphrase-0.1.0/.prettierignore +1 -0
- paraphrase-0.1.0/.python-version +1 -0
- paraphrase-0.1.0/CHANGELOG.md +25 -0
- paraphrase-0.1.0/LICENCE +21 -0
- paraphrase-0.1.0/PKG-INFO +74 -0
- paraphrase-0.1.0/README.md +61 -0
- paraphrase-0.1.0/pyproject.toml +42 -0
- paraphrase-0.1.0/src/__init__.py +3 -0
- paraphrase-0.1.0/src/paraphrase/__init__.py +1 -0
- paraphrase-0.1.0/src/paraphrase/config.py +17 -0
- paraphrase-0.1.0/src/paraphrase/constants.py +34 -0
- paraphrase-0.1.0/src/paraphrase/exceptions.py +28 -0
- paraphrase-0.1.0/src/paraphrase/io.py +108 -0
- paraphrase-0.1.0/src/paraphrase/main.py +92 -0
- paraphrase-0.1.0/src/paraphrase/pipeline.py +26 -0
- paraphrase-0.1.0/src/paraphrase/processors.py +145 -0
- paraphrase-0.1.0/src/paraphrase/rules_engine.py +251 -0
- paraphrase-0.1.0/src/paraphrase/utils.py +0 -0
- paraphrase-0.1.0/test-data/HG002.paraphase.json +139513 -0
- paraphrase-0.1.0/test-data/HG003.paraphase.json +135075 -0
- paraphrase-0.1.0/test-data/HG004.paraphase.json +140297 -0
- paraphrase-0.1.0/test-data/rules.yaml +64 -0
- paraphrase-0.1.0/tests/test_io_tsv.py +31 -0
- paraphrase-0.1.0/tests/test_processors.py +137 -0
- paraphrase-0.1.0/tests/test_rules_engine.py +63 -0
- paraphrase-0.1.0/tests/test_rules_not_empty.py +18 -0
- paraphrase-0.1.0/uv.lock +335 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
name: "Changelog Reminder"
|
|
2
|
+
on:
|
|
3
|
+
pull_request:
|
|
4
|
+
types: [opened, synchronize, reopened, ready_for_review, labeled, unlabeled]
|
|
5
|
+
|
|
6
|
+
jobs:
|
|
7
|
+
# Enforces the update of a changelog file on every pull request
|
|
8
|
+
changelog:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
steps:
|
|
11
|
+
- uses: actions/checkout@v6
|
|
12
|
+
- uses: dangoslen/changelog-enforcer@v3
|
|
13
|
+
with:
|
|
14
|
+
changeLogPath: "CHANGELOG.md"
|
|
15
|
+
skipLabels: "Skip-Changelog"
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- v*
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
pypi:
|
|
10
|
+
name: Build and publish to PyPI
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
# Environment and permissions trusted publishing.
|
|
13
|
+
environment:
|
|
14
|
+
name: release
|
|
15
|
+
permissions:
|
|
16
|
+
id-token: write
|
|
17
|
+
steps:
|
|
18
|
+
- uses: actions/checkout@v6
|
|
19
|
+
- uses: astral-sh/setup-uv@v7
|
|
20
|
+
- run: uv build
|
|
21
|
+
- run: uv publish --trusted-publishing always
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
name: Tests
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches:
|
|
6
|
+
- main
|
|
7
|
+
pull_request:
|
|
8
|
+
branches:
|
|
9
|
+
- main
|
|
10
|
+
|
|
11
|
+
jobs:
|
|
12
|
+
test:
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v6
|
|
17
|
+
|
|
18
|
+
- name: Install uv
|
|
19
|
+
uses: astral-sh/setup-uv@v7
|
|
20
|
+
with:
|
|
21
|
+
version: "latest"
|
|
22
|
+
|
|
23
|
+
- name: Set up Python
|
|
24
|
+
uses: actions/setup-python@v6
|
|
25
|
+
with:
|
|
26
|
+
python-version-file: ".python-version"
|
|
27
|
+
|
|
28
|
+
- name: Install dependencies
|
|
29
|
+
run: uv sync
|
|
30
|
+
|
|
31
|
+
- name: Run tests with coverage
|
|
32
|
+
run: uv run pytest --cov=src --cov-report=xml --cov-report=term
|
|
33
|
+
|
|
34
|
+
- name: Upload coverage to Codecov
|
|
35
|
+
uses: codecov/codecov-action@v4
|
|
36
|
+
with:
|
|
37
|
+
file: ./coverage.xml
|
|
38
|
+
fail_ci_if_error: false
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: https://github.com/pre-commit/mirrors-prettier
|
|
3
|
+
rev: "v3.1.0"
|
|
4
|
+
hooks:
|
|
5
|
+
- id: prettier
|
|
6
|
+
additional_dependencies:
|
|
7
|
+
- prettier
|
|
8
|
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
9
|
+
rev: v6.0.0
|
|
10
|
+
hooks:
|
|
11
|
+
- id: trailing-whitespace
|
|
12
|
+
args: [--markdown-linebreak-ext=md]
|
|
13
|
+
- id: end-of-file-fixer
|
|
14
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
15
|
+
rev: v0.15.2
|
|
16
|
+
hooks:
|
|
17
|
+
- id: ruff-check
|
|
18
|
+
- id: ruff-format
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
test-data/HG*
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [0.1.0] - 2026-02-24
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
|
|
12
|
+
- [#2](https://github.com/Clinical-Genomics/paraphrase/pull/2) - LICENCE
|
|
13
|
+
- [#2](https://github.com/Clinical-Genomics/paraphrase/pull/2) - Paraphase JSON parsing
|
|
14
|
+
- [#2](https://github.com/Clinical-Genomics/paraphrase/pull/2) - Testing framework
|
|
15
|
+
- [#3](https://github.com/Clinical-Genomics/paraphrase/pull/3) - CHANGELOG and reminder workflow
|
|
16
|
+
- [#3](https://github.com/Clinical-Genomics/paraphrase/pull/3) - CODEOWNERS and PR template
|
|
17
|
+
- [#4](https://github.com/Clinical-Genomics/paraphrase/pull/4) - Paraphase test data
|
|
18
|
+
- [#7](https://github.com/Clinical-Genomics/paraphrase/pull/7) - Release workflow
|
|
19
|
+
- [#11](https://github.com/Clinical-Genomics/paraphrase/pull/11) - Pre-commit hooks and GitHub actions workflow for prettier and ruff
|
|
20
|
+
|
|
21
|
+
### Changed
|
|
22
|
+
|
|
23
|
+
- [#8](https://github.com/Clinical-Genomics/paraphrase/pull/8) - Phase regions to output
|
|
24
|
+
- [#9](https://github.com/Clinical-Genomics/paraphrase/pull/9) - Rework nested separators for TSV output
|
|
25
|
+
- [#10](https://github.com/Clinical-Genomics/paraphrase/pull/10) - Updated CODEOWNERS
|
paraphrase-0.1.0/LICENCE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Felix Lenner
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: paraphrase
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Parse paraphase JSONs
|
|
5
|
+
Author-email: Felix Lenner <felix.lenner@scilifelab.se>
|
|
6
|
+
License: MIT License
|
|
7
|
+
License-File: LICENCE
|
|
8
|
+
Requires-Python: >=3.12
|
|
9
|
+
Requires-Dist: coloredlogs>=15.0.1
|
|
10
|
+
Requires-Dist: pyyaml>=6.0.3
|
|
11
|
+
Requires-Dist: typer>=0.20.0
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
# Paraphrase
|
|
15
|
+
|
|
16
|
+
Parse paraphase JSONs.
|
|
17
|
+
|
|
18
|
+
## Usage
|
|
19
|
+
|
|
20
|
+
```
|
|
21
|
+
Usage: paraphrase [OPTIONS]
|
|
22
|
+
|
|
23
|
+
Parse paraphase JSONs.
|
|
24
|
+
|
|
25
|
+
╭─ Options ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
|
|
26
|
+
│ * --input -f FILE Input JSON files (can be multiple) [required] │
|
|
27
|
+
│ * --sample -s TEXT Sample names corresponding to input JSON files [required] │
|
|
28
|
+
│ --rules -r FILE Optional YAML file with per-gene classification rules (adds 'status' fields) │
|
|
29
|
+
│ --skip-keys TEXT Comma-separated keys to skip (e.g. region_depth,final_haplotypes) │
|
|
30
|
+
│ --genes TEXT Optional comma-separated list of gene names to process │
|
|
31
|
+
│ --output-format -o TEXT Output format: 'json' (default) or 'tsv' [default: json] │
|
|
32
|
+
│ --help Show this message and exit. │
|
|
33
|
+
╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
Example command:
|
|
37
|
+
|
|
38
|
+
```
|
|
39
|
+
uv run paraphrase \
|
|
40
|
+
--input test-data/HG002.paraphase.json \
|
|
41
|
+
--input test-data/HG003.paraphase.json \
|
|
42
|
+
--input test-data/HG004.paraphase.json \
|
|
43
|
+
--sample HG002 \
|
|
44
|
+
--sample HG003 \
|
|
45
|
+
--sample HG004 \
|
|
46
|
+
--rules test-data/rules.yaml \
|
|
47
|
+
--genes CFH,CFHR3,f8,GBA,hba,ikbkg,ncf1,neb,opn1lw,pms2,rccx,smn1,strc
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Rules YAML (per-gene status classification)
|
|
51
|
+
|
|
52
|
+
Rules are evaluated per gene. Conditions within a single `when` mapping are
|
|
53
|
+
combined with logical AND (all must be true). If multiple rules match, you can
|
|
54
|
+
provide a `status_order` to pick the most severe (last wins by order).
|
|
55
|
+
|
|
56
|
+
Example:
|
|
57
|
+
|
|
58
|
+
```yaml
|
|
59
|
+
smn1:
|
|
60
|
+
# Optional; defaults to "normal" if omitted
|
|
61
|
+
default_status: normal
|
|
62
|
+
# Optional; defaults to [normal, intermediate, pathological] if omitted
|
|
63
|
+
status_order: [normal, intermediate, pathological]
|
|
64
|
+
rules:
|
|
65
|
+
- status: intermediate
|
|
66
|
+
when:
|
|
67
|
+
smn1_cn: 0
|
|
68
|
+
smn2_cn: { ">=": 4 }
|
|
69
|
+
reason: "SMN1 deleted but SMN2 high"
|
|
70
|
+
- status: pathological
|
|
71
|
+
when:
|
|
72
|
+
smn1_cn: { "<": 2 }
|
|
73
|
+
reason: "SMN1 copy number low"
|
|
74
|
+
```
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# Paraphrase
|
|
2
|
+
|
|
3
|
+
Parse paraphase JSONs.
|
|
4
|
+
|
|
5
|
+
## Usage
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
Usage: paraphrase [OPTIONS]
|
|
9
|
+
|
|
10
|
+
Parse paraphase JSONs.
|
|
11
|
+
|
|
12
|
+
╭─ Options ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
|
|
13
|
+
│ * --input -f FILE Input JSON files (can be multiple) [required] │
|
|
14
|
+
│ * --sample -s TEXT Sample names corresponding to input JSON files [required] │
|
|
15
|
+
│ --rules -r FILE Optional YAML file with per-gene classification rules (adds 'status' fields) │
|
|
16
|
+
│ --skip-keys TEXT Comma-separated keys to skip (e.g. region_depth,final_haplotypes) │
|
|
17
|
+
│ --genes TEXT Optional comma-separated list of gene names to process │
|
|
18
|
+
│ --output-format -o TEXT Output format: 'json' (default) or 'tsv' [default: json] │
|
|
19
|
+
│ --help Show this message and exit. │
|
|
20
|
+
╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
Example command:
|
|
24
|
+
|
|
25
|
+
```
|
|
26
|
+
uv run paraphrase \
|
|
27
|
+
--input test-data/HG002.paraphase.json \
|
|
28
|
+
--input test-data/HG003.paraphase.json \
|
|
29
|
+
--input test-data/HG004.paraphase.json \
|
|
30
|
+
--sample HG002 \
|
|
31
|
+
--sample HG003 \
|
|
32
|
+
--sample HG004 \
|
|
33
|
+
--rules test-data/rules.yaml \
|
|
34
|
+
--genes CFH,CFHR3,f8,GBA,hba,ikbkg,ncf1,neb,opn1lw,pms2,rccx,smn1,strc
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Rules YAML (per-gene status classification)
|
|
38
|
+
|
|
39
|
+
Rules are evaluated per gene. Conditions within a single `when` mapping are
|
|
40
|
+
combined with logical AND (all must be true). If multiple rules match, you can
|
|
41
|
+
provide a `status_order` to pick the most severe (last wins by order).
|
|
42
|
+
|
|
43
|
+
Example:
|
|
44
|
+
|
|
45
|
+
```yaml
|
|
46
|
+
smn1:
|
|
47
|
+
# Optional; defaults to "normal" if omitted
|
|
48
|
+
default_status: normal
|
|
49
|
+
# Optional; defaults to [normal, intermediate, pathological] if omitted
|
|
50
|
+
status_order: [normal, intermediate, pathological]
|
|
51
|
+
rules:
|
|
52
|
+
- status: intermediate
|
|
53
|
+
when:
|
|
54
|
+
smn1_cn: 0
|
|
55
|
+
smn2_cn: { ">=": 4 }
|
|
56
|
+
reason: "SMN1 deleted but SMN2 high"
|
|
57
|
+
- status: pathological
|
|
58
|
+
when:
|
|
59
|
+
smn1_cn: { "<": 2 }
|
|
60
|
+
reason: "SMN1 copy number low"
|
|
61
|
+
```
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "paraphrase"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Parse paraphase JSONs"
|
|
5
|
+
authors = [{name="Felix Lenner", email="felix.lenner@scilifelab.se"}]
|
|
6
|
+
license = {text = "MIT License"}
|
|
7
|
+
readme = "README.md"
|
|
8
|
+
requires-python = ">=3.12"
|
|
9
|
+
dependencies = [
|
|
10
|
+
"coloredlogs>=15.0.1",
|
|
11
|
+
"pyyaml>=6.0.3",
|
|
12
|
+
"typer>=0.20.0",
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
[project.scripts]
|
|
16
|
+
paraphrase = "paraphrase.main:app"
|
|
17
|
+
|
|
18
|
+
[build-system]
|
|
19
|
+
requires = ["hatchling"]
|
|
20
|
+
build-backend = "hatchling.build"
|
|
21
|
+
|
|
22
|
+
[dependency-groups]
|
|
23
|
+
dev = [
|
|
24
|
+
"pytest-cov>=7.0.0",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[tool.pytest.ini_options]
|
|
28
|
+
testpaths = ["tests"]
|
|
29
|
+
addopts = "--cov=src --cov-report=term-missing --cov-report=html"
|
|
30
|
+
|
|
31
|
+
[tool.coverage.run]
|
|
32
|
+
source = ["src"]
|
|
33
|
+
omit = ["*/tests/*", "*/__pycache__/*"]
|
|
34
|
+
|
|
35
|
+
[tool.coverage.report]
|
|
36
|
+
exclude_lines = [
|
|
37
|
+
"pragma: no cover",
|
|
38
|
+
"def __repr__",
|
|
39
|
+
"raise AssertionError",
|
|
40
|
+
"raise NotImplementedError",
|
|
41
|
+
"if __name__ == .__main__.:",
|
|
42
|
+
]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .main import main as main
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import List, Optional, Set, Dict, Any
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass
class ProcessingConfig:
    """
    Configuration for how to process paraphase JSONs.

    - skip_keys: keys under each gene that should be ignored entirely.
    - genes_list: optional list of genes to keep; if None, keep all.
    - rules: optional per-gene classification rules YAML structure.
    """

    # Keys dropped from each gene entry before merging/output.
    skip_keys: Set[str]
    # Gene names to keep; None means keep every gene.
    # NOTE(review): the CLI lower-cases these before constructing the
    # config — confirm downstream comparisons expect lower case.
    genes_list: Optional[List[str]] = None
    # Parsed rules YAML (per-gene classification); None disables rules.
    rules: Optional[Dict[str, Any]] = None
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# Keys stripped from each gene record by default: verbose per-read and
# per-haplotype diagnostics that are rarely useful in summary output.
# Kept sorted alphabetically for easy scanning.
DEFAULT_SKIP_KEYS: frozenset[str] = frozenset(
    {
        "alleles_all_haplotypes",
        "assembled_haplotypes",
        "del_read_number",
        "directional_links",
        "final_haplotypes",
        "first_copies",
        "flanking_summary",
        "gene_reads",
        "haplotype_details",
        "haplotype_links",
        "het_sites_not_used_in_phasing",
        "heterozygous_sites",
        "highest_total_cn",
        "homozygous_sites",
        "intergenic_depth",
        "last_copies",
        "linked_haplotypes",
        "links_loose",
        "middle_copies",
        "nonunique_supporting_reads",
        "phasing_success",
        "pseudo_reads",
        "raw_alleles",
        "read_details",
        "sample_sex",
        "sites_for_phasing",
        "smn1_read_number",
        "smn2_read_number",
        "smn_del78_read_number",
        "unique_supporting_reads",
    }
)
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
class InputMismatchError(Exception):
    """Signal that the count of input files differs from the count of sample names."""
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class YAMLLoadError(Exception):
    """Signal that a YAML file could not be opened or parsed."""
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class JSONLoadError(Exception):
    """Signal that a JSON file could not be opened or parsed."""
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class InvalidOperatorError(Exception):
    """Signal that an unrecognized comparison operator appeared while evaluating a value."""
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ListOpNotSupportedError(Exception):
    """Signal that an operation was attempted on a list that does not support it."""
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
import coloredlogs
|
|
3
|
+
import yaml
|
|
4
|
+
import json
|
|
5
|
+
import logging
|
|
6
|
+
from .exceptions import JSONLoadError, YAMLLoadError
|
|
7
|
+
from typing import Dict
|
|
8
|
+
|
|
9
|
+
coloredlogs.install(level="INFO")
|
|
10
|
+
logging.basicConfig(level=logging.DEBUG)
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def load_yaml(file: Path):
    """
    Load a YAML file and return the parsed data structure.

    Args:
        file: path to the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces (dict, list, scalar, or None
        for an empty document).

    Raises:
        YAMLLoadError: if the file cannot be opened or parsed; the
            original exception is chained as the cause.
    """
    try:
        with file.open("r", encoding="utf-8") as f:
            return yaml.safe_load(f)
    except Exception as e:
        # Chain the original exception (`from e`) so the root cause stays
        # visible in the traceback instead of being swallowed.
        raise YAMLLoadError(f"Failed to read YAML file {file}: {e}") from e
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def load_json(file: Path):
    """
    Load one paraphase JSON file and return the parsed data.

    Args:
        file: path to the JSON file to read.

    Returns:
        The deserialized JSON content (typically a dict).

    Raises:
        JSONLoadError: if the file cannot be opened or parsed; the
            original exception is chained as the cause.
    """
    try:
        with file.open("r", encoding="utf-8") as f:
            return json.load(f)
    except Exception as e:
        # Chain the original exception (`from e`) so the root cause stays
        # visible in the traceback instead of being swallowed.
        raise JSONLoadError(f"Failed to read JSON file {file}: {e}") from e
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def print_tsv(json_data: Dict) -> None:
    """
    Write the merged results to stdout as tab-separated values.

    One row is emitted per (sample, locus, metric); the per-locus status
    column is repeated on every row for that locus.
    """
    print("sample\tlocus\tstatus\tmetric\tvalue")

    for sample_name, loci in json_data.items():
        for locus_name, locus_info in loci.items():
            # Gene-level status comes from the rules engine; fall back to
            # "unknown" when no rules were applied or no status was set.
            status = locus_info.get("status")
            if not isinstance(status, str):
                status = "unknown"

            # Walk the locus metrics, e.g. region_depth, final_haplotypes.
            for metric, raw_value in locus_info.items():
                # The status itself and the rule-match metadata already
                # appear in the status column; never emit them as rows.
                if metric in {"status", "status_matches"}:
                    continue
                value = stringify_value(raw_value)
                print("\t".join((sample_name, locus_name, status, metric, str(value))))
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def stringify_value(content) -> str | None:
|
|
57
|
+
"""
|
|
58
|
+
Flatten nested dicts/lists for TSV output.
|
|
59
|
+
|
|
60
|
+
Rules:
|
|
61
|
+
- Skip empty or None values.
|
|
62
|
+
- Dicts: key:value for top-level, key=subkey=subvalue for nested dicts
|
|
63
|
+
- Lists:
|
|
64
|
+
* Lists of simple values (str, int, float) are joined by commas.
|
|
65
|
+
* Lists of lists -> join inner list by '|', then outer by ','
|
|
66
|
+
"""
|
|
67
|
+
if content in (None, [], {}):
|
|
68
|
+
return None
|
|
69
|
+
|
|
70
|
+
if isinstance(content, (int, float, str)):
|
|
71
|
+
return str(content)
|
|
72
|
+
|
|
73
|
+
if isinstance(content, list):
|
|
74
|
+
# If it's a list of simple values, join by commas
|
|
75
|
+
if all(not isinstance(item, list) for item in content):
|
|
76
|
+
flat = [str(element) for element in content if element is not None]
|
|
77
|
+
return ",".join(flat) if flat else None
|
|
78
|
+
else:
|
|
79
|
+
# Lists with sublists
|
|
80
|
+
flat = []
|
|
81
|
+
for item in content:
|
|
82
|
+
if isinstance(item, list):
|
|
83
|
+
if inner := [
|
|
84
|
+
str(element) for element in item if element is not None
|
|
85
|
+
]:
|
|
86
|
+
flat.append("|".join(inner))
|
|
87
|
+
elif item is not None:
|
|
88
|
+
flat.append(str(item))
|
|
89
|
+
return ",".join(flat) if flat else None
|
|
90
|
+
|
|
91
|
+
if isinstance(content, dict):
|
|
92
|
+
flat = []
|
|
93
|
+
for key, value in content.items():
|
|
94
|
+
if value in (None, [], {}):
|
|
95
|
+
continue
|
|
96
|
+
if isinstance(value, dict):
|
|
97
|
+
sub_items = []
|
|
98
|
+
for subkey, subvalue in value.items():
|
|
99
|
+
if prettified_subvalue := stringify_value(subvalue):
|
|
100
|
+
sub_items.append(f"{subkey}={prettified_subvalue}")
|
|
101
|
+
if sub_items:
|
|
102
|
+
flat.append(f"{key}:{'|'.join(sub_items)}")
|
|
103
|
+
else:
|
|
104
|
+
if prettified_string := stringify_value(value):
|
|
105
|
+
flat.append(f"{key}:{prettified_string}")
|
|
106
|
+
return ",".join(flat) if flat else None
|
|
107
|
+
|
|
108
|
+
return str(content)
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
import json
from pathlib import Path
from typing import List, Optional

import typer

from .config import ProcessingConfig
from .constants import DEFAULT_SKIP_KEYS
from .exceptions import InputMismatchError, JSONLoadError, YAMLLoadError
from .io import load_json, load_yaml, print_tsv
from .pipeline import merge_and_process, assert_equal_inputs_and_samples
|
|
11
|
+
|
|
12
|
+
# Single-command Typer application; invoke_without_command lets the bare
# `paraphrase` invocation run `main` directly.
app = typer.Typer(
    rich_markup_mode="rich",
    invoke_without_command=True,
    pretty_exceptions_show_locals=False,
    add_completion=False,
    # Bug fix: the previous help text ("Call coverage drops over alleles
    # in STR VCFs") was copied from an unrelated tool; README, PKG-INFO
    # and the command docstring all describe this tool as below.
    help="Parse paraphase JSONs.",
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@app.command()
def main(
    input: List[Path] = typer.Option(
        ...,
        "--input",
        "-f",
        exists=True,
        file_okay=True,
        dir_okay=False,
        help="Input JSON files (can be multiple)",
    ),
    sample: List[str] = typer.Option(
        ...,
        "--sample",
        "-s",
        help="Sample names corresponding to input JSON files",
    ),
    rules_yaml: Optional[Path] = typer.Option(
        None,
        "--rules",
        "-r",
        exists=True,
        file_okay=True,
        dir_okay=False,
        help="Optional YAML file with per-gene classification rules (adds 'status' fields)",
    ),
    # Annotation fixed: the default is None, so the type is Optional[str].
    skip_keys: Optional[str] = typer.Option(
        None, help="Comma-separated keys to skip (e.g. region_depth,final_haplotypes)"
    ),
    genes: Optional[str] = typer.Option(
        None, help="Optional comma-separated list of gene names to process"
    ),
    output_format: str = typer.Option(
        "json", "--output-format", "-o", help="Output format: 'json' (default) or 'tsv'"
    ),
):
    """
    Parse paraphase JSONs.
    """
    try:
        # Parse and validate parameters; an explicit --skip-keys replaces
        # the built-in default list entirely.
        skip_keys_list = (
            [k.strip() for k in skip_keys.split(",")]
            if skip_keys
            else list(DEFAULT_SKIP_KEYS)
        )
        # Gene names are compared case-insensitively downstream.
        genes_list = [g.strip().lower() for g in genes.split(",")] if genes else None
        assert_equal_inputs_and_samples(input, sample)

        # Load the optional rules file and every input JSON.
        rules = load_yaml(rules_yaml) if rules_yaml else None
        json_data_list = [load_json(f) for f in input]

        config = ProcessingConfig(
            skip_keys=set(skip_keys_list),
            genes_list=genes_list,
            rules=rules,
        )

        merged_data = merge_and_process(json_data_list, sample, config)

        if output_format.lower() == "tsv":
            print_tsv(merged_data)
        else:
            typer.echo(json.dumps(merged_data, indent=2))
    except (InputMismatchError, YAMLLoadError, JSONLoadError) as e:
        # Bug fix: YAML/JSON load failures previously escaped as raw
        # tracebacks because only InputMismatchError was caught; report
        # all expected input errors uniformly and exit non-zero.
        typer.echo(f"[error] {e}", err=True)
        raise typer.Exit(code=1) from e


if __name__ == "__main__":
    app()
|