hponorm 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hponorm-1.0.0/LICENSE +21 -0
- hponorm-1.0.0/PKG-INFO +10 -0
- hponorm-1.0.0/README.md +156 -0
- hponorm-1.0.0/hponorm/cli.py +390 -0
- hponorm-1.0.0/hponorm/hpo.py +317 -0
- hponorm-1.0.0/hponorm/ols.py +112 -0
- hponorm-1.0.0/hponorm/phenopackets.py +86 -0
- hponorm-1.0.0/hponorm/suggest.py +75 -0
- hponorm-1.0.0/hponorm.egg-info/PKG-INFO +10 -0
- hponorm-1.0.0/hponorm.egg-info/SOURCES.txt +14 -0
- hponorm-1.0.0/hponorm.egg-info/dependency_links.txt +1 -0
- hponorm-1.0.0/hponorm.egg-info/entry_points.txt +2 -0
- hponorm-1.0.0/hponorm.egg-info/requires.txt +4 -0
- hponorm-1.0.0/hponorm.egg-info/top_level.txt +1 -0
- hponorm-1.0.0/pyproject.toml +22 -0
- hponorm-1.0.0/setup.cfg +4 -0
hponorm-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) Tim Hearn 2026
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
hponorm-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hponorm
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Interactively validate and normalise GA4GH phenopacket phenotypic-feature terms against the HPO ontology
|
|
5
|
+
Requires-Python: >=3.9
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Provides-Extra: recommended
|
|
8
|
+
Requires-Dist: rapidfuzz>=3.0; extra == "recommended"
|
|
9
|
+
Requires-Dist: rich>=13.0; extra == "recommended"
|
|
10
|
+
Dynamic: license-file
|
hponorm-1.0.0/README.md
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
# hponorm
|
|
2
|
+
|
|
3
|
+
Interactively **validate and normalise the phenotypic-feature terms** in GA4GH
|
|
4
|
+
phenopacket / family JSON files against the **HPO** (Human Phenotype Ontology).
|
|
5
|
+
|
|
6
|
+
It's the sibling of `mondonorm` (which does disease terms → MONDO) and shares the
|
|
7
|
+
same look and feel. This one targets `phenotypicFeatures[].type` and HPO.
|
|
8
|
+
|
|
9
|
+
Your generator currently emits phenotype terms with an empty `id` and a
|
|
10
|
+
free-text label:
|
|
11
|
+
|
|
12
|
+
```json
|
|
13
|
+
"phenotypicFeatures": [ { "type": { "id": "", "label": "long qt interval" } } ]
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
`hponorm` walks the file, finds every phenotypic feature, suggests the most
|
|
17
|
+
likely HPO terms for each label, lets you pick one (or type an HP id yourself),
|
|
18
|
+
and writes a normalised copy:
|
|
19
|
+
|
|
20
|
+
```json
|
|
21
|
+
"phenotypicFeatures": [ { "type": { "id": "HP:0001657", "label": "Prolonged QT interval" } } ]
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## How suggestions are sourced
|
|
25
|
+
|
|
26
|
+
Two interchangeable backends; pick with `--backend`:
|
|
27
|
+
|
|
28
|
+
| Backend | What it uses | Network |
|
|
29
|
+
|---------|--------------|---------|
|
|
30
|
+
| `ols` | EBI Ontology Lookup Service v4 (live HPO) | required |
|
|
31
|
+
| `local` | A local `hp.obo` / `hp.json`, downloaded + cached on first use | only for the one-time download |
|
|
32
|
+
| `auto` | OLS if reachable, otherwise local (this is the default backend) | optional: used when available |
|
|
33
|
+
|
|
34
|
+
The official ontology file can be fetched automatically (from
|
|
35
|
+
`purl.obolibrary.org/obo/hp.obo`) or supplied with `--hpo-file`.
|
|
36
|
+
A tiny **illustrative** offline sample (`hponorm/data/hpo-sample.obo`, 8 terms)
|
|
37
|
+
ships with the package so you can try the tool with no network — it is **not** a
|
|
38
|
+
substitute for the real ontology.
|
|
39
|
+
|
|
40
|
+
## Install
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pip install -e . # installs the `hponorm` command
|
|
44
|
+
pip install -e ".[recommended]" # + rapidfuzz (better matching) and rich (nicer UI)
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Both extras are optional: without `rapidfuzz` it falls back to stdlib `difflib`;
|
|
48
|
+
without `rich` it prints plain text.
|
|
49
|
+
|
|
50
|
+
## Quick start
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
# Online (auto-detects OLS):
|
|
54
|
+
hponorm test3_phenopackets.json
|
|
55
|
+
|
|
56
|
+
# Fully offline against a downloaded ontology:
|
|
57
|
+
hponorm myfile.json --backend local --hpo-file /path/to/hp.obo
|
|
58
|
+
|
|
59
|
+
# Try it offline with the bundled sample (diabetes / long QT / a few others):
|
|
60
|
+
hponorm myfile.json --backend local --hpo-file hponorm/data/hpo-sample.obo
|
|
61
|
+
|
|
62
|
+
# Equivalent without installing:
|
|
63
|
+
python -m hponorm myfile.json
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## The interactive review
|
|
67
|
+
|
|
68
|
+
Each **distinct** label is reviewed once and the decision is applied to every
|
|
69
|
+
occurrence (so "diabetes" appearing 8 times is one question, not eight). For
|
|
70
|
+
each label you see a ranked table and a prompt:
|
|
71
|
+
|
|
72
|
+
```
|
|
73
|
+
Select [#, s words, h HP:id, r, k, x, ?, q]:
|
|
74
|
+
<number> select that suggestion
|
|
75
|
+
s <words> search again with different words
|
|
76
|
+
h HP:id enter an HPO id manually (validated, canonical label fetched)
|
|
77
|
+
r reuse a decision remembered from earlier in this session
|
|
78
|
+
k keep the current term unchanged
|
|
79
|
+
x skip this label (leave it unmapped)
|
|
80
|
+
? help
|
|
81
|
+
q finish now: apply decisions made so far and save
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
If a feature already has an `id`, it is validated: the tool reports whether the
|
|
85
|
+
id resolves in HPO and whether its canonical label matches the existing label.
|
|
86
|
+
A feature's `excluded` flag (a negated phenotype) is preserved untouched — only
|
|
87
|
+
`type.id` / `type.label` are changed.
|
|
88
|
+
|
|
89
|
+
## Output
|
|
90
|
+
|
|
91
|
+
For `input.json` (unless you pass `--out` or `--in-place`):
|
|
92
|
+
|
|
93
|
+
* `input.normalized.json` — the normalised phenopacket (only `type.id` /
|
|
94
|
+
`type.label` are changed; everything else, including the pedigree and any
|
|
95
|
+
`diseases` terms, is left exactly as-is).
|
|
96
|
+
* `input.normalized.json.mapping.json` — a decision log (label → HP id, count).
|
|
97
|
+
|
|
98
|
+
Because `hponorm` only touches `phenotypicFeatures`, you can run it alongside
|
|
99
|
+
`mondonorm` (which only touches `diseases`) on the same file without conflict.
|
|
100
|
+
|
|
101
|
+
## Useful options
|
|
102
|
+
|
|
103
|
+
```
|
|
104
|
+
--out PATH output path (single input file only)
|
|
105
|
+
--in-place overwrite the input file(s)
|
|
106
|
+
--backend {auto,ols,local}
|
|
107
|
+
--hpo-file PATH local hp.obo or hp.json
|
|
108
|
+
--cache-dir DIR where the downloaded HPO + parsed index are cached
|
|
109
|
+
--update re-download / re-parse HPO
|
|
110
|
+
--no-download never download (local backend must find a file)
|
|
111
|
+
--no-online never use OLS, even in auto mode
|
|
112
|
+
--limit N suggestions shown per label (default 8)
|
|
113
|
+
--mapping FILE load/save remembered label->term decisions across runs
|
|
114
|
+
--auto-remembered auto-apply remembered labels without prompting
|
|
115
|
+
--no-color plain text output
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
Process several files in one go; remembered decisions carry across them:
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
hponorm *.json --mapping team-hpo-map.json
|
|
122
|
+
# next time, reuse without re-typing:
|
|
123
|
+
hponorm new/*.json --mapping team-hpo-map.json --auto-remembered
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
## Use as a library
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
from hponorm import HpoIndex, OlsClient, Suggester
|
|
130
|
+
from hponorm import phenopackets as pp
|
|
131
|
+
|
|
132
|
+
sug = Suggester(OlsClient(), name="ols") # or Suggester(HpoIndex.load(...))
|
|
133
|
+
for c in sug.suggest("long qt interval", limit=5):
|
|
134
|
+
print(c.id, c.label, c.score)
|
|
135
|
+
|
|
136
|
+
data = pp.load("myfile.json")
|
|
137
|
+
for ref in pp.find_phenotypic_features(data):
|
|
138
|
+
print(ref.path, ref.label, ref.id, "excluded" if ref.excluded else "")
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
## Getting the full HPO ontology
|
|
142
|
+
|
|
143
|
+
* Browser / search: https://hpo.jax.org
|
|
144
|
+
* Direct files: `http://purl.obolibrary.org/obo/hp.obo` or `.../hp.json`
|
|
145
|
+
* Releases: https://github.com/obophenotype/human-phenotype-ontology/releases
|
|
146
|
+
|
|
147
|
+
Phenopackets reference: https://phenopacket-schema.readthedocs.io
|
|
148
|
+
(phenotypic features use HPO `OntologyClass` values in `PhenotypicFeature.type`).
|
|
149
|
+
|
|
150
|
+
## A note on the bundled sample's ids
|
|
151
|
+
|
|
152
|
+
The HP ids in `hpo-sample.obo` were checked against HPO browsers (e.g. Diabetes
|
|
153
|
+
mellitus HP:0000819, Prolonged QT interval HP:0001657, Seizure HP:0001250,
|
|
154
|
+
Microcephaly HP:0000252, Short stature HP:0004322, Global developmental delay
|
|
155
|
+
HP:0001263). It is still only a tiny demo fixture — verify against the live HPO
|
|
156
|
+
for real curation work.
|
|
@@ -0,0 +1,390 @@
|
|
|
1
|
+
"""Interactive command-line tool to validate and normalise the phenotypic
|
|
2
|
+
feature terms in GA4GH phenopacket / family JSON files against the HPO ontology.
|
|
3
|
+
|
|
4
|
+
Run: python -m hponorm FILE.json [FILE2.json ...]
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import argparse
|
|
9
|
+
import json
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from . import phenopackets as pp
|
|
13
|
+
from .hpo import HpoIndex, normalise_curie
|
|
14
|
+
from .ols import OlsClient
|
|
15
|
+
from .suggest import Candidate, Suggester
|
|
16
|
+
|
|
17
|
+
# --------------------------------------------------------------------- pretty
|
|
18
|
+
try:
|
|
19
|
+
from rich.console import Console
|
|
20
|
+
from rich.table import Table
|
|
21
|
+
|
|
22
|
+
_console = Console()
|
|
23
|
+
_HAVE_RICH = True
|
|
24
|
+
except Exception: # pragma: no cover
|
|
25
|
+
_console = None
|
|
26
|
+
_HAVE_RICH = False
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class UI:
    """Small console front-end.

    Uses ``rich`` for output when it was importable and colour was requested;
    otherwise everything is written with plain ``print``.
    """

    def __init__(self, color: bool = True):
        # Rich rendering needs both: the import succeeded and colour is wanted.
        self.rich = _HAVE_RICH and color

    def rule(self, text: str = "") -> None:
        """Print a section separator, optionally titled with *text*."""
        if self.rich:
            _console.rule(f"[bold]{text}")
            return
        print("\n" + "=" * 70)
        if text:
            print(text)
            print("-" * 70)

    def say(self, text: str = "") -> None:
        """Write one line of output through the active renderer."""
        if self.rich:
            _console.print(text)
        else:
            print(text)

    def warn(self, text: str) -> None:
        """Write a warning line (yellow when rich is active)."""
        message = f"! {text}"
        if self.rich:
            message = f"[yellow]{message}[/yellow]"
        self.say(message)

    def ok(self, text: str) -> None:
        """Write a success line (green when rich is active)."""
        message = f"\u2713 {text}"
        if self.rich:
            message = f"[green]{message}[/green]"
        self.say(message)

    def err(self, text: str) -> None:
        """Write an error line (red when rich is active)."""
        message = f"\u2717 {text}"
        if self.rich:
            message = f"[red]{message}[/red]"
        self.say(message)

    def candidates(self, cands: list[Candidate]) -> None:
        """Show the numbered (1-based) list of suggestions, or a hint if empty."""
        if not cands:
            self.warn("No suggestions found. Try 's <better search words>' or 'h <HP:id>'.")
            return

        if not self.rich:
            for number, cand in enumerate(cands, 1):
                print(f" {number:>2}. {cand.id} {cand.label} (score {cand.score:g}, {cand.matched_on})")
                if cand.short_def(80):
                    print(f" {cand.short_def(80)}")
            return

        table = Table(show_header=True, header_style="bold cyan", box=None, pad_edge=False)
        column_specs = (
            ("#", {"justify": "right"}),
            ("HPO id", {"style": "magenta", "no_wrap": True}),
            ("Label", {"style": "white"}),
            ("Score", {"justify": "right"}),
            ("Matched on", {"style": "dim"}),
            ("Definition", {"style": "dim"}),
        )
        for header, options in column_specs:
            table.add_column(header, **options)
        for number, cand in enumerate(cands, 1):
            table.add_row(
                str(number), cand.id, cand.label, f"{cand.score:g}",
                cand.matched_on, cand.short_def(70),
            )
        _console.print(table)

    def prompt(self, text: str) -> str:
        """Read one line from stdin; end-of-input is reported as the quit command."""
        try:
            answer = input(text)
        except EOFError:
            answer = "q"
        return answer
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# Help text shown when the user enters '?' or 'help' at the review prompt.
# It lists every choice offered by the prompt in review_group(), including
# 'r' (reuse a remembered decision), which was previously missing here.
HELP = """\
Choices:
<number> select that suggestion
s <words> search again with different words
h <HP:id> enter an HPO id manually (e.g. h HP:0001657) - it is validated
r reuse a decision remembered from earlier in this session
k keep the current term unchanged
x skip this label (leave it unmapped)
? show this help
q finish now: apply decisions made so far and save
"""
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _local_suggester(args, ui: UI) -> Suggester:
    """Load the local HPO index as configured on the command line and wrap it."""
    idx = HpoIndex.load(
        args.hpo_file,
        allow_download=not args.no_download,
        update=args.update,
        cache_dir=args.cache_dir,
        log=ui.say,
    )
    ui.say(f"Backend: local HPO ({len(idx)} terms from {idx.source}).")
    return Suggester(idx, name="hpo-local")


def build_suggester(args, ui: UI) -> Suggester:
    """Create the suggestion backend selected by ``args.backend``.

    * ``ols``   - always the online OLS client.
    * ``local`` - always the local HPO index.
    * ``auto``  - OLS when it is allowed and reports itself available,
      otherwise the local index.  When OLS is used and a local ontology is
      already at hand (``--hpo-file`` given, or ``hp.obo`` present in the cache
      directory) it is passed to the Suggester as ``fallback``.

    Exceptions raised while loading the local index as the primary backend
    propagate to the caller.
    """
    backend = args.backend
    if backend == "ols":
        ui.say("Backend: EBI OLS (online).")
        return Suggester(OlsClient(), name="ols")
    if backend == "local":
        return _local_suggester(args, ui)

    # auto
    ols = OlsClient()
    if args.no_online:
        # The user opted out of OLS; do not claim it was unreachable.
        ui.say("OLS disabled (--no-online); using local HPO.")
        return _local_suggester(args, ui)
    if not ols.available():
        ui.warn("OLS not reachable; using local HPO.")
        return _local_suggester(args, ui)

    ui.say("Backend: EBI OLS (online).")
    # Local is used as a silent fallback only if it is already cheap to load.
    fb = None
    try:
        if args.hpo_file or (Path(args.cache_dir or _default_cache()) / "hp.obo").exists():
            fb = HpoIndex.load(
                args.hpo_file, allow_download=False,
                cache_dir=args.cache_dir, log=lambda *a, **k: None,
            )
    except Exception:
        # Deliberately best-effort: the fallback is optional, so any failure
        # to load it just means running on OLS alone.
        fb = None
    return Suggester(ols, fallback=fb, name="ols")
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _default_cache():
    """Return the default cache directory as computed by ``hpo.default_cache_dir``.

    The import is done lazily inside the function rather than at module level.
    """
    from . import hpo as _hpo

    return _hpo.default_cache_dir()
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def review_group(
    ui: UI,
    suggester: Suggester,
    label: str,
    existing_id: str,
    count: int,
    index: int,
    total: int,
    remembered: dict[str, dict],
    limit: int,
) -> dict | None:
    """Drive the interactive review of one label group.

    Shows the validation status of ``existing_id`` (if any), the ranked
    suggestions for ``label`` and then loops on the prompt until the user
    makes a decision.

    Returns a decision dict whose ``"action"`` is one of:

    * ``"map"``  - also carries ``"id"`` and ``"label"`` of the chosen term
    * ``"keep"`` - leave the current term unchanged
    * ``"skip"`` - leave the label unmapped
    * ``"quit"`` - tells the caller to stop and save

    The function itself never returns None; the ``| None`` in the annotation is
    kept only for backward compatibility of the signature.
    """
    ui.rule(f"[{index}/{total}] phenotype label: {label!r} ({count} occurrence(s))")

    if existing_id:
        existing = suggester.resolve(existing_id)
        if existing and existing.label.lower() == label.lower():
            ui.ok(f"Already a valid HPO term: {existing.id} {existing.label}")
        elif existing:
            ui.warn(
                f"Existing id {existing_id} resolves to {existing.id} '{existing.label}', "
                f"which differs from the label '{label}'."
            )
        else:
            ui.warn(f"Existing id {existing_id!r} could not be validated against HPO.")

    query = label
    cands = suggester.suggest(query, limit=limit)

    # Announce a remembered decision for this label before the prompt.
    mem = remembered.get(label.lower())
    if mem:
        ui.ok(f"Remembered from earlier: {mem['id']} {mem['label']} (enter 'r' to reuse)")

    while True:
        ui.candidates(cands)
        raw = ui.prompt("Select [#, s words, h HP:id, r, k, x, ?, q]: ").strip()

        if raw == "":
            continue
        if raw in ("?", "help"):
            ui.say(HELP)
            continue
        if raw in ("q", "quit"):
            return {"action": "quit"}
        if raw in ("k", "keep"):
            return {"action": "keep"}
        if raw in ("x", "skip"):
            return {"action": "skip"}
        if raw in ("r", "reuse"):
            if mem:
                return {"action": "map", "id": mem["id"], "label": mem["label"]}
            # Say why 'r' did nothing instead of "Unrecognised input".
            ui.warn("No remembered decision for this label yet.")
            continue

        # search again
        if raw.startswith("s ") or raw == "s":
            query = raw[2:].strip() or ui.prompt("Search words: ").strip()
            if query:
                cands = suggester.suggest(query, limit=limit)
            continue

        # explicit manual id (h HP:0001657), a bare 'h' (ask for the id), or a bare curie
        manual = None
        if raw.startswith("h ") or raw == "h":
            manual = raw[2:].strip() or ui.prompt("HPO id: ").strip()
        elif normalise_curie(raw):
            # Only treat as a curie if it is NOT a small selection index.
            if not (raw.isdigit() and 1 <= int(raw) <= len(cands)):
                manual = raw
        if manual is not None:
            c = normalise_curie(manual)
            if not c:
                ui.err(f"{manual!r} is not a valid HPO id (expected HP:0000000).")
                continue
            resolved = suggester.resolve(c)
            if resolved:
                ui.ok(f"{resolved.id} {resolved.label}")
                if _confirm(ui, "Use this term?"):
                    return {"action": "map", "id": resolved.id, "label": resolved.label}
                continue
            # The id is well-formed but unknown to the backend; let the user decide.
            ui.warn(f"Could not validate {c} against HPO.")
            if _confirm(ui, f"Use {c} anyway with label {label!r}?"):
                return {"action": "map", "id": c, "label": label}
            continue

        # numeric selection
        if raw.isdigit():
            i = int(raw)
            if 1 <= i <= len(cands):
                chosen = cands[i - 1]
                return {"action": "map", "id": chosen.id, "label": chosen.label}
            ui.err(f"Pick a number between 1 and {len(cands)}.")
            continue

        ui.err("Unrecognised input. Type ? for help.")
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def _confirm(ui: UI, text: str) -> bool:
    """Ask a yes/no question via ``ui.prompt``.

    An empty reply, ``y`` or ``yes`` (case-insensitive, surrounding whitespace
    ignored) counts as yes; anything else is no.
    """
    reply = ui.prompt(f"{text} [Y/n]: ")
    normalised = reply.strip().lower()
    if normalised == "":
        return True
    return normalised == "y" or normalised == "yes"
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def process_file(path: Path, args, ui: UI, suggester: Suggester, remembered: dict) -> dict:
    """Review and normalise the phenotypic features of one input file.

    Features are grouped with ``pp.group_by_label`` and each group is decided
    once: either automatically from ``remembered`` (when
    ``args.auto_remembered`` is set) or interactively via ``review_group``.
    "map" decisions made interactively are stored in ``remembered`` (keyed by
    the lower-cased label), which is mutated in place so later files can reuse
    them.

    Unless the file has no phenotypic features, the data is then written out
    (to ``--out`` for a single input, over the input for ``--in-place``,
    otherwise to ``<stem>.normalized.json``) together with a
    ``<output>.mapping.json`` decision log.

    Returns a summary dict with keys ``file``, ``mapped``, ``kept``,
    ``skipped`` and ``total`` (counts are feature occurrences).
    """
    ui.rule(f"FILE: {path.name}")
    data = pp.load(path)
    refs = pp.find_phenotypic_features(data)
    if not refs:
        ui.warn("No phenotypic features found in this file.")
        return {"file": str(path), "mapped": 0, "kept": 0, "skipped": 0, "total": 0}

    groups = pp.group_by_label(refs)
    ui.say(f"Found {len(refs)} phenotypic feature(s) across {len(groups)} distinct label(s).")

    decisions: list[dict] = []
    items = list(groups.items())
    for index, ((label, existing_id), group_refs) in enumerate(items, 1):
        count = len(group_refs)
        key = label.lower()

        if args.auto_remembered and key in remembered:
            mem = remembered[key]
            for r in group_refs:
                r.apply(mem["id"], mem["label"])
            ui.ok(f"[auto] {label!r} -> {mem['id']} {mem['label']}")
            # Same shape as the interactive "map" entry below.  Spreading
            # ``**mem`` here would overwrite the original free-text label
            # with the HPO label in the decision log.
            decisions.append({"label": label, "id": mem["id"], "label_hpo": mem["label"],
                              "action": "map", "count": count})
            continue

        decision = review_group(
            ui, suggester, label, existing_id, count,
            index, len(items), remembered, args.limit,
        )
        # review_group is annotated as possibly returning None; treat that as quit.
        action = decision.get("action") if decision else "quit"
        if action == "quit":
            ui.warn("Finishing early - remaining labels left unchanged.")
            break
        if action == "keep":
            ui.say(f"Kept {label!r} unchanged.")
            decisions.append({"label": label, "action": "keep", "count": count})
            continue
        if action == "skip":
            ui.say(f"Skipped {label!r}.")
            decisions.append({"label": label, "action": "skip", "count": count})
            continue
        # map
        cid, clabel = decision["id"], decision["label"]
        for r in group_refs:
            r.apply(cid, clabel)
        remembered[key] = {"id": cid, "label": clabel}
        ui.ok(f"Mapped {label!r} -> {cid} {clabel} ({count} occurrence(s)).")
        decisions.append({"label": label, "id": cid, "label_hpo": clabel,
                          "action": "map", "count": count})

    # ---- write output
    if args.out and len(args._files) == 1:
        out_path = Path(args.out)
    elif args.in_place:
        out_path = path
    else:
        out_path = path.with_name(path.stem + ".normalized.json")
    pp.save(data, out_path)
    ui.ok(f"Wrote normalised file: {out_path}")

    # ---- write a mapping sidecar
    sidecar = out_path.with_suffix(out_path.suffix + ".mapping.json")
    pp.save({"file": str(path), "decisions": decisions}, sidecar)
    ui.say(f"Wrote decision log: {sidecar}")

    totals = {"map": 0, "keep": 0, "skip": 0}
    for d in decisions:
        totals[d["action"]] += d["count"]
    return {"file": str(path), "mapped": totals["map"], "kept": totals["keep"],
            "skipped": totals["skip"], "total": len(refs)}
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point.

    Parses ``argv`` (``sys.argv[1:]`` when None), loads remembered decisions
    from ``--mapping`` if given, builds the suggestion backend, processes each
    existing input file and finally saves the remembered decisions back.

    Returns the process exit status: 2 when ``--out`` is combined with several
    input files, 1 when no HPO backend could be initialised, 0 otherwise
    (missing input files are reported and skipped).
    """
    p = argparse.ArgumentParser(
        prog="hponorm",
        description="Interactively map free-text phenopacket phenotype labels to HPO terms.",
    )
    p.add_argument("files", nargs="+", help="Phenopacket / family JSON file(s).")
    p.add_argument("-o", "--out", help="Output path (only valid with a single input file).")
    p.add_argument("--in-place", action="store_true", help="Overwrite the input file(s).")
    p.add_argument("--backend", choices=["auto", "ols", "local"], default="auto",
                   help="Suggestion source (default: auto -> OLS online, else local).")
    p.add_argument("--hpo-file", help="Path to a local hp.obo or hp.json.")
    p.add_argument("--cache-dir", help="Where to cache the downloaded HPO file.")
    p.add_argument("--update", action="store_true", help="Re-download / re-parse HPO.")
    p.add_argument("--no-download", action="store_true",
                   help="Never download HPO (local backend must find a file).")
    p.add_argument("--no-online", action="store_true",
                   help="Do not use OLS even in auto mode.")
    p.add_argument("--limit", type=int, default=8, help="Suggestions shown per label.")
    p.add_argument("--mapping", help="JSON file of remembered label->term decisions to "
                                     "load and update across runs.")
    p.add_argument("--auto-remembered", action="store_true",
                   help="Auto-apply remembered labels without prompting.")
    p.add_argument("--no-color", action="store_true", help="Disable rich/colour output.")
    args = p.parse_args(argv)
    # process_file() reads the number of input files from this attribute.
    args._files = args.files

    ui = UI(color=not args.no_color)

    if args.out and len(args.files) > 1:
        ui.err("--out can only be used with a single input file.")
        return 2

    # Load remembered decisions, if any.
    remembered: dict[str, dict] = {}
    mapping_path = Path(args.mapping) if args.mapping else None
    if mapping_path is not None and mapping_path.exists():
        try:
            # Explicit UTF-8: the file is written with ensure_ascii=False, so
            # the locale default encoding cannot be relied on.
            loaded = json.loads(mapping_path.read_text(encoding="utf-8"))
            if not isinstance(loaded, dict):
                # Anything but an object would break the later label lookups.
                raise ValueError("mapping file must contain a JSON object")
        except (OSError, ValueError):
            ui.warn(f"Could not read mapping file {args.mapping}; starting fresh.")
        else:
            remembered = loaded
            ui.say(f"Loaded {len(remembered)} remembered mapping(s) from {args.mapping}.")

    try:
        suggester = build_suggester(args, ui)
    except Exception as exc:
        # Top-level boundary: report any backend failure and exit non-zero.
        ui.err(f"Could not initialise an HPO backend: {exc}")
        return 1

    summaries = []
    for f in args.files:
        path = Path(f)
        if not path.exists():
            ui.err(f"File not found: {path}")
            continue
        summaries.append(process_file(path, args, ui, suggester, remembered))

    # Persist remembered decisions for next time.
    if mapping_path is not None:
        try:
            mapping_path.write_text(
                json.dumps(remembered, indent=2, ensure_ascii=False), encoding="utf-8"
            )
            ui.say(f"Saved {len(remembered)} remembered mapping(s) to {args.mapping}.")
        except (OSError, TypeError, ValueError) as exc:
            ui.warn(f"Could not write mapping file: {exc}")

    ui.rule("Summary")
    for s in summaries:
        ui.say(f"{Path(s['file']).name}: {s['mapped']} mapped, {s['kept']} kept, "
               f"{s['skipped']} skipped (of {s['total']} feature(s)).")
    return 0
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
if __name__ == "__main__":
|
|
390
|
+
raise SystemExit(main())
|
|
@@ -0,0 +1,317 @@
|
|
|
1
|
+
"""Local HPO (Human Phenotype Ontology) index.
|
|
2
|
+
|
|
3
|
+
Loads HPO from a local ``.obo`` or obographs ``.json`` file (downloading and
|
|
4
|
+
caching the official release on first use if permitted), builds an in-memory
|
|
5
|
+
search index over term labels + synonyms, and answers two questions:
|
|
6
|
+
|
|
7
|
+
* ``search(query)`` -> ranked candidate terms for a free-text phenotype label
|
|
8
|
+
* ``get(curie)`` -> the canonical term for a specific ``HP:xxxxxxx`` id
|
|
9
|
+
|
|
10
|
+
Fuzzy matching uses ``rapidfuzz`` when available and falls back to the stdlib
|
|
11
|
+
``difflib`` otherwise, so the tool runs even in a minimal environment.
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
import os
|
|
17
|
+
import pickle
|
|
18
|
+
import re
|
|
19
|
+
import urllib.request
|
|
20
|
+
from dataclasses import dataclass, field
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
|
|
23
|
+
from .suggest import Candidate
|
|
24
|
+
|
|
25
|
+
# Official OBO PURLs maintained by the Monarch Initiative; they are not pinned to a release and always resolve to the latest HPO version.
|
|
26
|
+
HPO_OBO_URL = "http://purl.obolibrary.org/obo/hp.obo"
|
|
27
|
+
HPO_JSON_URL = "http://purl.obolibrary.org/obo/hp.json"
|
|
28
|
+
|
|
29
|
+
_CURIE_RE = re.compile(r"^HP:\d{7}$")
|
|
30
|
+
_IRI_RE = re.compile(r"HP_(\d{7})")
|
|
31
|
+
|
|
32
|
+
try: # Optional, much faster + better quality.
|
|
33
|
+
from rapidfuzz import fuzz, process
|
|
34
|
+
|
|
35
|
+
_HAVE_RAPIDFUZZ = True
|
|
36
|
+
except Exception: # pragma: no cover - exercised only when dep is absent
|
|
37
|
+
import difflib
|
|
38
|
+
|
|
39
|
+
_HAVE_RAPIDFUZZ = False
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass
class HpoTerm:
    """One HPO term as held in the local index."""

    # Term identifier (the index keys terms by ``HP:xxxxxxx`` curies).
    id: str
    # Primary label of the term.
    label: str
    # Alternative names; each instance gets its own fresh list.
    synonyms: list[str] = field(default_factory=list)
    # Free-text definition, when one is available.
    definition: str | None = None
    # True for obsolete terms, which the index leaves out of search.
    obsolete: bool = False
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def default_cache_dir() -> Path:
    """Return the directory used to cache HPO files.

    This is ``$XDG_CACHE_HOME/hponorm`` when that variable is set and
    non-empty, otherwise ``~/.cache/hponorm``.
    """
    xdg_home = os.environ.get("XDG_CACHE_HOME")
    if xdg_home:
        root = Path(xdg_home)
    else:
        root = Path.home() / ".cache"
    return root / "hponorm"
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# An HP id written with an explicit separator, found anywhere in the text:
# "HP:0001657", "hp_0001657", ".../obo/HP_0001657", "HP:0001657 Prolonged QT".
# The look-arounds stop it matching inside a longer token or a longer number.
_HP_TOKEN_RE = re.compile(r"(?<![A-Za-z0-9])HP[:_](\d{7})(?!\d)", re.IGNORECASE)
# The whole input is just seven digits, optionally prefixed with "HP":
# "0001657", "HP0001657", "hp 0001657".
_HP_BARE_RE = re.compile(r"(?:HP\s*)?(\d{7})", re.IGNORECASE)


def normalise_curie(raw: str) -> str | None:
    """Coerce user input / IRIs to a canonical ``HP:xxxxxxx`` curie, or None.

    Accepted forms are an ``HP:``/``HP_`` id (any case) appearing anywhere in
    the text, including inside an IRI, or an input consisting solely of seven
    digits with an optional ``HP`` prefix.

    Anything else returns None.  In particular, identifiers from other
    ontologies (``MONDO:0005015``) and free text that merely contains seven
    digits are rejected rather than silently re-labelled as HP ids.
    """
    s = (raw or "").strip()
    if not s:
        return None
    m = _HP_TOKEN_RE.search(s)
    if m is None:
        # No explicit "HP:" token; only accept the bare form as the whole input.
        m = _HP_BARE_RE.fullmatch(s)
    return f"HP:{m.group(1)}" if m else None
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class HpoIndex:
    """An in-memory, searchable index of HPO phenotype terms.

    ``terms`` keeps every parsed term (obsolete ones included, so ``get`` can
    still resolve them).  The label and synonyms of each non-obsolete term are
    additionally flattened into parallel lists used for exact and fuzzy lookup.
    """

    name = "hpo-local"

    def __init__(self, terms: dict[str, HpoTerm], source: str):
        """Build the lookup structures.

        Args:
            terms: Mapping of HPO CURIE to term; stored by reference.
            source: Description of where the terms came from (``load`` passes
                the path of the parsed file).
        """
        self.source = source
        self.terms: dict[str, HpoTerm] = terms
        # Parallel arrays for fast fuzzy scanning: entry i of each list
        # describes the same searchable string.
        self._names: list[str] = []    # text as written in the ontology
        self._lowered: list[str] = []  # lower-cased copy used for matching
        self._owner: list[str] = []    # term id for _names[i]
        self._kind: list[str] = []     # "label" or "synonym"
        self._exact: dict[str, list[tuple[str, str]]] = {}  # lower text -> [(id, kind)]
        for t in terms.values():
            if t.obsolete:
                continue
            self._add_name(t.id, t.label, "label")
            for syn in t.synonyms:
                self._add_name(t.id, syn, "synonym")

    def _add_name(self, term_id: str, text: str, kind: str) -> None:
        """Register one searchable string; blank or missing text is ignored."""
        text = (text or "").strip()
        if not text:
            return
        lowered = text.lower()
        self._names.append(text)
        self._lowered.append(lowered)
        self._owner.append(term_id)
        self._kind.append(kind)
        self._exact.setdefault(lowered, []).append((term_id, kind))

    # ------------------------------------------------------------------ lookup
    def get(self, curie: str) -> HpoTerm | None:
        """Return the term for *curie*, or ``None`` if it cannot be normalised
        or is not in the index."""
        c = normalise_curie(curie)
        return self.terms.get(c) if c else None

    def __len__(self) -> int:
        """Number of terms held, obsolete ones included."""
        return len(self.terms)

    # ------------------------------------------------------------------ search
    def search(self, query: str, limit: int = 8) -> list[Candidate]:
        """Return up to *limit* candidates for *query*, best score first.

        Exact case-insensitive label/synonym hits score 100/98.  Every indexed
        string is then scored fuzzily (rapidfuzz ``WRatio`` when available,
        ``difflib`` otherwise) and each term keeps its best score.  A blank
        query or a non-positive *limit* yields an empty list.
        """
        # Function-scope import: only the difflib fallback needs it.
        import heapq

        query = (query or "").strip()
        # A negative limit would otherwise slice from the wrong end below.
        if not query or limit <= 0:
            return []
        needle = query.lower()
        pool = limit * 6  # over-fetch: several strings can belong to one term
        scored: dict[str, Candidate] = {}

        def consider(term_id: str, score: float, matched_on: str, kind: str) -> None:
            t = self.terms.get(term_id)
            if t is None or t.obsolete:
                return
            tag = "label" if kind == "label" else f"synonym: {matched_on}"
            prev = scored.get(term_id)
            if prev is None or score > prev.score:
                scored[term_id] = Candidate(
                    id=t.id,
                    label=t.label,
                    score=round(float(score), 1),
                    matched_on=tag,
                    definition=t.definition,
                    source=self.name,
                )

        # 1) Exact (case-insensitive) label / synonym hits get a strong score.
        for term_id, kind in self._exact.get(needle, []):
            consider(term_id, 100.0 if kind == "label" else 98.0, query, kind)

        # 2) Fuzzy scan across all label + synonym strings.  Both backends
        #    compare lower-cased text so they rank case-insensitively alike;
        #    rapidfuzz >= 3 applies no preprocessing of its own.
        if _HAVE_RAPIDFUZZ:
            matches = process.extract(
                needle, self._lowered, scorer=fuzz.WRatio, limit=pool
            )
            for _text, score, idx in matches:
                consider(self._owner[idx], score, self._names[idx], self._kind[idx])
        else:  # difflib fallback
            ratios = (
                (difflib.SequenceMatcher(None, needle, low).ratio(), i)
                for i, low in enumerate(self._lowered)
            )
            # nlargest(k, it) == sorted(it, reverse=True)[:k] without sorting
            # every score.
            for ratio, idx in heapq.nlargest(pool, ratios):
                consider(
                    self._owner[idx], ratio * 100.0, self._names[idx], self._kind[idx]
                )

        ranked = sorted(scored.values(), key=lambda c: c.score, reverse=True)
        return ranked[:limit]

    # ----------------------------------------------------------- construction
    @classmethod
    def load(
        cls,
        path: str | os.PathLike | None = None,
        *,
        allow_download: bool = True,
        update: bool = False,
        cache_dir: str | os.PathLike | None = None,
        log=lambda *a, **k: None,
    ) -> "HpoIndex":
        """Load HPO from *path*, the cache, or by downloading the release.

        Args:
            path: Explicit ontology file.  When omitted, ``hp.obo`` in the
                cache directory is used and downloaded first if it is missing.
            allow_download: Permit fetching ``hp.obo`` when it is needed.
            update: Re-download ``hp.obo`` (only when *path* is omitted) and
                ignore any previously pickled index.
            cache_dir: Directory for the download and the pickled index;
                defaults to ``default_cache_dir()``.
            log: Callable receiving progress messages.

        Raises:
            FileNotFoundError: *path* does not exist, or a download is needed
                but *allow_download* is false.
        """
        cache = Path(cache_dir) if cache_dir else default_cache_dir()
        cache.mkdir(parents=True, exist_ok=True)

        src_path: Path
        if path is not None:
            src_path = Path(path)
            if not src_path.exists():
                raise FileNotFoundError(f"HPO file not found: {src_path}")
        else:
            src_path = cache / "hp.obo"
            if update or not src_path.exists():
                if not allow_download:
                    raise FileNotFoundError(
                        "No local HPO file and downloading is disabled. "
                        "Pass --hpo-file or allow downloading."
                    )
                log(f"Downloading HPO from {HPO_OBO_URL} (one-time) ...")
                _download(HPO_OBO_URL, src_path)
                log(f"Saved HPO to {src_path}")

        # Use a parsed-index pickle cache keyed on source size+mtime.
        stat = src_path.stat()
        key = f"{src_path.name}-{stat.st_size}-{int(stat.st_mtime)}.idx.pkl"
        idx_cache = cache / key
        if idx_cache.exists() and not update:
            try:
                # NOTE(security): unpickling can execute arbitrary code.  This
                # is only acceptable while the cache directory is writable
                # solely by the current user.
                with idx_cache.open("rb") as fh:
                    terms = pickle.load(fh)
                if not isinstance(terms, dict):
                    raise TypeError("index cache does not hold a term mapping")
                # Built inside the try so a stale cache whose objects no longer
                # match the current code also falls back to a re-parse.
                index = cls(terms, source=str(src_path))
            except Exception as exc:
                log(f"Ignoring unusable index cache {idx_cache.name}: {exc}")
            else:
                log(f"Loaded {len(terms)} HPO terms from cache.")
                return index

        log(f"Parsing HPO from {src_path} ...")
        terms = _parse_file(src_path)
        log(f"Indexed {len(terms)} HPO terms.")
        try:
            with idx_cache.open("wb") as fh:
                pickle.dump(terms, fh, protocol=pickle.HIGHEST_PROTOCOL)
        except Exception as exc:
            # The cache is an optimisation only; report and carry on.
            log(f"Could not write index cache {idx_cache}: {exc}")
        return cls(terms, source=str(src_path))
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
# --------------------------------------------------------------------- parsing
|
|
219
|
+
def _download(url: str, dest: Path, timeout: int = 120) -> None:
|
|
220
|
+
dest.parent.mkdir(parents=True, exist_ok=True)
|
|
221
|
+
tmp = dest.with_suffix(dest.suffix + ".part")
|
|
222
|
+
req = urllib.request.Request(url, headers={"User-Agent": "hponorm/1.0"})
|
|
223
|
+
with urllib.request.urlopen(req, timeout=timeout) as resp, tmp.open("wb") as out:
|
|
224
|
+
while True:
|
|
225
|
+
chunk = resp.read(1 << 16)
|
|
226
|
+
if not chunk:
|
|
227
|
+
break
|
|
228
|
+
out.write(chunk)
|
|
229
|
+
tmp.replace(dest)
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def _parse_file(path: Path) -> dict[str, HpoTerm]:
    """Parse *path* as OBO Graphs JSON or OBO flat file, chosen by content.

    The first 256 bytes are inspected: if the first non-whitespace byte is
    ``{`` the file is handed to ``_parse_obographs``, otherwise to
    ``_parse_obo``.
    """
    # Context manager so the sniffing handle is closed before re-opening.
    with path.open("rb") as fh:
        head = fh.read(256).lstrip()
    if head.startswith(b"{"):
        return _parse_obographs(path)
    return _parse_obo(path)
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def _parse_obo(path: Path) -> dict[str, HpoTerm]:
|
|
240
|
+
terms: dict[str, HpoTerm] = {}
|
|
241
|
+
cur: dict | None = None
|
|
242
|
+
in_term = False
|
|
243
|
+
|
|
244
|
+
def flush(block: dict | None) -> None:
|
|
245
|
+
if not block:
|
|
246
|
+
return
|
|
247
|
+
tid = block.get("id")
|
|
248
|
+
if not tid or not _CURIE_RE.match(tid):
|
|
249
|
+
return
|
|
250
|
+
terms[tid] = HpoTerm(
|
|
251
|
+
id=tid,
|
|
252
|
+
label=block.get("name", tid),
|
|
253
|
+
synonyms=block.get("synonyms", []),
|
|
254
|
+
definition=block.get("def"),
|
|
255
|
+
obsolete=block.get("obsolete", False),
|
|
256
|
+
)
|
|
257
|
+
|
|
258
|
+
syn_re = re.compile(r'synonym:\s*"((?:[^"\\]|\\.)*)"')
|
|
259
|
+
def_re = re.compile(r'def:\s*"((?:[^"\\]|\\.)*)"')
|
|
260
|
+
with path.open("r", encoding="utf-8", errors="replace") as fh:
|
|
261
|
+
for line in fh:
|
|
262
|
+
line = line.rstrip("\n")
|
|
263
|
+
if line.startswith("[Term]"):
|
|
264
|
+
flush(cur)
|
|
265
|
+
cur = {"synonyms": []}
|
|
266
|
+
in_term = True
|
|
267
|
+
continue
|
|
268
|
+
if line.startswith("[") and line.endswith("]"):
|
|
269
|
+
flush(cur)
|
|
270
|
+
cur = None
|
|
271
|
+
in_term = False
|
|
272
|
+
continue
|
|
273
|
+
if not in_term or cur is None:
|
|
274
|
+
continue
|
|
275
|
+
if line.startswith("id:"):
|
|
276
|
+
cur["id"] = line[3:].strip()
|
|
277
|
+
elif line.startswith("name:"):
|
|
278
|
+
cur["name"] = line[5:].strip()
|
|
279
|
+
elif line.startswith("def:"):
|
|
280
|
+
m = def_re.match(line)
|
|
281
|
+
if m:
|
|
282
|
+
cur["def"] = _unescape(m.group(1))
|
|
283
|
+
elif line.startswith("synonym:"):
|
|
284
|
+
m = syn_re.match(line)
|
|
285
|
+
if m:
|
|
286
|
+
cur["synonyms"].append(_unescape(m.group(1)))
|
|
287
|
+
elif line.startswith("is_obsolete:") and line.split(":", 1)[1].strip() == "true":
|
|
288
|
+
cur["obsolete"] = True
|
|
289
|
+
flush(cur)
|
|
290
|
+
return terms
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def _parse_obographs(path: Path) -> dict[str, HpoTerm]:
|
|
294
|
+
with path.open("r", encoding="utf-8", errors="replace") as fh:
|
|
295
|
+
data = json.load(fh)
|
|
296
|
+
terms: dict[str, HpoTerm] = {}
|
|
297
|
+
for graph in data.get("graphs", []):
|
|
298
|
+
for node in graph.get("nodes", []):
|
|
299
|
+
m = _IRI_RE.search(node.get("id", ""))
|
|
300
|
+
if not m:
|
|
301
|
+
continue
|
|
302
|
+
tid = f"HP:{m.group(1)}"
|
|
303
|
+
meta = node.get("meta", {}) or {}
|
|
304
|
+
synonyms = [s.get("val", "") for s in meta.get("synonyms", []) if s.get("val")]
|
|
305
|
+
definition = (meta.get("definition") or {}).get("val")
|
|
306
|
+
terms[tid] = HpoTerm(
|
|
307
|
+
id=tid,
|
|
308
|
+
label=node.get("lbl") or tid,
|
|
309
|
+
synonyms=synonyms,
|
|
310
|
+
definition=definition,
|
|
311
|
+
obsolete=bool(meta.get("deprecated", False)),
|
|
312
|
+
)
|
|
313
|
+
return terms
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
def _unescape(s: str) -> str:
|
|
317
|
+
return s.replace('\\"', '"').replace("\\n", " ").replace("\\\\", "\\").strip()
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""EBI Ontology Lookup Service (OLS4) backend for HPO.
|
|
2
|
+
|
|
3
|
+
Provides high-quality ranked search over the live HPO ontology without any
|
|
4
|
+
local download. Used when the machine is online; otherwise the CLI falls back
|
|
5
|
+
to the local :class:`~hponorm.hpo.HpoIndex`.
|
|
6
|
+
|
|
7
|
+
OLS4 API docs: https://www.ebi.ac.uk/ols4/help
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import urllib.parse
|
|
13
|
+
import urllib.request
|
|
14
|
+
|
|
15
|
+
from .suggest import Candidate
|
|
16
|
+
|
|
17
|
+
OLS_BASE = "https://www.ebi.ac.uk/ols4/api"


class OlsClient:
    """Search and lookup backend that queries the OLS4 REST API for HPO terms."""

    name = "ols"

    def __init__(self, base: str = OLS_BASE, timeout: int = 15):
        """Remember the API root (trailing slashes removed) and request timeout."""
        self.base = base.rstrip("/")
        self.timeout = timeout

    # ------------------------------------------------------------------ utils
    def _get_json(self, url: str) -> dict:
        """GET *url* and return the UTF-8 decoded JSON payload."""
        request = urllib.request.Request(url, headers={"User-Agent": "hponorm/1.0"})
        with urllib.request.urlopen(request, timeout=self.timeout) as resp:
            payload = resp.read()
        return json.loads(payload.decode("utf-8"))

    @staticmethod
    def _first_description(raw):
        """Reduce a description value to one item: the first element of a list
        (``None`` for an empty list), anything else unchanged."""
        if not isinstance(raw, list):
            return raw
        return raw[0] if raw else None

    def available(self) -> bool:
        """Report whether the HPO ontology endpoint can be fetched and parsed."""
        try:
            self._get_json(f"{self.base}/ontologies/hp")
        except Exception:
            return False
        return True

    # ----------------------------------------------------------------- search
    def search(self, query: str, limit: int = 8) -> list[Candidate]:
        """Query the search endpoint and convert the hits into candidates.

        Documents without an ``HP:`` id, obsolete documents and repeated ids
        are skipped.  The score is derived from the document's position in the
        response (100 for the first, minus 4 per position, floored at 1).
        A blank query returns an empty list without any request.
        """
        query = (query or "").strip()
        if not query:
            return []
        params = urllib.parse.urlencode(
            {
                "q": query,
                "ontology": "hp",
                "type": "class",
                "rows": max(limit, 10),
                "fieldList": "obo_id,label,description,synonym,is_obsolete",
            }
        )
        data = self._get_json(f"{self.base}/search?{params}")
        docs = (data.get("response") or {}).get("docs", [])
        # Insertion-ordered dict doubles as the "already seen" set.
        hits: dict[str, Candidate] = {}
        # OLS returns docs in relevance order; map that to a descending score.
        for position, doc in enumerate(docs):
            obo_id = doc.get("obo_id")
            if (
                not obo_id
                or not obo_id.startswith("HP:")
                or doc.get("is_obsolete")
                or obo_id in hits
            ):
                continue
            hits[obo_id] = Candidate(
                id=obo_id,
                label=doc.get("label") or obo_id,
                score=round(max(1.0, 100.0 - position * 4), 1),
                matched_on="OLS relevance",
                definition=self._first_description(doc.get("description")),
                source="ols",
            )
            if len(hits) >= limit:
                break
        return list(hits.values())

    # ------------------------------------------------------------------- term
    def get(self, curie: str) -> Candidate | None:
        """Fetch a single term by id.

        Returns ``None`` when the id cannot be normalised, the request fails,
        or the response contains no terms.
        """
        from .hpo import normalise_curie

        c = normalise_curie(curie)
        if not c:
            return None
        iri = f"http://purl.obolibrary.org/obo/{c.replace(':', '_')}"
        params = urllib.parse.urlencode({"iri": iri})
        try:
            data = self._get_json(f"{self.base}/ontologies/hp/terms?{params}")
        except Exception:
            return None
        terms = (data.get("_embedded") or {}).get("terms", [])
        if not terms:
            return None
        first = terms[0]
        return Candidate(
            id=c,
            label=first.get("label") or c,
            score=100.0,
            matched_on="exact id",
            definition=self._first_description(first.get("description")),
            source="ols",
        )
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""Read a phenopacket / family JSON, locate every phenotypic-feature ``type``,
|
|
2
|
+
apply chosen HPO mappings in place, and write the normalised file back out.
|
|
3
|
+
|
|
4
|
+
The finder is deliberately schema-tolerant: it recursively walks the document
|
|
5
|
+
and collects the ``type`` object of every entry under any ``phenotypicFeatures``
|
|
6
|
+
array. That covers the GA4GH ``Phenopacket.phenotypicFeatures`` and
|
|
7
|
+
``Family.proband`` / ``Family.relatives`` layouts (and any nesting of them)
|
|
8
|
+
without hard-coding paths. An entry's ``excluded`` flag is left untouched -- it
|
|
9
|
+
qualifies the feature, not the term, so the ``type`` is mapped either way.
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import json
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class FeatureRef:
    """Handle on one phenotypic-feature ``type`` object inside a loaded document.

    ``type`` is the very dict found in the parsed JSON, not a copy, so
    ``apply`` edits the document itself.
    """

    path: str  # location of the ``type`` object, as built by the finder
    type: dict  # the actual {"id": ..., "label": ...} dict inside the loaded JSON
    excluded: bool = False  # GA4GH PhenotypicFeature.excluded (informational)

    def _text(self, key: str) -> str:
        """Stripped value stored under *key*, or "" when missing or falsy."""
        raw = self.type.get(key)
        return raw.strip() if raw else ""

    @property
    def label(self) -> str:
        """Current label with surrounding whitespace removed."""
        return self._text("label")

    @property
    def id(self) -> str:
        """Current id with surrounding whitespace removed."""
        return self._text("id")

    def apply(self, curie: str, label: str) -> None:
        """Overwrite the referenced object's ``id`` and ``label`` in place."""
        self.type["id"] = curie
        self.type["label"] = label
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def load(path: str | Path) -> dict:
    """Parse the UTF-8 encoded JSON file at *path* and return its content."""
    raw_text = Path(path).read_text(encoding="utf-8")
    return json.loads(raw_text)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def find_phenotypic_features(obj, path: str = "$") -> list[FeatureRef]:
    """Walk *obj* depth-first and return a reference for each feature ``type``.

    A reference is produced for every dict entry of any list stored under a
    ``phenotypicFeatures`` key, provided that entry's ``type`` is a dict.
    Entries are themselves searched afterwards, so nested feature lists are
    found too.  *path* is the prefix used when building each reference's
    location string.
    """
    collected: list[FeatureRef] = []

    def visit(node, where):
        if isinstance(node, list):
            for pos, child in enumerate(node):
                visit(child, f"{where}[{pos}]")
            return
        if not isinstance(node, dict):
            return  # scalars hold nothing to search
        for field, child in node.items():
            if field != "phenotypicFeatures" or not isinstance(child, list):
                visit(child, f"{where}.{field}")
                continue
            for pos, feature in enumerate(child):
                here = f"{where}.phenotypicFeatures[{pos}]"
                term = feature.get("type") if isinstance(feature, dict) else None
                if isinstance(term, dict):
                    collected.append(
                        FeatureRef(
                            f"{here}.type",
                            term,
                            bool(feature.get("excluded", False)),
                        )
                    )
                visit(feature, here)

    visit(obj, path)
    return collected
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def group_by_label(refs: list[FeatureRef]) -> "dict[tuple[str, str], list[FeatureRef]]":
    """Bucket references by their current ``(label, id)`` pair.

    Keys appear in order of first occurrence and each bucket keeps the input
    order, so one decision per distinct pair can be applied to all of its
    occurrences.
    """
    buckets: dict[tuple[str, str], list[FeatureRef]] = {}
    for ref in refs:
        pair = (ref.label, ref.id)
        if pair not in buckets:
            buckets[pair] = []
        buckets[pair].append(ref)
    return buckets
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def save(data: dict, path: str | Path) -> None:
    """Write *data* to *path* as UTF-8 JSON.

    Output is indented by two spaces, keeps non-ASCII characters unescaped and
    ends with a newline.
    """
    target = Path(path)
    with target.open("w", encoding="utf-8") as out:
        json.dump(data, out, ensure_ascii=False, indent=2)
        out.write("\n")
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""Unified suggestion layer.
|
|
2
|
+
|
|
3
|
+
A ``Suggester`` wraps one of two interchangeable backends:
|
|
4
|
+
|
|
5
|
+
* ``hpo`` - a local :class:`~hponorm.hpo.HpoIndex` (offline capable)
|
|
6
|
+
* ``ols`` - the EBI Ontology Lookup Service v4 (online, best ranking)
|
|
7
|
+
|
|
8
|
+
and exposes a backend-agnostic API used by the CLI:
|
|
9
|
+
|
|
10
|
+
* ``suggest(label)`` -> ranked list of :class:`Candidate`
|
|
11
|
+
* ``resolve(curie)`` -> validate a specific HPO id and return its canonical term
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from dataclasses import dataclass
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class Candidate:
    """One ontology term proposed as a match, together with how it was found."""

    id: str  # term CURIE, e.g. "HP:0001657"
    label: str  # canonical HPO label
    score: float  # 0-100 match score (higher is better)
    matched_on: str  # what produced the hit, e.g. "label" or "synonym: <text>"
    definition: str | None = None
    source: str = ""  # which backend produced it

    def short_def(self, n: int = 90) -> str:
        """Return the definition on one line, cut to at most *n* characters.

        Runs of whitespace are collapsed to single spaces.  Text longer than
        *n* is truncated and ends with an ellipsis character.  A missing
        definition gives "".
        """
        if not self.definition:
            return ""
        flat = " ".join(self.definition.split())
        if len(flat) > n:
            return flat[: n - 1] + "\u2026"
        return flat
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class Suggester:
    """Front end over a primary search backend and an optional fallback."""

    def __init__(self, backend, *, fallback=None, name: str = ""):
        """Store the backends; *name* defaults to the backend's own ``name``
        attribute, or "backend" when it has none."""
        self.backend = backend
        self.fallback = fallback
        self.name = name if name else getattr(backend, "name", "backend")

    @staticmethod
    def _safe_search(engine, label: str, limit: int):
        """Run ``engine.search`` and turn any exception into an empty list."""
        try:
            return engine.search(label, limit=limit)
        except Exception:
            return []

    def suggest(self, label: str, limit: int = 8) -> list[Candidate]:
        """Return the primary backend's results for *label*.

        The fallback is consulted only when the primary backend fails or
        finds nothing.
        """
        hits = self._safe_search(self.backend, label, limit)
        if hits or self.fallback is None:
            return hits
        return self._safe_search(self.fallback, label, limit)

    def resolve(self, curie: str) -> Candidate | None:
        """Look up a specific HPO id to validate it and fetch its canonical label.

        Backends are tried in order (primary, then fallback) and the first one
        that knows the id wins.  Returns ``None`` when none of them does.
        """
        for engine in (self.backend, self.fallback):
            if engine is None:
                continue
            try:
                term = engine.get(curie)
            except Exception:
                continue  # a failing backend counts as "not found"
            if term is None:
                continue
            # ``HpoIndex.get`` returns an HpoTerm; OLS returns a Candidate.
            if isinstance(term, Candidate):
                return term
            return Candidate(
                id=term.id,
                label=term.label,
                score=100.0,
                matched_on="exact id",
                definition=getattr(term, "definition", None),
                source=getattr(engine, "name", self.name),
            )
        return None
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hponorm
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Interactively validate and normalise GA4GH phenopacket phenotypic-feature terms against the HPO ontology
|
|
5
|
+
Requires-Python: >=3.9
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Provides-Extra: recommended
|
|
8
|
+
Requires-Dist: rapidfuzz>=3.0; extra == "recommended"
|
|
9
|
+
Requires-Dist: rich>=13.0; extra == "recommended"
|
|
10
|
+
Dynamic: license-file
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
hponorm/cli.py
|
|
5
|
+
hponorm/hpo.py
|
|
6
|
+
hponorm/ols.py
|
|
7
|
+
hponorm/phenopackets.py
|
|
8
|
+
hponorm/suggest.py
|
|
9
|
+
hponorm.egg-info/PKG-INFO
|
|
10
|
+
hponorm.egg-info/SOURCES.txt
|
|
11
|
+
hponorm.egg-info/dependency_links.txt
|
|
12
|
+
hponorm.egg-info/entry_points.txt
|
|
13
|
+
hponorm.egg-info/requires.txt
|
|
14
|
+
hponorm.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
hponorm
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "hponorm"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "Interactively validate and normalise GA4GH phenopacket phenotypic-feature terms against the HPO ontology"
|
|
9
|
+
requires-python = ">=3.9"
|
|
10
|
+
dependencies = [] # stdlib-only by default
|
|
11
|
+
|
|
12
|
+
[project.optional-dependencies]
|
|
13
|
+
recommended = ["rapidfuzz>=3.0", "rich>=13.0"]
|
|
14
|
+
|
|
15
|
+
[project.scripts]
|
|
16
|
+
hponorm = "hponorm.cli:main"
|
|
17
|
+
|
|
18
|
+
[tool.setuptools]
|
|
19
|
+
packages = ["hponorm"]
|
|
20
|
+
|
|
21
|
+
[tool.setuptools.package-data]
|
|
22
|
+
hponorm = ["data/*.obo"]
|
hponorm-1.0.0/setup.cfg
ADDED