ddharmon 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddharmon-0.1.0/LICENSE +21 -0
- ddharmon-0.1.0/PKG-INFO +316 -0
- ddharmon-0.1.0/README.md +287 -0
- ddharmon-0.1.0/pyproject.toml +92 -0
- ddharmon-0.1.0/src/ddharmon/__init__.py +54 -0
- ddharmon-0.1.0/src/ddharmon/client.py +264 -0
- ddharmon-0.1.0/src/ddharmon/exceptions.py +39 -0
- ddharmon-0.1.0/src/ddharmon/extras/__init__.py +1 -0
- ddharmon-0.1.0/src/ddharmon/extras/metabolon/__init__.py +29 -0
- ddharmon-0.1.0/src/ddharmon/extras/metabolon/export.py +107 -0
- ddharmon-0.1.0/src/ddharmon/extras/metabolon/preprocessing.py +239 -0
- ddharmon-0.1.0/src/ddharmon/mapper.py +117 -0
- ddharmon-0.1.0/src/ddharmon/models.py +218 -0
ddharmon-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Trent Leslie
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
ddharmon-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: ddharmon
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python client for the BioMapper2 API — map biological entities to standardized knowledge graph identifiers
|
|
5
|
+
License: MIT
|
|
6
|
+
Keywords: bioinformatics,metabolomics,knowledge-graph,biomapper,multi-omics
|
|
7
|
+
Author: Trent Leslie
|
|
8
|
+
Author-email: trent@phenomehealth.org
|
|
9
|
+
Requires-Python: >=3.11,<4.0
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
18
|
+
Provides-Extra: all
|
|
19
|
+
Provides-Extra: metabolon
|
|
20
|
+
Provides-Extra: notebook
|
|
21
|
+
Requires-Dist: httpx (>=0.27,<0.28)
|
|
22
|
+
Requires-Dist: pydantic (>=2.0,<3.0)
|
|
23
|
+
Requires-Dist: python-dotenv (>=1.0,<2.0)
|
|
24
|
+
Project-URL: Documentation, https://github.com/trentleslie/ddharmon#readme
|
|
25
|
+
Project-URL: Homepage, https://github.com/trentleslie/ddharmon
|
|
26
|
+
Project-URL: Repository, https://github.com/trentleslie/ddharmon
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
|
|
29
|
+
# ddharmon
|
|
30
|
+
|
|
31
|
+
Python client for the **BioMapper2 API** — map biological entity names to
|
|
32
|
+
standardized knowledge-graph identifiers (CHEBI, HMDB, PubChem, RefMet, and more).
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
from ddharmon import map_entity
|
|
36
|
+
|
|
37
|
+
result = map_entity("L-Histidine")
|
|
38
|
+
print(result.primary_curie) # RM:0129894
|
|
39
|
+
print(result.confidence_tier) # high
|
|
40
|
+
print(result.ids_for("CHEBI")) # ['15971']
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## Installation
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
# Core (async HTTP client + Pydantic models)
|
|
49
|
+
pip install ddharmon
|
|
50
|
+
|
|
51
|
+
# With Metabolon preprocessing utilities (pandas, openpyxl)
|
|
52
|
+
pip install "ddharmon[metabolon]"
|
|
53
|
+
|
|
54
|
+
# With notebook progress bars (tqdm, nest-asyncio)
|
|
55
|
+
pip install "ddharmon[notebook]"
|
|
56
|
+
|
|
57
|
+
# Everything
|
|
58
|
+
pip install "ddharmon[all]"
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## Getting an API key
|
|
64
|
+
|
|
65
|
+
The BioMapper2 API requires an API key. To request access, email
|
|
66
|
+
[trent.leslie@phenomehealth.org](mailto:trent.leslie@phenomehealth.org).
|
|
67
|
+
|
|
68
|
+
Once you have a key, set it in your environment:
|
|
69
|
+
```bash
|
|
70
|
+
export BIOMAPPER_API_KEY=your-key-here
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Or add it to a `.env` file in your project root:
|
|
74
|
+
```
|
|
75
|
+
BIOMAPPER_API_KEY=your-key-here
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
ddharmon will pick it up automatically from either location.
|
|
79
|
+
|
|
80
|
+
---
|
|
81
|
+
|
|
82
|
+
## Quick start
|
|
83
|
+
|
|
84
|
+
### Single lookup (synchronous)
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
from ddharmon import map_entity
|
|
88
|
+
|
|
89
|
+
result = map_entity("L-Histidine")
|
|
90
|
+
|
|
91
|
+
print(result.resolved) # True
|
|
92
|
+
print(result.primary_curie) # RM:0129894
|
|
93
|
+
print(result.chosen_kg_id) # CHEBI:15971
|
|
94
|
+
print(result.confidence_score) # 2.489
|
|
95
|
+
print(result.confidence_tier) # high (≥2.0)
|
|
96
|
+
print(result.ids_for("CHEBI")) # ['15971']
|
|
97
|
+
print(result.ids_for("refmet_id")) # ['RM0129894']
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### Batch mapping (synchronous)
|
|
101
|
+
|
|
102
|
+
```python
|
|
103
|
+
from ddharmon import map_entities, summarize
|
|
104
|
+
|
|
105
|
+
records = [
|
|
106
|
+
{"name": "L-Histidine"},
|
|
107
|
+
{"name": "Glucose", "identifiers": {"HMDB": "HMDB00122"}},
|
|
108
|
+
{"name": "Sphinganine"},
|
|
109
|
+
]
|
|
110
|
+
|
|
111
|
+
results = map_entities(records, progress=True) # tqdm progress bar (install the [notebook] extra)
|
|
112
|
+
summary = summarize(results)
|
|
113
|
+
|
|
114
|
+
print(f"{summary.resolved}/{summary.total_queried} resolved")
|
|
115
|
+
print(f"Resolution rate: {summary.resolution_rate:.1%}")
|
|
116
|
+
print(summary.vocabulary_coverage)
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
### Async usage
|
|
120
|
+
|
|
121
|
+
```python
|
|
122
|
+
import asyncio
|
|
123
|
+
from ddharmon import BioMapperClient
|
|
124
|
+
|
|
125
|
+
async def main() -> None:
|
|
126
|
+
async with BioMapperClient() as client:
|
|
127
|
+
# Verify connectivity
|
|
128
|
+
health = await client.health_check()
|
|
129
|
+
print(health) # {'status': 'healthy', ...}
|
|
130
|
+
|
|
131
|
+
# Single
|
|
132
|
+
result = await client.map_entity(
|
|
133
|
+
"L-Histidine",
|
|
134
|
+
identifiers={"HMDB": "HMDB00177"},
|
|
135
|
+
)
|
|
136
|
+
|
|
137
|
+
# Batch with rate limiting
|
|
138
|
+
results = await client.map_entities(
|
|
139
|
+
[{"name": "L-Histidine"}, {"name": "Glucose"}],
|
|
140
|
+
rate_limit_delay=0.3,
|
|
141
|
+
progress=True,
|
|
142
|
+
)
|
|
143
|
+
|
|
144
|
+
asyncio.run(main())
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
### Jupyter notebooks
|
|
148
|
+
|
|
149
|
+
Apply `nest_asyncio` before using sync helpers inside a running event loop:
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
import nest_asyncio
|
|
153
|
+
nest_asyncio.apply() # required in Jupyter
|
|
154
|
+
|
|
155
|
+
from ddharmon import map_entities
|
|
156
|
+
results = map_entities([{"name": "L-Histidine"}], progress=True)
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
---
|
|
160
|
+
|
|
161
|
+
## Metabolon extras
|
|
162
|
+
|
|
163
|
+
The `ddharmon[metabolon]` extra ships helpers that replicate and generalize the
|
|
164
|
+
preprocessing from the BioVector-eval Metabolon tutorial notebook.
|
|
165
|
+
|
|
166
|
+
```python
|
|
167
|
+
import pandas as pd
|
|
168
|
+
from ddharmon import map_entities, summarize
|
|
169
|
+
from ddharmon.extras.metabolon import (
|
|
170
|
+
build_mapping_queue,
|
|
171
|
+
clean_compound_name,
|
|
172
|
+
extract_hmdb_id,
|
|
173
|
+
)
|
|
174
|
+
from ddharmon.extras.metabolon.export import save_results, results_to_dataframe
|
|
175
|
+
|
|
176
|
+
# 1. Load your Metabolon features spreadsheet
|
|
177
|
+
df = pd.read_excel("Metabolon_unknown_combined_features_metadata.xlsx")
|
|
178
|
+
|
|
179
|
+
# 2. Build a deduplicated mapping queue
|
|
180
|
+
# - cleans compound names (strips quotes, _CE## suffixes)
|
|
181
|
+
# - extracts HMDB hints from ms1_compound_name
|
|
182
|
+
# - deduplicates by cleaned name, tracking all feature_ids
|
|
183
|
+
queue = build_mapping_queue(
|
|
184
|
+
df,
|
|
185
|
+
name_col="matched_name",
|
|
186
|
+
hint_col="ms1_compound_name",
|
|
187
|
+
limit=50, # set to None for full run
|
|
188
|
+
)
|
|
189
|
+
|
|
190
|
+
print(f"{len(queue)} unique names to map")
|
|
191
|
+
print(f" with HMDB hints: {sum(1 for r in queue if r.hmdb_hint)}")
|
|
192
|
+
|
|
193
|
+
# 3. Map (convert queue → API records first)
|
|
194
|
+
results = map_entities(
|
|
195
|
+
[r.as_api_record() for r in queue],
|
|
196
|
+
rate_limit_delay=0.3,
|
|
197
|
+
progress=True,
|
|
198
|
+
)
|
|
199
|
+
|
|
200
|
+
# 4. Summarize
|
|
201
|
+
summary = summarize(results)
|
|
202
|
+
print(f"Resolution rate: {summary.resolution_rate:.1%}")
|
|
203
|
+
|
|
204
|
+
# 5. Export
|
|
205
|
+
save_results(
|
|
206
|
+
results,
|
|
207
|
+
summary=summary,
|
|
208
|
+
json_path="output/mapping.json",
|
|
209
|
+
tsv_path="output/mapping.tsv",
|
|
210
|
+
)
|
|
211
|
+
|
|
212
|
+
# Or work directly in pandas
|
|
213
|
+
result_df = results_to_dataframe(results)
|
|
214
|
+
print(result_df[["query_name", "primary_curie", "confidence_tier"]].head())
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
### Preprocessing functions
|
|
218
|
+
|
|
219
|
+
```python
|
|
220
|
+
from ddharmon.extras.metabolon import clean_compound_name, extract_hmdb_id
|
|
221
|
+
|
|
222
|
+
# Strip quotes and collision-energy suffixes
|
|
223
|
+
clean_compound_name('"1,3-Diphenylguanidine_CE45"') # '1,3-Diphenylguanidine'
|
|
224
|
+
clean_compound_name('"4,6-DIOXOHEPTANOIC ACID"') # '4,6-DIOXOHEPTANOIC ACID'
|
|
225
|
+
clean_compound_name('L-Histidine') # 'L-Histidine' (unchanged)
|
|
226
|
+
|
|
227
|
+
# Extract HMDB accessions from ms1_compound_name format
|
|
228
|
+
extract_hmdb_id('HMDB:HMDB03349-2257 L-Dihydroorotic acid') # 'HMDB03349'
|
|
229
|
+
extract_hmdb_id('HMDB00177') # 'HMDB00177'
|
|
230
|
+
extract_hmdb_id(None) # None
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
---
|
|
234
|
+
|
|
235
|
+
## API reference
|
|
236
|
+
|
|
237
|
+
### `MappingResult`
|
|
238
|
+
|
|
239
|
+
| Attribute | Type | Description |
|
|
240
|
+
|---|---|---|
|
|
241
|
+
| `query_name` | `str` | Name submitted to the API |
|
|
242
|
+
| `resolved` | `bool` | Whether any identifier was returned |
|
|
243
|
+
| `primary_curie` | `str \| None` | First CURIE in the response |
|
|
244
|
+
| `chosen_kg_id` | `str \| None` | Resolver-selected knowledge graph ID |
|
|
245
|
+
| `confidence_score` | `float \| None` | Highest score across annotators |
|
|
246
|
+
| `confidence_tier` | `str` | `"high"` (≥2.0) / `"medium"` (≥1.0 and <2.0) / `"low"` (<1.0) / `"unknown"` (no score) |
|
|
247
|
+
| `identifiers` | `dict[str, list[str]]` | Vocabulary → IDs, e.g. `{"CHEBI": ["15971"]}` |
|
|
248
|
+
| `hmdb_hint` | `str \| None` | HMDB hint passed in the request |
|
|
249
|
+
| `error` | `str \| None` | Error message if mapping failed |
|
|
250
|
+
|
|
251
|
+
```python
|
|
252
|
+
result.ids_for("CHEBI") # ['15971']
|
|
253
|
+
result.ids_for("refmet_id") # ['RM0129894']
|
|
254
|
+
result.ids_for("PUBCHEM.COMPOUND") # []
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
### Confidence tiers
|
|
258
|
+
|
|
259
|
+
| Score | Tier | Recommended action |
|
|
260
|
+
|---|---|---|
|
|
261
|
+
| ≥ 2.0 | `high` | Accept without review |
|
|
262
|
+
| ≥ 1.0 and < 2.0 | `medium` | Quick sanity check |
|
|
263
|
+
| < 1.0 | `low` | Manual review recommended |
|
|
264
|
+
| `None` | `unknown` | No score returned (e.g. HMDB-hint resolved) |
|
|
265
|
+
|
|
266
|
+
### Error handling
|
|
267
|
+
|
|
268
|
+
```python
|
|
269
|
+
from ddharmon import (
|
|
270
|
+
BioMapperError, # base class
|
|
271
|
+
BioMapperAuthError, # 401/403 — bad API key
|
|
272
|
+
BioMapperRateLimitError, # 429 — throttled
|
|
273
|
+
BioMapperServerError, # 5xx
|
|
274
|
+
BioMapperTimeoutError, # request timeout
|
|
275
|
+
BioMapperConfigError, # missing API key / bad config
|
|
276
|
+
)
|
|
277
|
+
|
|
278
|
+
try:
|
|
279
|
+
result = map_entity("Glucose")
|
|
280
|
+
except BioMapperRateLimitError as e:
|
|
281
|
+
print(f"Throttled. Retry after: {e.retry_after}s")
|
|
282
|
+
except BioMapperAuthError:
|
|
283
|
+
print("Check your BIOMAPPER_API_KEY")
|
|
284
|
+
```
|
|
285
|
+
|
|
286
|
+
In batch mode (`map_entities`), per-record errors are caught and returned as
|
|
287
|
+
`MappingResult(error=...)` rather than aborting the batch.
|
|
288
|
+
|
|
289
|
+
---
|
|
290
|
+
|
|
291
|
+
## Development
|
|
292
|
+
|
|
293
|
+
```bash
|
|
294
|
+
git clone https://github.com/trentleslie/ddharmon
|
|
295
|
+
cd ddharmon
|
|
296
|
+
poetry install --with dev --extras all
|
|
297
|
+
|
|
298
|
+
make check # format → lint → type-check → test
|
|
299
|
+
make test # tests only
|
|
300
|
+
make coverage # HTML coverage report
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
---
|
|
304
|
+
|
|
305
|
+
## License
|
|
306
|
+
|
|
307
|
+
MIT — see [LICENSE](LICENSE).
|
|
308
|
+
|
|
309
|
+
---
|
|
310
|
+
|
|
311
|
+
## Related
|
|
312
|
+
|
|
313
|
+
- **BioMapper2 API**: `https://biomapper.expertintheloop.io`
|
|
314
|
+
- **EITL platform**: `https://expertintheloop.io`
|
|
315
|
+
- **biovector-eval notebooks**: `https://github.com/trentleslie/biovector-eval`
|
|
316
|
+
|
ddharmon-0.1.0/README.md
ADDED
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
# ddharmon
|
|
2
|
+
|
|
3
|
+
Python client for the **BioMapper2 API** — map biological entity names to
|
|
4
|
+
standardized knowledge-graph identifiers (CHEBI, HMDB, PubChem, RefMet, and more).
|
|
5
|
+
|
|
6
|
+
```python
|
|
7
|
+
from ddharmon import map_entity
|
|
8
|
+
|
|
9
|
+
result = map_entity("L-Histidine")
|
|
10
|
+
print(result.primary_curie) # RM:0129894
|
|
11
|
+
print(result.confidence_tier) # high
|
|
12
|
+
print(result.ids_for("CHEBI")) # ['15971']
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## Installation
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
# Core (async HTTP client + Pydantic models)
|
|
21
|
+
pip install ddharmon
|
|
22
|
+
|
|
23
|
+
# With Metabolon preprocessing utilities (pandas, openpyxl)
|
|
24
|
+
pip install "ddharmon[metabolon]"
|
|
25
|
+
|
|
26
|
+
# With notebook progress bars (tqdm, nest-asyncio)
|
|
27
|
+
pip install "ddharmon[notebook]"
|
|
28
|
+
|
|
29
|
+
# Everything
|
|
30
|
+
pip install "ddharmon[all]"
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## Getting an API key
|
|
36
|
+
|
|
37
|
+
The BioMapper2 API requires an API key. To request access, email
|
|
38
|
+
[trent.leslie@phenomehealth.org](mailto:trent.leslie@phenomehealth.org).
|
|
39
|
+
|
|
40
|
+
Once you have a key, set it in your environment:
|
|
41
|
+
```bash
|
|
42
|
+
export BIOMAPPER_API_KEY=your-key-here
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Or add it to a `.env` file in your project root:
|
|
46
|
+
```
|
|
47
|
+
BIOMAPPER_API_KEY=your-key-here
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
ddharmon will pick it up automatically from either location.
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## Quick start
|
|
55
|
+
|
|
56
|
+
### Single lookup (synchronous)
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from ddharmon import map_entity
|
|
60
|
+
|
|
61
|
+
result = map_entity("L-Histidine")
|
|
62
|
+
|
|
63
|
+
print(result.resolved) # True
|
|
64
|
+
print(result.primary_curie) # RM:0129894
|
|
65
|
+
print(result.chosen_kg_id) # CHEBI:15971
|
|
66
|
+
print(result.confidence_score) # 2.489
|
|
67
|
+
print(result.confidence_tier) # high (≥2.0)
|
|
68
|
+
print(result.ids_for("CHEBI")) # ['15971']
|
|
69
|
+
print(result.ids_for("refmet_id")) # ['RM0129894']
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Batch mapping (synchronous)
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
from ddharmon import map_entities, summarize
|
|
76
|
+
|
|
77
|
+
records = [
|
|
78
|
+
{"name": "L-Histidine"},
|
|
79
|
+
{"name": "Glucose", "identifiers": {"HMDB": "HMDB00122"}},
|
|
80
|
+
{"name": "Sphinganine"},
|
|
81
|
+
]
|
|
82
|
+
|
|
83
|
+
results = map_entities(records, progress=True) # tqdm progress bar (install the [notebook] extra)
|
|
84
|
+
summary = summarize(results)
|
|
85
|
+
|
|
86
|
+
print(f"{summary.resolved}/{summary.total_queried} resolved")
|
|
87
|
+
print(f"Resolution rate: {summary.resolution_rate:.1%}")
|
|
88
|
+
print(summary.vocabulary_coverage)
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### Async usage
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
import asyncio
|
|
95
|
+
from ddharmon import BioMapperClient
|
|
96
|
+
|
|
97
|
+
async def main() -> None:
|
|
98
|
+
async with BioMapperClient() as client:
|
|
99
|
+
# Verify connectivity
|
|
100
|
+
health = await client.health_check()
|
|
101
|
+
print(health) # {'status': 'healthy', ...}
|
|
102
|
+
|
|
103
|
+
# Single
|
|
104
|
+
result = await client.map_entity(
|
|
105
|
+
"L-Histidine",
|
|
106
|
+
identifiers={"HMDB": "HMDB00177"},
|
|
107
|
+
)
|
|
108
|
+
|
|
109
|
+
# Batch with rate limiting
|
|
110
|
+
results = await client.map_entities(
|
|
111
|
+
[{"name": "L-Histidine"}, {"name": "Glucose"}],
|
|
112
|
+
rate_limit_delay=0.3,
|
|
113
|
+
progress=True,
|
|
114
|
+
)
|
|
115
|
+
|
|
116
|
+
asyncio.run(main())
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
### Jupyter notebooks
|
|
120
|
+
|
|
121
|
+
Apply `nest_asyncio` before using sync helpers inside a running event loop:
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
import nest_asyncio
|
|
125
|
+
nest_asyncio.apply() # required in Jupyter
|
|
126
|
+
|
|
127
|
+
from ddharmon import map_entities
|
|
128
|
+
results = map_entities([{"name": "L-Histidine"}], progress=True)
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
---
|
|
132
|
+
|
|
133
|
+
## Metabolon extras
|
|
134
|
+
|
|
135
|
+
The `ddharmon[metabolon]` extra ships helpers that replicate and generalize the
|
|
136
|
+
preprocessing from the BioVector-eval Metabolon tutorial notebook.
|
|
137
|
+
|
|
138
|
+
```python
|
|
139
|
+
import pandas as pd
|
|
140
|
+
from ddharmon import map_entities, summarize
|
|
141
|
+
from ddharmon.extras.metabolon import (
|
|
142
|
+
build_mapping_queue,
|
|
143
|
+
clean_compound_name,
|
|
144
|
+
extract_hmdb_id,
|
|
145
|
+
)
|
|
146
|
+
from ddharmon.extras.metabolon.export import save_results, results_to_dataframe
|
|
147
|
+
|
|
148
|
+
# 1. Load your Metabolon features spreadsheet
|
|
149
|
+
df = pd.read_excel("Metabolon_unknown_combined_features_metadata.xlsx")
|
|
150
|
+
|
|
151
|
+
# 2. Build a deduplicated mapping queue
|
|
152
|
+
# - cleans compound names (strips quotes, _CE## suffixes)
|
|
153
|
+
# - extracts HMDB hints from ms1_compound_name
|
|
154
|
+
# - deduplicates by cleaned name, tracking all feature_ids
|
|
155
|
+
queue = build_mapping_queue(
|
|
156
|
+
df,
|
|
157
|
+
name_col="matched_name",
|
|
158
|
+
hint_col="ms1_compound_name",
|
|
159
|
+
limit=50, # set to None for full run
|
|
160
|
+
)
|
|
161
|
+
|
|
162
|
+
print(f"{len(queue)} unique names to map")
|
|
163
|
+
print(f" with HMDB hints: {sum(1 for r in queue if r.hmdb_hint)}")
|
|
164
|
+
|
|
165
|
+
# 3. Map (convert queue → API records first)
|
|
166
|
+
results = map_entities(
|
|
167
|
+
[r.as_api_record() for r in queue],
|
|
168
|
+
rate_limit_delay=0.3,
|
|
169
|
+
progress=True,
|
|
170
|
+
)
|
|
171
|
+
|
|
172
|
+
# 4. Summarize
|
|
173
|
+
summary = summarize(results)
|
|
174
|
+
print(f"Resolution rate: {summary.resolution_rate:.1%}")
|
|
175
|
+
|
|
176
|
+
# 5. Export
|
|
177
|
+
save_results(
|
|
178
|
+
results,
|
|
179
|
+
summary=summary,
|
|
180
|
+
json_path="output/mapping.json",
|
|
181
|
+
tsv_path="output/mapping.tsv",
|
|
182
|
+
)
|
|
183
|
+
|
|
184
|
+
# Or work directly in pandas
|
|
185
|
+
result_df = results_to_dataframe(results)
|
|
186
|
+
print(result_df[["query_name", "primary_curie", "confidence_tier"]].head())
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
### Preprocessing functions
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
from ddharmon.extras.metabolon import clean_compound_name, extract_hmdb_id
|
|
193
|
+
|
|
194
|
+
# Strip quotes and collision-energy suffixes
|
|
195
|
+
clean_compound_name('"1,3-Diphenylguanidine_CE45"') # '1,3-Diphenylguanidine'
|
|
196
|
+
clean_compound_name('"4,6-DIOXOHEPTANOIC ACID"') # '4,6-DIOXOHEPTANOIC ACID'
|
|
197
|
+
clean_compound_name('L-Histidine') # 'L-Histidine' (unchanged)
|
|
198
|
+
|
|
199
|
+
# Extract HMDB accessions from ms1_compound_name format
|
|
200
|
+
extract_hmdb_id('HMDB:HMDB03349-2257 L-Dihydroorotic acid') # 'HMDB03349'
|
|
201
|
+
extract_hmdb_id('HMDB00177') # 'HMDB00177'
|
|
202
|
+
extract_hmdb_id(None) # None
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
---
|
|
206
|
+
|
|
207
|
+
## API reference
|
|
208
|
+
|
|
209
|
+
### `MappingResult`
|
|
210
|
+
|
|
211
|
+
| Attribute | Type | Description |
|
|
212
|
+
|---|---|---|
|
|
213
|
+
| `query_name` | `str` | Name submitted to the API |
|
|
214
|
+
| `resolved` | `bool` | Whether any identifier was returned |
|
|
215
|
+
| `primary_curie` | `str \| None` | First CURIE in the response |
|
|
216
|
+
| `chosen_kg_id` | `str \| None` | Resolver-selected knowledge graph ID |
|
|
217
|
+
| `confidence_score` | `float \| None` | Highest score across annotators |
|
|
218
|
+
| `confidence_tier` | `str` | `"high"` (≥2.0) / `"medium"` (≥1.0 and <2.0) / `"low"` (<1.0) / `"unknown"` (no score) |
|
|
219
|
+
| `identifiers` | `dict[str, list[str]]` | Vocabulary → IDs, e.g. `{"CHEBI": ["15971"]}` |
|
|
220
|
+
| `hmdb_hint` | `str \| None` | HMDB hint passed in the request |
|
|
221
|
+
| `error` | `str \| None` | Error message if mapping failed |
|
|
222
|
+
|
|
223
|
+
```python
|
|
224
|
+
result.ids_for("CHEBI") # ['15971']
|
|
225
|
+
result.ids_for("refmet_id") # ['RM0129894']
|
|
226
|
+
result.ids_for("PUBCHEM.COMPOUND") # []
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
### Confidence tiers
|
|
230
|
+
|
|
231
|
+
| Score | Tier | Recommended action |
|
|
232
|
+
|---|---|---|
|
|
233
|
+
| ≥ 2.0 | `high` | Accept without review |
|
|
234
|
+
| ≥ 1.0 and < 2.0 | `medium` | Quick sanity check |
|
|
235
|
+
| < 1.0 | `low` | Manual review recommended |
|
|
236
|
+
| `None` | `unknown` | No score returned (e.g. HMDB-hint resolved) |
|
|
237
|
+
|
|
238
|
+
### Error handling
|
|
239
|
+
|
|
240
|
+
```python
|
|
241
|
+
from ddharmon import (
|
|
242
|
+
BioMapperError, # base class
|
|
243
|
+
BioMapperAuthError, # 401/403 — bad API key
|
|
244
|
+
BioMapperRateLimitError, # 429 — throttled
|
|
245
|
+
BioMapperServerError, # 5xx
|
|
246
|
+
BioMapperTimeoutError, # request timeout
|
|
247
|
+
BioMapperConfigError, # missing API key / bad config
|
|
248
|
+
)
|
|
249
|
+
|
|
250
|
+
try:
|
|
251
|
+
result = map_entity("Glucose")
|
|
252
|
+
except BioMapperRateLimitError as e:
|
|
253
|
+
print(f"Throttled. Retry after: {e.retry_after}s")
|
|
254
|
+
except BioMapperAuthError:
|
|
255
|
+
print("Check your BIOMAPPER_API_KEY")
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
In batch mode (`map_entities`), per-record errors are caught and returned as
|
|
259
|
+
`MappingResult(error=...)` rather than aborting the batch.
|
|
260
|
+
|
|
261
|
+
---
|
|
262
|
+
|
|
263
|
+
## Development
|
|
264
|
+
|
|
265
|
+
```bash
|
|
266
|
+
git clone https://github.com/trentleslie/ddharmon
|
|
267
|
+
cd ddharmon
|
|
268
|
+
poetry install --with dev --extras all
|
|
269
|
+
|
|
270
|
+
make check # format → lint → type-check → test
|
|
271
|
+
make test # tests only
|
|
272
|
+
make coverage # HTML coverage report
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
---
|
|
276
|
+
|
|
277
|
+
## License
|
|
278
|
+
|
|
279
|
+
MIT — see [LICENSE](LICENSE).
|
|
280
|
+
|
|
281
|
+
---
|
|
282
|
+
|
|
283
|
+
## Related
|
|
284
|
+
|
|
285
|
+
- **BioMapper2 API**: `https://biomapper.expertintheloop.io`
|
|
286
|
+
- **EITL platform**: `https://expertintheloop.io`
|
|
287
|
+
- **biovector-eval notebooks**: `https://github.com/trentleslie/biovector-eval`
|
ddharmon-0.1.0/pyproject.toml
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "ddharmon"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Python client for the BioMapper2 API — map biological entities to standardized knowledge graph identifiers"
|
|
5
|
+
authors = ["Trent Leslie <trent@phenomehealth.org>"]
|
|
6
|
+
license = "MIT"
|
|
7
|
+
readme = "README.md"
|
|
8
|
+
homepage = "https://github.com/trentleslie/ddharmon"
|
|
9
|
+
repository = "https://github.com/trentleslie/ddharmon"
|
|
10
|
+
documentation = "https://github.com/trentleslie/ddharmon#readme"
|
|
11
|
+
keywords = ["bioinformatics", "metabolomics", "knowledge-graph", "biomapper", "multi-omics"]
|
|
12
|
+
classifiers = [
|
|
13
|
+
"Development Status :: 3 - Alpha",
|
|
14
|
+
"Intended Audience :: Science/Research",
|
|
15
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Programming Language :: Python :: 3.11",
|
|
19
|
+
"Programming Language :: Python :: 3.12",
|
|
20
|
+
]
|
|
21
|
+
packages = [{include = "ddharmon", from = "src"}]
|
|
22
|
+
|
|
23
|
+
[tool.poetry.dependencies]
|
|
24
|
+
python = "^3.11"
|
|
25
|
+
httpx = "^0.27"
|
|
26
|
+
pydantic = "^2.0"
|
|
27
|
+
python-dotenv = "^1.0"
|
|
28
|
+
|
|
29
|
+
[tool.poetry.extras]
|
|
30
|
+
metabolon = ["pandas", "openpyxl"]
|
|
31
|
+
notebook = ["tqdm", "nest-asyncio"]
|
|
32
|
+
all = ["pandas", "openpyxl", "tqdm", "nest-asyncio"]
|
|
33
|
+
|
|
34
|
+
[tool.poetry.group.metabolon.dependencies]
|
|
35
|
+
pandas = {version = "^2.0", optional = true}
|
|
36
|
+
openpyxl = {version = "^3.1", optional = true}
|
|
37
|
+
|
|
38
|
+
[tool.poetry.group.notebook.dependencies]
|
|
39
|
+
tqdm = {version = "^4.66", optional = true}
|
|
40
|
+
nest-asyncio = {version = "^1.6", optional = true}
|
|
41
|
+
|
|
42
|
+
[tool.poetry.group.dev.dependencies]
|
|
43
|
+
pytest = "^8.0"
|
|
44
|
+
pytest-asyncio = "^0.23"
|
|
45
|
+
pytest-cov = "^5.0"
|
|
46
|
+
respx = "^0.21"
|
|
47
|
+
ruff = "^0.4"
|
|
48
|
+
mypy = "^1.10"
|
|
49
|
+
pandas-stubs = "^2.0"
|
|
50
|
+
types-tqdm = "^4.66"
|
|
51
|
+
|
|
52
|
+
[build-system]
|
|
53
|
+
requires = ["poetry-core"]
|
|
54
|
+
build-backend = "poetry.core.masonry.api"
|
|
55
|
+
|
|
56
|
+
# ---------------------------------------------------------------------------
|
|
57
|
+
# Ruff
|
|
58
|
+
# ---------------------------------------------------------------------------
|
|
59
|
+
[tool.ruff]
|
|
60
|
+
target-version = "py311"
|
|
61
|
+
line-length = 100
|
|
62
|
+
src = ["src", "tests"]
|
|
63
|
+
|
|
64
|
+
[tool.ruff.lint]
|
|
65
|
+
select = ["E", "F", "I", "UP", "B", "SIM", "ANN"]
|
|
66
|
+
ignore = ["ANN101", "ANN102"]
|
|
67
|
+
|
|
68
|
+
[tool.ruff.lint.per-file-ignores]
|
|
69
|
+
"tests/**" = ["ANN"]
|
|
70
|
+
|
|
71
|
+
# ---------------------------------------------------------------------------
|
|
72
|
+
# Mypy
|
|
73
|
+
# ---------------------------------------------------------------------------
|
|
74
|
+
[tool.mypy]
|
|
75
|
+
python_version = "3.11"
|
|
76
|
+
strict = true
|
|
77
|
+
warn_return_any = true
|
|
78
|
+
warn_unused_configs = true
|
|
79
|
+
disallow_untyped_defs = true
|
|
80
|
+
disallow_any_generics = true
|
|
81
|
+
|
|
82
|
+
[[tool.mypy.overrides]]
|
|
83
|
+
module = ["nest_asyncio"]
|
|
84
|
+
ignore_missing_imports = true
|
|
85
|
+
|
|
86
|
+
# ---------------------------------------------------------------------------
|
|
87
|
+
# Pytest
|
|
88
|
+
# ---------------------------------------------------------------------------
|
|
89
|
+
[tool.pytest.ini_options]
|
|
90
|
+
asyncio_mode = "auto"
|
|
91
|
+
testpaths = ["tests"]
|
|
92
|
+
addopts = "--cov=ddharmon --cov-report=term-missing --cov-fail-under=80"
|