norm_toolkit 1.0.1__tar.gz → 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {norm_toolkit-1.0.1 → norm_toolkit-1.1.0}/PKG-INFO +2 -1
- {norm_toolkit-1.0.1 → norm_toolkit-1.1.0}/pyproject.toml +2 -1
- {norm_toolkit-1.0.1 → norm_toolkit-1.1.0}/src/norm_toolkit/normalizer_postgres.py +157 -159
- {norm_toolkit-1.0.1 → norm_toolkit-1.1.0}/README.md +0 -0
- {norm_toolkit-1.0.1 → norm_toolkit-1.1.0}/src/norm_toolkit/__init__.py +0 -0
- {norm_toolkit-1.0.1 → norm_toolkit-1.1.0}/src/norm_toolkit/build_merged.py +0 -0
- {norm_toolkit-1.0.1 → norm_toolkit-1.1.0}/src/norm_toolkit/build_ontology.py +0 -0
- {norm_toolkit-1.0.1 → norm_toolkit-1.1.0}/src/norm_toolkit/build_umls.py +0 -0
- {norm_toolkit-1.0.1 → norm_toolkit-1.1.0}/src/norm_toolkit/constants.py +0 -0
- {norm_toolkit-1.0.1 → norm_toolkit-1.1.0}/src/norm_toolkit/models.py +0 -0
- {norm_toolkit-1.0.1 → norm_toolkit-1.1.0}/src/norm_toolkit/normalizer.py +0 -0
- {norm_toolkit-1.0.1 → norm_toolkit-1.1.0}/src/norm_toolkit/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: norm_toolkit
|
|
3
|
-
Version: 1.0.1
|
|
3
|
+
Version: 1.1.0
|
|
4
4
|
Summary: Toolkit to normalize text to UMLS / ontologies
|
|
5
5
|
Author: Haydn Jones
|
|
6
6
|
Author-email: Haydn Jones <haydnjonest@gmail.com>
|
|
@@ -10,6 +10,7 @@ Requires-Dist: lvg-norm>=1.1.0
|
|
|
10
10
|
Requires-Dist: polars[rt64]>=1.36.1
|
|
11
11
|
Requires-Dist: pyarrow>=20.0.0
|
|
12
12
|
Requires-Dist: pydantic>=2.12.5
|
|
13
|
+
Requires-Dist: sqlalchemy>=2.0.0
|
|
13
14
|
Requires-Dist: tqdm>=4.67.1
|
|
14
15
|
Requires-Python: >=3.12
|
|
15
16
|
Description-Content-Type: text/markdown
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "norm_toolkit"
|
|
3
|
-
version = "1.0.1"
|
|
3
|
+
version = "1.1.0"
|
|
4
4
|
description = "Toolkit to normalize text to UMLS / ontologies"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
authors = [{ name = "Haydn Jones", email = "haydnjonest@gmail.com" }]
|
|
@@ -12,6 +12,7 @@ dependencies = [
|
|
|
12
12
|
"polars[rt64]>=1.36.1",
|
|
13
13
|
"pyarrow>=20.0.0",
|
|
14
14
|
"pydantic>=2.12.5",
|
|
15
|
+
"sqlalchemy>=2.0.0",
|
|
15
16
|
"tqdm>=4.67.1",
|
|
16
17
|
]
|
|
17
18
|
|
|
@@ -7,17 +7,17 @@ built by build_umls_duckdb, build_ontology_duckdb, or build_merged_duckdb.
|
|
|
7
7
|
|
|
8
8
|
from __future__ import annotations
|
|
9
9
|
|
|
10
|
-
import asyncio
|
|
11
10
|
import json
|
|
12
11
|
from collections.abc import Mapping, Sequence
|
|
12
|
+
from typing import Any
|
|
13
13
|
|
|
14
|
-
import asyncpg
|
|
15
14
|
import polars as pl
|
|
16
15
|
from lvg_norm import lvg_normalize
|
|
16
|
+
from sqlalchemy import text
|
|
17
|
+
from sqlalchemy.ext.asyncio import AsyncEngine
|
|
17
18
|
|
|
18
19
|
from norm_toolkit.constants import (
|
|
19
20
|
ATOMS_TABLE,
|
|
20
|
-
CONCEPTS_TABLE,
|
|
21
21
|
DEFAULT_PREFER_TTYS,
|
|
22
22
|
DEFS_TABLE,
|
|
23
23
|
EDGES_TABLE,
|
|
@@ -36,27 +36,34 @@ from norm_toolkit.models import ConceptInfo, SemanticType
|
|
|
36
36
|
|
|
37
37
|
class PostgresNormalizer:
|
|
38
38
|
"""
|
|
39
|
-
Async normalizer using PostgreSQL via asyncpg.
|
|
39
|
+
Async normalizer using PostgreSQL via SQLAlchemy.
|
|
40
40
|
|
|
41
41
|
Optimized for small batch processing (1-5 strings at a time).
|
|
42
42
|
Uses VALUES clauses instead of temp tables for efficiency with small batches.
|
|
43
43
|
"""
|
|
44
44
|
|
|
45
|
-
def __init__(
|
|
45
|
+
def __init__(
|
|
46
|
+
self,
|
|
47
|
+
engine: AsyncEngine,
|
|
48
|
+
schema: str = "public",
|
|
49
|
+
owned_resource: Any | None = None,
|
|
50
|
+
) -> None:
|
|
46
51
|
"""
|
|
47
|
-
Initialize the normalizer with an
|
|
52
|
+
Initialize the normalizer with an SQLAlchemy AsyncEngine.
|
|
48
53
|
|
|
49
54
|
Args:
|
|
50
|
-
|
|
55
|
+
engine: SQLAlchemy AsyncEngine (caller manages lifecycle)
|
|
51
56
|
schema: PostgreSQL schema where tables are located (default: "public")
|
|
57
|
+
owned_resource: Optional resource with async close() method to clean up
|
|
58
|
+
when this normalizer is closed (e.g., AlloyDB AsyncConnector)
|
|
52
59
|
|
|
53
60
|
Note:
|
|
54
61
|
After creating the normalizer, call `await normalizer.initialize()`
|
|
55
62
|
to detect database capabilities before using other methods.
|
|
56
63
|
"""
|
|
57
|
-
self.
|
|
64
|
+
self._engine = engine
|
|
58
65
|
self._schema = schema
|
|
59
|
-
self.
|
|
66
|
+
self._owned_resource = owned_resource
|
|
60
67
|
self._has_types = False
|
|
61
68
|
self._has_defs = False
|
|
62
69
|
self._has_edges = False
|
|
@@ -68,48 +75,14 @@ class PostgresNormalizer:
|
|
|
68
75
|
self._ns_table = f"{prefix}{NS_TABLE}"
|
|
69
76
|
self._nw_table = f"{prefix}{NW_TABLE}"
|
|
70
77
|
self._atoms_table = f"{prefix}{ATOMS_TABLE}"
|
|
71
|
-
self._concepts_table = f"{prefix}{CONCEPTS_TABLE}"
|
|
72
78
|
self._types_table = f"{prefix}{TYPES_TABLE}"
|
|
73
79
|
self._defs_table = f"{prefix}{DEFS_TABLE}"
|
|
74
80
|
self._edges_table = f"{prefix}{EDGES_TABLE}"
|
|
75
81
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
Use this factory method for sync-only usage. The normalizer will manage
|
|
82
|
-
its own event loop and pool, allowing you to call normalize_sync().
|
|
83
|
-
|
|
84
|
-
Args:
|
|
85
|
-
dsn: PostgreSQL connection string (e.g., "postgresql://user:pass@host:5432/db")
|
|
86
|
-
schema: PostgreSQL schema where tables are located (default: "public")
|
|
87
|
-
min_size: Minimum pool connections
|
|
88
|
-
max_size: Maximum pool connections
|
|
89
|
-
|
|
90
|
-
Example:
|
|
91
|
-
>>> normalizer = PostgresNormalizer.create_sync("postgresql://...")
|
|
92
|
-
>>> result = normalizer.normalize_sync(["diabetes"])
|
|
93
|
-
>>> normalizer.close_sync()
|
|
94
|
-
"""
|
|
95
|
-
loop = asyncio.new_event_loop()
|
|
96
|
-
|
|
97
|
-
async def _create():
|
|
98
|
-
pool = await asyncpg.create_pool(dsn, min_size=min_size, max_size=max_size)
|
|
99
|
-
return pool
|
|
100
|
-
|
|
101
|
-
pool = loop.run_until_complete(_create())
|
|
102
|
-
instance = cls(pool, schema=schema)
|
|
103
|
-
instance._loop = loop
|
|
104
|
-
loop.run_until_complete(instance.initialize())
|
|
105
|
-
return instance
|
|
106
|
-
|
|
107
|
-
async def initialize(self) -> None:
|
|
108
|
-
"""
|
|
109
|
-
Detect database capabilities.
|
|
110
|
-
|
|
111
|
-
Must be called after __init__ before using normalize/concept_info methods.
|
|
112
|
-
"""
|
|
82
|
+
async def _ensure_initialized(self) -> None:
|
|
83
|
+
"""Lazily initialize on first use."""
|
|
84
|
+
if self._initialized:
|
|
85
|
+
return
|
|
113
86
|
self._has_types = await self._table_has_rows(self._types_table)
|
|
114
87
|
self._has_defs = await self._table_has_rows(self._defs_table)
|
|
115
88
|
self._has_edges = await self._table_has_rows(self._edges_table)
|
|
@@ -119,18 +92,18 @@ class PostgresNormalizer:
|
|
|
119
92
|
async def _table_has_rows(self, table: str) -> bool:
|
|
120
93
|
"""Check if a table exists and has rows."""
|
|
121
94
|
try:
|
|
122
|
-
async with self.
|
|
123
|
-
result = await
|
|
124
|
-
return result is not None
|
|
95
|
+
async with self._engine.connect() as conn:
|
|
96
|
+
result = await conn.execute(text(f"SELECT 1 FROM {table} LIMIT 1"))
|
|
97
|
+
return result.scalar() is not None
|
|
125
98
|
except Exception:
|
|
126
99
|
return False
|
|
127
100
|
|
|
128
101
|
async def _column_has_values(self, table: str, column: str) -> bool:
|
|
129
102
|
"""Check if a column has any non-null values."""
|
|
130
103
|
try:
|
|
131
|
-
async with self.
|
|
132
|
-
result = await
|
|
133
|
-
return result is not None
|
|
104
|
+
async with self._engine.connect() as conn:
|
|
105
|
+
result = await conn.execute(text(f"SELECT 1 FROM {table} WHERE {column} IS NOT NULL LIMIT 1"))
|
|
106
|
+
return result.scalar() is not None
|
|
134
107
|
except Exception:
|
|
135
108
|
return False
|
|
136
109
|
|
|
@@ -163,6 +136,8 @@ class PostgresNormalizer:
|
|
|
163
136
|
Returns:
|
|
164
137
|
DataFrame with columns: input_string, hits (list of match structs)
|
|
165
138
|
"""
|
|
139
|
+
await self._ensure_initialized()
|
|
140
|
+
|
|
166
141
|
if prefer_ttys is None:
|
|
167
142
|
prefer_ttys = DEFAULT_PREFER_TTYS
|
|
168
143
|
|
|
@@ -214,15 +189,18 @@ class PostgresNormalizer:
|
|
|
214
189
|
{"hits": pl.List(HIT_STRUCT_TYPE)}
|
|
215
190
|
)
|
|
216
191
|
|
|
217
|
-
# Build parameters and VALUES clauses
|
|
218
|
-
params:
|
|
192
|
+
# Build parameters and VALUES clauses using named parameters
|
|
193
|
+
params: dict[str, Any] = {}
|
|
194
|
+
param_idx = 0
|
|
219
195
|
|
|
220
196
|
# qmap VALUES clause
|
|
221
197
|
qmap_placeholders = []
|
|
222
198
|
for q, nstr in qmap_rows:
|
|
223
|
-
|
|
224
|
-
params
|
|
225
|
-
|
|
199
|
+
q_key, nstr_key = f"p{param_idx}", f"p{param_idx + 1}"
|
|
200
|
+
params[q_key] = q
|
|
201
|
+
params[nstr_key] = nstr
|
|
202
|
+
qmap_placeholders.append(f"(:{q_key}, :{nstr_key})")
|
|
203
|
+
param_idx += 2
|
|
226
204
|
qmap_values = ", ".join(qmap_placeholders)
|
|
227
205
|
|
|
228
206
|
# qwords VALUES clause (for partial path)
|
|
@@ -231,36 +209,58 @@ class PostgresNormalizer:
|
|
|
231
209
|
qwords_rows = [(q, n, w) for q, n in qmap_rows for w in dict.fromkeys(n.split()) if w]
|
|
232
210
|
qwords_placeholders = []
|
|
233
211
|
for q, nstr, nwd in qwords_rows:
|
|
234
|
-
|
|
235
|
-
params
|
|
236
|
-
|
|
212
|
+
q_key, nstr_key, nwd_key = f"p{param_idx}", f"p{param_idx + 1}", f"p{param_idx + 2}"
|
|
213
|
+
params[q_key] = q
|
|
214
|
+
params[nstr_key] = nstr
|
|
215
|
+
params[nwd_key] = nwd
|
|
216
|
+
qwords_placeholders.append(f"(:{q_key}, :{nstr_key}, :{nwd_key})")
|
|
217
|
+
param_idx += 3
|
|
237
218
|
qwords_values = ", ".join(qwords_placeholders)
|
|
238
219
|
|
|
239
220
|
# allq VALUES clause (preserve order)
|
|
240
221
|
allq_placeholders = []
|
|
241
222
|
for q in all_queries:
|
|
242
|
-
|
|
243
|
-
params
|
|
244
|
-
allq_placeholders.append(f"(
|
|
223
|
+
q_key = f"p{param_idx}"
|
|
224
|
+
params[q_key] = q
|
|
225
|
+
allq_placeholders.append(f"(:{q_key})")
|
|
226
|
+
param_idx += 1
|
|
245
227
|
allq_values = ", ".join(allq_placeholders)
|
|
246
228
|
|
|
247
|
-
# Build preference clauses
|
|
229
|
+
# Build preference clauses (parameterized to prevent SQL injection)
|
|
248
230
|
tty_join = ""
|
|
249
231
|
tty_bump_expr = "0"
|
|
250
232
|
if prefer_ttys:
|
|
251
|
-
|
|
233
|
+
tty_placeholders = []
|
|
234
|
+
for tty in prefer_ttys:
|
|
235
|
+
key = f"p{param_idx}"
|
|
236
|
+
params[key] = tty
|
|
237
|
+
tty_placeholders.append(f"(:{key})")
|
|
238
|
+
param_idx += 1
|
|
239
|
+
tty_vals = ", ".join(tty_placeholders)
|
|
252
240
|
tty_join = f"LEFT JOIN (VALUES {tty_vals}) AS pt(tty) ON a.name_type = pt.tty"
|
|
253
241
|
tty_bump_expr = "CASE WHEN pt.tty IS NULL THEN 0 ELSE 1 END"
|
|
254
242
|
|
|
255
|
-
# Source filtering
|
|
243
|
+
# Source filtering (parameterized to prevent SQL injection)
|
|
256
244
|
source_filter_exprs = []
|
|
257
245
|
nw_filter_clauses = []
|
|
258
246
|
if filter_sources:
|
|
259
|
-
|
|
247
|
+
filt_placeholders = []
|
|
248
|
+
for src in filter_sources:
|
|
249
|
+
key = f"p{param_idx}"
|
|
250
|
+
params[key] = src
|
|
251
|
+
filt_placeholders.append(f":{key}")
|
|
252
|
+
param_idx += 1
|
|
253
|
+
filt_vals = ", ".join(filt_placeholders)
|
|
260
254
|
source_filter_exprs.append(f"a.source IN ({filt_vals})")
|
|
261
255
|
nw_filter_clauses.append(f"nw.source IN ({filt_vals})")
|
|
262
256
|
if exclude_sources:
|
|
263
|
-
|
|
257
|
+
excl_placeholders = []
|
|
258
|
+
for src in exclude_sources:
|
|
259
|
+
key = f"p{param_idx}"
|
|
260
|
+
params[key] = src
|
|
261
|
+
excl_placeholders.append(f":{key}")
|
|
262
|
+
param_idx += 1
|
|
263
|
+
excl_vals = ", ".join(excl_placeholders)
|
|
264
264
|
source_filter_exprs.append(f"a.source NOT IN ({excl_vals})")
|
|
265
265
|
nw_filter_clauses.append(f"nw.source NOT IN ({excl_vals})")
|
|
266
266
|
nw_filter_clause = (" AND " + " AND ".join(nw_filter_clauses)) if nw_filter_clauses else ""
|
|
@@ -438,15 +438,22 @@ FROM allq aq
|
|
|
438
438
|
LEFT JOIN agg ON agg.Q = aq.Q;
|
|
439
439
|
"""
|
|
440
440
|
|
|
441
|
-
async with self.
|
|
442
|
-
|
|
441
|
+
async with self._engine.connect() as conn:
|
|
442
|
+
result = await conn.execute(text(sql), params)
|
|
443
|
+
rows = result.mappings().all()
|
|
443
444
|
|
|
444
|
-
# Parse
|
|
445
|
+
# Parse results into Polars DataFrame
|
|
446
|
+
# Note: asyncpg auto-deserializes JSON, so hits may already be a list
|
|
445
447
|
data = []
|
|
446
448
|
for row in rows:
|
|
447
449
|
input_string = row["input_string"]
|
|
448
|
-
|
|
449
|
-
|
|
450
|
+
hits_raw = row["hits"]
|
|
451
|
+
if hits_raw is None:
|
|
452
|
+
hits = []
|
|
453
|
+
elif isinstance(hits_raw, list):
|
|
454
|
+
hits = hits_raw # Already deserialized by asyncpg
|
|
455
|
+
else:
|
|
456
|
+
hits = json.loads(hits_raw) # String, needs parsing
|
|
450
457
|
data.append({"input_string": input_string, "hits": hits})
|
|
451
458
|
|
|
452
459
|
return pl.DataFrame(data).cast({"hits": pl.List(HIT_STRUCT_TYPE)})
|
|
@@ -468,6 +475,8 @@ LEFT JOIN agg ON agg.Q = aq.Q;
|
|
|
468
475
|
Returns:
|
|
469
476
|
Dict mapping concept_id to ConceptInfo
|
|
470
477
|
"""
|
|
478
|
+
await self._ensure_initialized()
|
|
479
|
+
|
|
471
480
|
if not concept_ids:
|
|
472
481
|
return {}
|
|
473
482
|
|
|
@@ -491,20 +500,28 @@ LEFT JOIN agg ON agg.Q = aq.Q;
|
|
|
491
500
|
semantic_types=[],
|
|
492
501
|
)
|
|
493
502
|
|
|
494
|
-
# Build idmap VALUES clause
|
|
495
|
-
params:
|
|
503
|
+
# Build idmap VALUES clause using named parameters
|
|
504
|
+
params: dict[str, Any] = {}
|
|
505
|
+
param_idx = 0
|
|
496
506
|
idmap_placeholders = []
|
|
497
507
|
for cid in id_list:
|
|
498
|
-
|
|
499
|
-
params
|
|
500
|
-
idmap_placeholders.append(f"(
|
|
508
|
+
key = f"p{param_idx}"
|
|
509
|
+
params[key] = cid
|
|
510
|
+
idmap_placeholders.append(f"(:{key})")
|
|
511
|
+
param_idx += 1
|
|
501
512
|
idmap_values = ", ".join(idmap_placeholders)
|
|
502
513
|
|
|
503
514
|
# Build preference clauses
|
|
504
515
|
tty_join = ""
|
|
505
516
|
tty_bump = "0"
|
|
506
517
|
if prefer_ttys:
|
|
507
|
-
|
|
518
|
+
tty_placeholders = []
|
|
519
|
+
for tty in prefer_ttys:
|
|
520
|
+
key = f"p{param_idx}"
|
|
521
|
+
params[key] = tty
|
|
522
|
+
tty_placeholders.append(f"(:{key})")
|
|
523
|
+
param_idx += 1
|
|
524
|
+
tty_vals = ", ".join(tty_placeholders)
|
|
508
525
|
tty_join = f"LEFT JOIN (VALUES {tty_vals}) AS pt(tty) ON a.name_type = pt.tty"
|
|
509
526
|
tty_bump = "CASE WHEN pt.tty IS NULL THEN 0 ELSE 1 END"
|
|
510
527
|
|
|
@@ -582,8 +599,9 @@ LEFT JOIN syn_agg sa ON sa.concept_id = c.concept_id
|
|
|
582
599
|
ORDER BY c.concept_id;
|
|
583
600
|
"""
|
|
584
601
|
|
|
585
|
-
async with self.
|
|
586
|
-
|
|
602
|
+
async with self._engine.connect() as conn:
|
|
603
|
+
result = await conn.execute(text(sql), params)
|
|
604
|
+
rows = result.mappings().all()
|
|
587
605
|
|
|
588
606
|
for row in rows:
|
|
589
607
|
cid = row["concept_id"]
|
|
@@ -616,18 +634,26 @@ ORDER BY c.concept_id;
|
|
|
616
634
|
prefer_def_sources: list[str] | None,
|
|
617
635
|
) -> None:
|
|
618
636
|
"""Populate definitions for concepts."""
|
|
619
|
-
params:
|
|
637
|
+
params: dict[str, Any] = {}
|
|
638
|
+
param_idx = 0
|
|
620
639
|
idmap_placeholders = []
|
|
621
640
|
for cid in id_list:
|
|
622
|
-
|
|
623
|
-
params
|
|
624
|
-
idmap_placeholders.append(f"(
|
|
641
|
+
key = f"p{param_idx}"
|
|
642
|
+
params[key] = cid
|
|
643
|
+
idmap_placeholders.append(f"(:{key})")
|
|
644
|
+
param_idx += 1
|
|
625
645
|
idmap_values = ", ".join(idmap_placeholders)
|
|
626
646
|
|
|
627
647
|
def_pref_join = ""
|
|
628
648
|
def_pref_bump = "0"
|
|
629
649
|
if prefer_def_sources:
|
|
630
|
-
|
|
650
|
+
def_placeholders = []
|
|
651
|
+
for src in prefer_def_sources:
|
|
652
|
+
key = f"p{param_idx}"
|
|
653
|
+
params[key] = src
|
|
654
|
+
def_placeholders.append(f"(:{key})")
|
|
655
|
+
param_idx += 1
|
|
656
|
+
def_vals = ", ".join(def_placeholders)
|
|
631
657
|
def_pref_join = f"LEFT JOIN (VALUES {def_vals}) AS pds(sab) ON d.source = pds.sab"
|
|
632
658
|
def_pref_bump = "CASE WHEN pds.sab IS NULL THEN 0 ELSE 1 END"
|
|
633
659
|
|
|
@@ -656,8 +682,9 @@ FROM def_best
|
|
|
656
682
|
WHERE drn = 1;
|
|
657
683
|
"""
|
|
658
684
|
|
|
659
|
-
async with self.
|
|
660
|
-
|
|
685
|
+
async with self._engine.connect() as conn:
|
|
686
|
+
result = await conn.execute(text(sql), params)
|
|
687
|
+
rows = result.mappings().all()
|
|
661
688
|
|
|
662
689
|
for row in rows:
|
|
663
690
|
cid = row["concept_id"]
|
|
@@ -671,12 +698,12 @@ WHERE drn = 1;
|
|
|
671
698
|
id_list: list[str],
|
|
672
699
|
) -> None:
|
|
673
700
|
"""Populate semantic types for concepts."""
|
|
674
|
-
params:
|
|
701
|
+
params: dict[str, Any] = {}
|
|
675
702
|
idmap_placeholders = []
|
|
676
|
-
for cid in id_list:
|
|
677
|
-
|
|
678
|
-
params
|
|
679
|
-
idmap_placeholders.append(f"(
|
|
703
|
+
for i, cid in enumerate(id_list):
|
|
704
|
+
key = f"p{i}"
|
|
705
|
+
params[key] = cid
|
|
706
|
+
idmap_placeholders.append(f"(:{key})")
|
|
680
707
|
idmap_values = ", ".join(idmap_placeholders)
|
|
681
708
|
|
|
682
709
|
sql = f"""
|
|
@@ -687,8 +714,9 @@ JOIN idmap c ON c.concept_id = t.concept_id
|
|
|
687
714
|
ORDER BY t.concept_id, t.type_tree, t.type_id;
|
|
688
715
|
"""
|
|
689
716
|
|
|
690
|
-
async with self.
|
|
691
|
-
|
|
717
|
+
async with self._engine.connect() as conn:
|
|
718
|
+
result = await conn.execute(text(sql), params)
|
|
719
|
+
rows = result.mappings().all()
|
|
692
720
|
|
|
693
721
|
for row in rows:
|
|
694
722
|
cid = row["concept_id"]
|
|
@@ -704,17 +732,19 @@ ORDER BY t.concept_id, t.type_tree, t.type_id;
|
|
|
704
732
|
|
|
705
733
|
Returns dict mapping concept_id to list of {"tui": ..., "sty": ...}
|
|
706
734
|
"""
|
|
735
|
+
await self._ensure_initialized()
|
|
736
|
+
|
|
707
737
|
if not self._has_types or not concept_ids:
|
|
708
738
|
return {cid: [] for cid in concept_ids}
|
|
709
739
|
|
|
710
740
|
id_list = list(dict.fromkeys(concept_ids))
|
|
711
741
|
|
|
712
|
-
params:
|
|
742
|
+
params: dict[str, Any] = {}
|
|
713
743
|
idmap_placeholders = []
|
|
714
|
-
for cid in id_list:
|
|
715
|
-
|
|
716
|
-
params
|
|
717
|
-
idmap_placeholders.append(f"(
|
|
744
|
+
for i, cid in enumerate(id_list):
|
|
745
|
+
key = f"p{i}"
|
|
746
|
+
params[key] = cid
|
|
747
|
+
idmap_placeholders.append(f"(:{key})")
|
|
718
748
|
idmap_values = ", ".join(idmap_placeholders)
|
|
719
749
|
|
|
720
750
|
sql = f"""
|
|
@@ -725,8 +755,9 @@ JOIN idmap c ON c.concept_id = t.concept_id
|
|
|
725
755
|
ORDER BY t.concept_id, t.type_tree, t.type_id;
|
|
726
756
|
"""
|
|
727
757
|
|
|
728
|
-
async with self.
|
|
729
|
-
|
|
758
|
+
async with self._engine.connect() as conn:
|
|
759
|
+
result = await conn.execute(text(sql), params)
|
|
760
|
+
rows = result.mappings().all()
|
|
730
761
|
|
|
731
762
|
res: dict[str, list[dict[str, str]]] = {cid: [] for cid in id_list}
|
|
732
763
|
for row in rows:
|
|
@@ -753,88 +784,55 @@ ORDER BY t.concept_id, t.type_tree, t.type_id;
|
|
|
753
784
|
Returns:
|
|
754
785
|
List of descendant concept IDs (excludes the starting concept)
|
|
755
786
|
"""
|
|
787
|
+
await self._ensure_initialized()
|
|
788
|
+
|
|
756
789
|
if not self._has_edges:
|
|
757
790
|
return []
|
|
758
791
|
|
|
792
|
+
params: dict[str, Any] = {"concept_id": concept_id, "max_depth": max_depth}
|
|
793
|
+
|
|
759
794
|
# Build source filter clause
|
|
760
795
|
source_filter = ""
|
|
761
796
|
if filter_sources:
|
|
762
|
-
|
|
797
|
+
src_placeholders = []
|
|
798
|
+
for i, src in enumerate(filter_sources):
|
|
799
|
+
key = f"src{i}"
|
|
800
|
+
params[key] = src
|
|
801
|
+
src_placeholders.append(f":{key}")
|
|
802
|
+
sources_sql = ", ".join(src_placeholders)
|
|
763
803
|
source_filter = f" AND e.source IN ({sources_sql})"
|
|
764
804
|
|
|
765
|
-
# PostgreSQL recursive CTE
|
|
805
|
+
# PostgreSQL recursive CTE with named parameters
|
|
806
|
+
# Use CAST() instead of :: to avoid conflicts with SQLAlchemy named params
|
|
766
807
|
query = f"""
|
|
767
808
|
WITH RECURSIVE walk(concept_id, depth) AS (
|
|
768
|
-
SELECT
|
|
809
|
+
SELECT CAST(:concept_id AS VARCHAR), 0
|
|
769
810
|
|
|
770
811
|
UNION ALL
|
|
771
812
|
|
|
772
813
|
SELECT e.child_id, w.depth + 1
|
|
773
814
|
FROM walk w
|
|
774
815
|
JOIN {self._edges_table} e ON e.parent_id = w.concept_id
|
|
775
|
-
WHERE (
|
|
816
|
+
WHERE (CAST(:max_depth AS INTEGER) IS NULL OR w.depth < :max_depth){source_filter}
|
|
776
817
|
)
|
|
777
818
|
SELECT DISTINCT concept_id
|
|
778
819
|
FROM walk
|
|
779
|
-
WHERE concept_id !=
|
|
820
|
+
WHERE concept_id != :concept_id
|
|
780
821
|
"""
|
|
781
822
|
|
|
782
|
-
async with self.
|
|
783
|
-
|
|
823
|
+
async with self._engine.connect() as conn:
|
|
824
|
+
result = await conn.execute(text(query), params)
|
|
825
|
+
rows = result.mappings().all()
|
|
784
826
|
|
|
785
827
|
return [r["concept_id"] for r in rows]
|
|
786
828
|
|
|
787
|
-
def normalize_sync(
|
|
788
|
-
self,
|
|
789
|
-
strings: Sequence[str],
|
|
790
|
-
top_k: int = 25,
|
|
791
|
-
prefer_ttys: list[str] | None = None,
|
|
792
|
-
filter_sources: list[str] | None = None,
|
|
793
|
-
exclude_sources: list[str] | None = None,
|
|
794
|
-
allow_partial: bool = True,
|
|
795
|
-
min_coverage: float = 0.6,
|
|
796
|
-
min_word_hits: int | None = None,
|
|
797
|
-
coverage_weight: int = 25,
|
|
798
|
-
) -> pl.DataFrame:
|
|
799
|
-
"""
|
|
800
|
-
Synchronous wrapper around normalize().
|
|
801
|
-
|
|
802
|
-
Requires the normalizer to be created with create_sync() factory method.
|
|
803
|
-
"""
|
|
804
|
-
if self._loop is None:
|
|
805
|
-
raise RuntimeError("normalize_sync() requires normalizer created with create_sync()")
|
|
806
|
-
|
|
807
|
-
return self._loop.run_until_complete(
|
|
808
|
-
self.normalize(
|
|
809
|
-
strings=strings,
|
|
810
|
-
top_k=top_k,
|
|
811
|
-
prefer_ttys=prefer_ttys,
|
|
812
|
-
filter_sources=filter_sources,
|
|
813
|
-
exclude_sources=exclude_sources,
|
|
814
|
-
allow_partial=allow_partial,
|
|
815
|
-
min_coverage=min_coverage,
|
|
816
|
-
min_word_hits=min_word_hits,
|
|
817
|
-
coverage_weight=coverage_weight,
|
|
818
|
-
)
|
|
819
|
-
)
|
|
820
|
-
|
|
821
829
|
async def close(self) -> None:
|
|
822
830
|
"""
|
|
823
|
-
Close the
|
|
831
|
+
Close the engine and any owned resources.
|
|
824
832
|
|
|
825
|
-
Note: Only call this if you want to close the
|
|
833
|
+
Note: Only call this if you want to close the engine. If the engine
|
|
826
834
|
is managed externally, the caller should close it instead.
|
|
827
835
|
"""
|
|
828
|
-
await self.
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
"""
|
|
832
|
-
Synchronously close the connection pool and event loop.
|
|
833
|
-
|
|
834
|
-
Use this when the normalizer was created with create_sync().
|
|
835
|
-
"""
|
|
836
|
-
if self._loop is None:
|
|
837
|
-
raise RuntimeError("close_sync() requires normalizer created with create_sync()")
|
|
838
|
-
|
|
839
|
-
self._loop.run_until_complete(self._pool.close())
|
|
840
|
-
self._loop.close()
|
|
836
|
+
await self._engine.dispose()
|
|
837
|
+
if self._owned_resource is not None:
|
|
838
|
+
await self._owned_resource.close()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|