norm_toolkit 1.0.2__tar.gz → 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {norm_toolkit-1.0.2 → norm_toolkit-1.1.0}/PKG-INFO +2 -1
- {norm_toolkit-1.0.2 → norm_toolkit-1.1.0}/pyproject.toml +2 -1
- {norm_toolkit-1.0.2 → norm_toolkit-1.1.0}/src/norm_toolkit/normalizer_postgres.py +146 -159
- {norm_toolkit-1.0.2 → norm_toolkit-1.1.0}/README.md +0 -0
- {norm_toolkit-1.0.2 → norm_toolkit-1.1.0}/src/norm_toolkit/__init__.py +0 -0
- {norm_toolkit-1.0.2 → norm_toolkit-1.1.0}/src/norm_toolkit/build_merged.py +0 -0
- {norm_toolkit-1.0.2 → norm_toolkit-1.1.0}/src/norm_toolkit/build_ontology.py +0 -0
- {norm_toolkit-1.0.2 → norm_toolkit-1.1.0}/src/norm_toolkit/build_umls.py +0 -0
- {norm_toolkit-1.0.2 → norm_toolkit-1.1.0}/src/norm_toolkit/constants.py +0 -0
- {norm_toolkit-1.0.2 → norm_toolkit-1.1.0}/src/norm_toolkit/models.py +0 -0
- {norm_toolkit-1.0.2 → norm_toolkit-1.1.0}/src/norm_toolkit/normalizer.py +0 -0
- {norm_toolkit-1.0.2 → norm_toolkit-1.1.0}/src/norm_toolkit/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: norm_toolkit
|
|
3
|
-
Version: 1.0.2
|
|
3
|
+
Version: 1.1.0
|
|
4
4
|
Summary: Toolkit to normalize text to UMLS / ontologies
|
|
5
5
|
Author: Haydn Jones
|
|
6
6
|
Author-email: Haydn Jones <haydnjonest@gmail.com>
|
|
@@ -10,6 +10,7 @@ Requires-Dist: lvg-norm>=1.1.0
|
|
|
10
10
|
Requires-Dist: polars[rt64]>=1.36.1
|
|
11
11
|
Requires-Dist: pyarrow>=20.0.0
|
|
12
12
|
Requires-Dist: pydantic>=2.12.5
|
|
13
|
+
Requires-Dist: sqlalchemy>=2.0.0
|
|
13
14
|
Requires-Dist: tqdm>=4.67.1
|
|
14
15
|
Requires-Python: >=3.12
|
|
15
16
|
Description-Content-Type: text/markdown
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "norm_toolkit"
|
|
3
|
-
version = "1.0.2"
|
|
3
|
+
version = "1.1.0"
|
|
4
4
|
description = "Toolkit to normalize text to UMLS / ontologies"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
authors = [{ name = "Haydn Jones", email = "haydnjonest@gmail.com" }]
|
|
@@ -12,6 +12,7 @@ dependencies = [
|
|
|
12
12
|
"polars[rt64]>=1.36.1",
|
|
13
13
|
"pyarrow>=20.0.0",
|
|
14
14
|
"pydantic>=2.12.5",
|
|
15
|
+
"sqlalchemy>=2.0.0",
|
|
15
16
|
"tqdm>=4.67.1",
|
|
16
17
|
]
|
|
17
18
|
|
|
@@ -7,18 +7,17 @@ built by build_umls_duckdb, build_ontology_duckdb, or build_merged_duckdb.
|
|
|
7
7
|
|
|
8
8
|
from __future__ import annotations
|
|
9
9
|
|
|
10
|
-
import asyncio
|
|
11
10
|
import json
|
|
12
11
|
from collections.abc import Mapping, Sequence
|
|
13
12
|
from typing import Any
|
|
14
13
|
|
|
15
|
-
import asyncpg
|
|
16
14
|
import polars as pl
|
|
17
15
|
from lvg_norm import lvg_normalize
|
|
16
|
+
from sqlalchemy import text
|
|
17
|
+
from sqlalchemy.ext.asyncio import AsyncEngine
|
|
18
18
|
|
|
19
19
|
from norm_toolkit.constants import (
|
|
20
20
|
ATOMS_TABLE,
|
|
21
|
-
CONCEPTS_TABLE,
|
|
22
21
|
DEFAULT_PREFER_TTYS,
|
|
23
22
|
DEFS_TABLE,
|
|
24
23
|
EDGES_TABLE,
|
|
@@ -37,7 +36,7 @@ from norm_toolkit.models import ConceptInfo, SemanticType
|
|
|
37
36
|
|
|
38
37
|
class PostgresNormalizer:
|
|
39
38
|
"""
|
|
40
|
-
Async normalizer using PostgreSQL via asyncpg.
|
|
39
|
+
Async normalizer using PostgreSQL via SQLAlchemy.
|
|
41
40
|
|
|
42
41
|
Optimized for small batch processing (1-5 strings at a time).
|
|
43
42
|
Uses VALUES clauses instead of temp tables for efficiency with small batches.
|
|
@@ -45,15 +44,15 @@ class PostgresNormalizer:
|
|
|
45
44
|
|
|
46
45
|
def __init__(
|
|
47
46
|
self,
|
|
48
|
-
|
|
47
|
+
engine: AsyncEngine,
|
|
49
48
|
schema: str = "public",
|
|
50
49
|
owned_resource: Any | None = None,
|
|
51
50
|
) -> None:
|
|
52
51
|
"""
|
|
53
|
-
Initialize the normalizer with an
|
|
52
|
+
Initialize the normalizer with an SQLAlchemy AsyncEngine.
|
|
54
53
|
|
|
55
54
|
Args:
|
|
56
|
-
|
|
55
|
+
engine: SQLAlchemy AsyncEngine (caller manages lifecycle)
|
|
57
56
|
schema: PostgreSQL schema where tables are located (default: "public")
|
|
58
57
|
owned_resource: Optional resource with async close() method to clean up
|
|
59
58
|
when this normalizer is closed (e.g., AlloyDB AsyncConnector)
|
|
@@ -62,9 +61,8 @@ class PostgresNormalizer:
|
|
|
62
61
|
After creating the normalizer, call `await normalizer.initialize()`
|
|
63
62
|
to detect database capabilities before using other methods.
|
|
64
63
|
"""
|
|
65
|
-
self.
|
|
64
|
+
self._engine = engine
|
|
66
65
|
self._schema = schema
|
|
67
|
-
self._loop: asyncio.AbstractEventLoop | None = None
|
|
68
66
|
self._owned_resource = owned_resource
|
|
69
67
|
self._has_types = False
|
|
70
68
|
self._has_defs = False
|
|
@@ -77,48 +75,14 @@ class PostgresNormalizer:
|
|
|
77
75
|
self._ns_table = f"{prefix}{NS_TABLE}"
|
|
78
76
|
self._nw_table = f"{prefix}{NW_TABLE}"
|
|
79
77
|
self._atoms_table = f"{prefix}{ATOMS_TABLE}"
|
|
80
|
-
self._concepts_table = f"{prefix}{CONCEPTS_TABLE}"
|
|
81
78
|
self._types_table = f"{prefix}{TYPES_TABLE}"
|
|
82
79
|
self._defs_table = f"{prefix}{DEFS_TABLE}"
|
|
83
80
|
self._edges_table = f"{prefix}{EDGES_TABLE}"
|
|
84
81
|
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
Use this factory method for sync-only usage. The normalizer will manage
|
|
91
|
-
its own event loop and pool, allowing you to call normalize_sync().
|
|
92
|
-
|
|
93
|
-
Args:
|
|
94
|
-
dsn: PostgreSQL connection string (e.g., "postgresql://user:pass@host:5432/db")
|
|
95
|
-
schema: PostgreSQL schema where tables are located (default: "public")
|
|
96
|
-
min_size: Minimum pool connections
|
|
97
|
-
max_size: Maximum pool connections
|
|
98
|
-
|
|
99
|
-
Example:
|
|
100
|
-
>>> normalizer = PostgresNormalizer.create_sync("postgresql://...")
|
|
101
|
-
>>> result = normalizer.normalize_sync(["diabetes"])
|
|
102
|
-
>>> normalizer.close_sync()
|
|
103
|
-
"""
|
|
104
|
-
loop = asyncio.new_event_loop()
|
|
105
|
-
|
|
106
|
-
async def _create():
|
|
107
|
-
pool = await asyncpg.create_pool(dsn, min_size=min_size, max_size=max_size)
|
|
108
|
-
return pool
|
|
109
|
-
|
|
110
|
-
pool = loop.run_until_complete(_create())
|
|
111
|
-
instance = cls(pool, schema=schema)
|
|
112
|
-
instance._loop = loop
|
|
113
|
-
loop.run_until_complete(instance.initialize())
|
|
114
|
-
return instance
|
|
115
|
-
|
|
116
|
-
async def initialize(self) -> None:
|
|
117
|
-
"""
|
|
118
|
-
Detect database capabilities.
|
|
119
|
-
|
|
120
|
-
Must be called after __init__ before using normalize/concept_info methods.
|
|
121
|
-
"""
|
|
82
|
+
async def _ensure_initialized(self) -> None:
|
|
83
|
+
"""Lazily initialize on first use."""
|
|
84
|
+
if self._initialized:
|
|
85
|
+
return
|
|
122
86
|
self._has_types = await self._table_has_rows(self._types_table)
|
|
123
87
|
self._has_defs = await self._table_has_rows(self._defs_table)
|
|
124
88
|
self._has_edges = await self._table_has_rows(self._edges_table)
|
|
@@ -128,18 +92,18 @@ class PostgresNormalizer:
|
|
|
128
92
|
async def _table_has_rows(self, table: str) -> bool:
|
|
129
93
|
"""Check if a table exists and has rows."""
|
|
130
94
|
try:
|
|
131
|
-
async with self.
|
|
132
|
-
result = await
|
|
133
|
-
return result is not None
|
|
95
|
+
async with self._engine.connect() as conn:
|
|
96
|
+
result = await conn.execute(text(f"SELECT 1 FROM {table} LIMIT 1"))
|
|
97
|
+
return result.scalar() is not None
|
|
134
98
|
except Exception:
|
|
135
99
|
return False
|
|
136
100
|
|
|
137
101
|
async def _column_has_values(self, table: str, column: str) -> bool:
|
|
138
102
|
"""Check if a column has any non-null values."""
|
|
139
103
|
try:
|
|
140
|
-
async with self.
|
|
141
|
-
result = await
|
|
142
|
-
return result is not None
|
|
104
|
+
async with self._engine.connect() as conn:
|
|
105
|
+
result = await conn.execute(text(f"SELECT 1 FROM {table} WHERE {column} IS NOT NULL LIMIT 1"))
|
|
106
|
+
return result.scalar() is not None
|
|
143
107
|
except Exception:
|
|
144
108
|
return False
|
|
145
109
|
|
|
@@ -172,6 +136,8 @@ class PostgresNormalizer:
|
|
|
172
136
|
Returns:
|
|
173
137
|
DataFrame with columns: input_string, hits (list of match structs)
|
|
174
138
|
"""
|
|
139
|
+
await self._ensure_initialized()
|
|
140
|
+
|
|
175
141
|
if prefer_ttys is None:
|
|
176
142
|
prefer_ttys = DEFAULT_PREFER_TTYS
|
|
177
143
|
|
|
@@ -223,15 +189,18 @@ class PostgresNormalizer:
|
|
|
223
189
|
{"hits": pl.List(HIT_STRUCT_TYPE)}
|
|
224
190
|
)
|
|
225
191
|
|
|
226
|
-
# Build parameters and VALUES clauses
|
|
227
|
-
params:
|
|
192
|
+
# Build parameters and VALUES clauses using named parameters
|
|
193
|
+
params: dict[str, Any] = {}
|
|
194
|
+
param_idx = 0
|
|
228
195
|
|
|
229
196
|
# qmap VALUES clause
|
|
230
197
|
qmap_placeholders = []
|
|
231
198
|
for q, nstr in qmap_rows:
|
|
232
|
-
|
|
233
|
-
params
|
|
234
|
-
|
|
199
|
+
q_key, nstr_key = f"p{param_idx}", f"p{param_idx + 1}"
|
|
200
|
+
params[q_key] = q
|
|
201
|
+
params[nstr_key] = nstr
|
|
202
|
+
qmap_placeholders.append(f"(:{q_key}, :{nstr_key})")
|
|
203
|
+
param_idx += 2
|
|
235
204
|
qmap_values = ", ".join(qmap_placeholders)
|
|
236
205
|
|
|
237
206
|
# qwords VALUES clause (for partial path)
|
|
@@ -240,36 +209,58 @@ class PostgresNormalizer:
|
|
|
240
209
|
qwords_rows = [(q, n, w) for q, n in qmap_rows for w in dict.fromkeys(n.split()) if w]
|
|
241
210
|
qwords_placeholders = []
|
|
242
211
|
for q, nstr, nwd in qwords_rows:
|
|
243
|
-
|
|
244
|
-
params
|
|
245
|
-
|
|
212
|
+
q_key, nstr_key, nwd_key = f"p{param_idx}", f"p{param_idx + 1}", f"p{param_idx + 2}"
|
|
213
|
+
params[q_key] = q
|
|
214
|
+
params[nstr_key] = nstr
|
|
215
|
+
params[nwd_key] = nwd
|
|
216
|
+
qwords_placeholders.append(f"(:{q_key}, :{nstr_key}, :{nwd_key})")
|
|
217
|
+
param_idx += 3
|
|
246
218
|
qwords_values = ", ".join(qwords_placeholders)
|
|
247
219
|
|
|
248
220
|
# allq VALUES clause (preserve order)
|
|
249
221
|
allq_placeholders = []
|
|
250
222
|
for q in all_queries:
|
|
251
|
-
|
|
252
|
-
params
|
|
253
|
-
allq_placeholders.append(f"(
|
|
223
|
+
q_key = f"p{param_idx}"
|
|
224
|
+
params[q_key] = q
|
|
225
|
+
allq_placeholders.append(f"(:{q_key})")
|
|
226
|
+
param_idx += 1
|
|
254
227
|
allq_values = ", ".join(allq_placeholders)
|
|
255
228
|
|
|
256
|
-
# Build preference clauses
|
|
229
|
+
# Build preference clauses (parameterized to prevent SQL injection)
|
|
257
230
|
tty_join = ""
|
|
258
231
|
tty_bump_expr = "0"
|
|
259
232
|
if prefer_ttys:
|
|
260
|
-
|
|
233
|
+
tty_placeholders = []
|
|
234
|
+
for tty in prefer_ttys:
|
|
235
|
+
key = f"p{param_idx}"
|
|
236
|
+
params[key] = tty
|
|
237
|
+
tty_placeholders.append(f"(:{key})")
|
|
238
|
+
param_idx += 1
|
|
239
|
+
tty_vals = ", ".join(tty_placeholders)
|
|
261
240
|
tty_join = f"LEFT JOIN (VALUES {tty_vals}) AS pt(tty) ON a.name_type = pt.tty"
|
|
262
241
|
tty_bump_expr = "CASE WHEN pt.tty IS NULL THEN 0 ELSE 1 END"
|
|
263
242
|
|
|
264
|
-
# Source filtering
|
|
243
|
+
# Source filtering (parameterized to prevent SQL injection)
|
|
265
244
|
source_filter_exprs = []
|
|
266
245
|
nw_filter_clauses = []
|
|
267
246
|
if filter_sources:
|
|
268
|
-
|
|
247
|
+
filt_placeholders = []
|
|
248
|
+
for src in filter_sources:
|
|
249
|
+
key = f"p{param_idx}"
|
|
250
|
+
params[key] = src
|
|
251
|
+
filt_placeholders.append(f":{key}")
|
|
252
|
+
param_idx += 1
|
|
253
|
+
filt_vals = ", ".join(filt_placeholders)
|
|
269
254
|
source_filter_exprs.append(f"a.source IN ({filt_vals})")
|
|
270
255
|
nw_filter_clauses.append(f"nw.source IN ({filt_vals})")
|
|
271
256
|
if exclude_sources:
|
|
272
|
-
|
|
257
|
+
excl_placeholders = []
|
|
258
|
+
for src in exclude_sources:
|
|
259
|
+
key = f"p{param_idx}"
|
|
260
|
+
params[key] = src
|
|
261
|
+
excl_placeholders.append(f":{key}")
|
|
262
|
+
param_idx += 1
|
|
263
|
+
excl_vals = ", ".join(excl_placeholders)
|
|
273
264
|
source_filter_exprs.append(f"a.source NOT IN ({excl_vals})")
|
|
274
265
|
nw_filter_clauses.append(f"nw.source NOT IN ({excl_vals})")
|
|
275
266
|
nw_filter_clause = (" AND " + " AND ".join(nw_filter_clauses)) if nw_filter_clauses else ""
|
|
@@ -447,15 +438,22 @@ FROM allq aq
|
|
|
447
438
|
LEFT JOIN agg ON agg.Q = aq.Q;
|
|
448
439
|
"""
|
|
449
440
|
|
|
450
|
-
async with self.
|
|
451
|
-
|
|
441
|
+
async with self._engine.connect() as conn:
|
|
442
|
+
result = await conn.execute(text(sql), params)
|
|
443
|
+
rows = result.mappings().all()
|
|
452
444
|
|
|
453
|
-
# Parse
|
|
445
|
+
# Parse results into Polars DataFrame
|
|
446
|
+
# Note: asyncpg auto-deserializes JSON, so hits may already be a list
|
|
454
447
|
data = []
|
|
455
448
|
for row in rows:
|
|
456
449
|
input_string = row["input_string"]
|
|
457
|
-
|
|
458
|
-
|
|
450
|
+
hits_raw = row["hits"]
|
|
451
|
+
if hits_raw is None:
|
|
452
|
+
hits = []
|
|
453
|
+
elif isinstance(hits_raw, list):
|
|
454
|
+
hits = hits_raw # Already deserialized by asyncpg
|
|
455
|
+
else:
|
|
456
|
+
hits = json.loads(hits_raw) # String, needs parsing
|
|
459
457
|
data.append({"input_string": input_string, "hits": hits})
|
|
460
458
|
|
|
461
459
|
return pl.DataFrame(data).cast({"hits": pl.List(HIT_STRUCT_TYPE)})
|
|
@@ -477,6 +475,8 @@ LEFT JOIN agg ON agg.Q = aq.Q;
|
|
|
477
475
|
Returns:
|
|
478
476
|
Dict mapping concept_id to ConceptInfo
|
|
479
477
|
"""
|
|
478
|
+
await self._ensure_initialized()
|
|
479
|
+
|
|
480
480
|
if not concept_ids:
|
|
481
481
|
return {}
|
|
482
482
|
|
|
@@ -500,20 +500,28 @@ LEFT JOIN agg ON agg.Q = aq.Q;
|
|
|
500
500
|
semantic_types=[],
|
|
501
501
|
)
|
|
502
502
|
|
|
503
|
-
# Build idmap VALUES clause
|
|
504
|
-
params:
|
|
503
|
+
# Build idmap VALUES clause using named parameters
|
|
504
|
+
params: dict[str, Any] = {}
|
|
505
|
+
param_idx = 0
|
|
505
506
|
idmap_placeholders = []
|
|
506
507
|
for cid in id_list:
|
|
507
|
-
|
|
508
|
-
params
|
|
509
|
-
idmap_placeholders.append(f"(
|
|
508
|
+
key = f"p{param_idx}"
|
|
509
|
+
params[key] = cid
|
|
510
|
+
idmap_placeholders.append(f"(:{key})")
|
|
511
|
+
param_idx += 1
|
|
510
512
|
idmap_values = ", ".join(idmap_placeholders)
|
|
511
513
|
|
|
512
514
|
# Build preference clauses
|
|
513
515
|
tty_join = ""
|
|
514
516
|
tty_bump = "0"
|
|
515
517
|
if prefer_ttys:
|
|
516
|
-
|
|
518
|
+
tty_placeholders = []
|
|
519
|
+
for tty in prefer_ttys:
|
|
520
|
+
key = f"p{param_idx}"
|
|
521
|
+
params[key] = tty
|
|
522
|
+
tty_placeholders.append(f"(:{key})")
|
|
523
|
+
param_idx += 1
|
|
524
|
+
tty_vals = ", ".join(tty_placeholders)
|
|
517
525
|
tty_join = f"LEFT JOIN (VALUES {tty_vals}) AS pt(tty) ON a.name_type = pt.tty"
|
|
518
526
|
tty_bump = "CASE WHEN pt.tty IS NULL THEN 0 ELSE 1 END"
|
|
519
527
|
|
|
@@ -591,8 +599,9 @@ LEFT JOIN syn_agg sa ON sa.concept_id = c.concept_id
|
|
|
591
599
|
ORDER BY c.concept_id;
|
|
592
600
|
"""
|
|
593
601
|
|
|
594
|
-
async with self.
|
|
595
|
-
|
|
602
|
+
async with self._engine.connect() as conn:
|
|
603
|
+
result = await conn.execute(text(sql), params)
|
|
604
|
+
rows = result.mappings().all()
|
|
596
605
|
|
|
597
606
|
for row in rows:
|
|
598
607
|
cid = row["concept_id"]
|
|
@@ -625,18 +634,26 @@ ORDER BY c.concept_id;
|
|
|
625
634
|
prefer_def_sources: list[str] | None,
|
|
626
635
|
) -> None:
|
|
627
636
|
"""Populate definitions for concepts."""
|
|
628
|
-
params:
|
|
637
|
+
params: dict[str, Any] = {}
|
|
638
|
+
param_idx = 0
|
|
629
639
|
idmap_placeholders = []
|
|
630
640
|
for cid in id_list:
|
|
631
|
-
|
|
632
|
-
params
|
|
633
|
-
idmap_placeholders.append(f"(
|
|
641
|
+
key = f"p{param_idx}"
|
|
642
|
+
params[key] = cid
|
|
643
|
+
idmap_placeholders.append(f"(:{key})")
|
|
644
|
+
param_idx += 1
|
|
634
645
|
idmap_values = ", ".join(idmap_placeholders)
|
|
635
646
|
|
|
636
647
|
def_pref_join = ""
|
|
637
648
|
def_pref_bump = "0"
|
|
638
649
|
if prefer_def_sources:
|
|
639
|
-
|
|
650
|
+
def_placeholders = []
|
|
651
|
+
for src in prefer_def_sources:
|
|
652
|
+
key = f"p{param_idx}"
|
|
653
|
+
params[key] = src
|
|
654
|
+
def_placeholders.append(f"(:{key})")
|
|
655
|
+
param_idx += 1
|
|
656
|
+
def_vals = ", ".join(def_placeholders)
|
|
640
657
|
def_pref_join = f"LEFT JOIN (VALUES {def_vals}) AS pds(sab) ON d.source = pds.sab"
|
|
641
658
|
def_pref_bump = "CASE WHEN pds.sab IS NULL THEN 0 ELSE 1 END"
|
|
642
659
|
|
|
@@ -665,8 +682,9 @@ FROM def_best
|
|
|
665
682
|
WHERE drn = 1;
|
|
666
683
|
"""
|
|
667
684
|
|
|
668
|
-
async with self.
|
|
669
|
-
|
|
685
|
+
async with self._engine.connect() as conn:
|
|
686
|
+
result = await conn.execute(text(sql), params)
|
|
687
|
+
rows = result.mappings().all()
|
|
670
688
|
|
|
671
689
|
for row in rows:
|
|
672
690
|
cid = row["concept_id"]
|
|
@@ -680,12 +698,12 @@ WHERE drn = 1;
|
|
|
680
698
|
id_list: list[str],
|
|
681
699
|
) -> None:
|
|
682
700
|
"""Populate semantic types for concepts."""
|
|
683
|
-
params:
|
|
701
|
+
params: dict[str, Any] = {}
|
|
684
702
|
idmap_placeholders = []
|
|
685
|
-
for cid in id_list:
|
|
686
|
-
|
|
687
|
-
params
|
|
688
|
-
idmap_placeholders.append(f"(
|
|
703
|
+
for i, cid in enumerate(id_list):
|
|
704
|
+
key = f"p{i}"
|
|
705
|
+
params[key] = cid
|
|
706
|
+
idmap_placeholders.append(f"(:{key})")
|
|
689
707
|
idmap_values = ", ".join(idmap_placeholders)
|
|
690
708
|
|
|
691
709
|
sql = f"""
|
|
@@ -696,8 +714,9 @@ JOIN idmap c ON c.concept_id = t.concept_id
|
|
|
696
714
|
ORDER BY t.concept_id, t.type_tree, t.type_id;
|
|
697
715
|
"""
|
|
698
716
|
|
|
699
|
-
async with self.
|
|
700
|
-
|
|
717
|
+
async with self._engine.connect() as conn:
|
|
718
|
+
result = await conn.execute(text(sql), params)
|
|
719
|
+
rows = result.mappings().all()
|
|
701
720
|
|
|
702
721
|
for row in rows:
|
|
703
722
|
cid = row["concept_id"]
|
|
@@ -713,17 +732,19 @@ ORDER BY t.concept_id, t.type_tree, t.type_id;
|
|
|
713
732
|
|
|
714
733
|
Returns dict mapping concept_id to list of {"tui": ..., "sty": ...}
|
|
715
734
|
"""
|
|
735
|
+
await self._ensure_initialized()
|
|
736
|
+
|
|
716
737
|
if not self._has_types or not concept_ids:
|
|
717
738
|
return {cid: [] for cid in concept_ids}
|
|
718
739
|
|
|
719
740
|
id_list = list(dict.fromkeys(concept_ids))
|
|
720
741
|
|
|
721
|
-
params:
|
|
742
|
+
params: dict[str, Any] = {}
|
|
722
743
|
idmap_placeholders = []
|
|
723
|
-
for cid in id_list:
|
|
724
|
-
|
|
725
|
-
params
|
|
726
|
-
idmap_placeholders.append(f"(
|
|
744
|
+
for i, cid in enumerate(id_list):
|
|
745
|
+
key = f"p{i}"
|
|
746
|
+
params[key] = cid
|
|
747
|
+
idmap_placeholders.append(f"(:{key})")
|
|
727
748
|
idmap_values = ", ".join(idmap_placeholders)
|
|
728
749
|
|
|
729
750
|
sql = f"""
|
|
@@ -734,8 +755,9 @@ JOIN idmap c ON c.concept_id = t.concept_id
|
|
|
734
755
|
ORDER BY t.concept_id, t.type_tree, t.type_id;
|
|
735
756
|
"""
|
|
736
757
|
|
|
737
|
-
async with self.
|
|
738
|
-
|
|
758
|
+
async with self._engine.connect() as conn:
|
|
759
|
+
result = await conn.execute(text(sql), params)
|
|
760
|
+
rows = result.mappings().all()
|
|
739
761
|
|
|
740
762
|
res: dict[str, list[dict[str, str]]] = {cid: [] for cid in id_list}
|
|
741
763
|
for row in rows:
|
|
@@ -762,90 +784,55 @@ ORDER BY t.concept_id, t.type_tree, t.type_id;
|
|
|
762
784
|
Returns:
|
|
763
785
|
List of descendant concept IDs (excludes the starting concept)
|
|
764
786
|
"""
|
|
787
|
+
await self._ensure_initialized()
|
|
788
|
+
|
|
765
789
|
if not self._has_edges:
|
|
766
790
|
return []
|
|
767
791
|
|
|
792
|
+
params: dict[str, Any] = {"concept_id": concept_id, "max_depth": max_depth}
|
|
793
|
+
|
|
768
794
|
# Build source filter clause
|
|
769
795
|
source_filter = ""
|
|
770
796
|
if filter_sources:
|
|
771
|
-
|
|
797
|
+
src_placeholders = []
|
|
798
|
+
for i, src in enumerate(filter_sources):
|
|
799
|
+
key = f"src{i}"
|
|
800
|
+
params[key] = src
|
|
801
|
+
src_placeholders.append(f":{key}")
|
|
802
|
+
sources_sql = ", ".join(src_placeholders)
|
|
772
803
|
source_filter = f" AND e.source IN ({sources_sql})"
|
|
773
804
|
|
|
774
|
-
# PostgreSQL recursive CTE
|
|
805
|
+
# PostgreSQL recursive CTE with named parameters
|
|
806
|
+
# Use CAST() instead of :: to avoid conflicts with SQLAlchemy named params
|
|
775
807
|
query = f"""
|
|
776
808
|
WITH RECURSIVE walk(concept_id, depth) AS (
|
|
777
|
-
SELECT
|
|
809
|
+
SELECT CAST(:concept_id AS VARCHAR), 0
|
|
778
810
|
|
|
779
811
|
UNION ALL
|
|
780
812
|
|
|
781
813
|
SELECT e.child_id, w.depth + 1
|
|
782
814
|
FROM walk w
|
|
783
815
|
JOIN {self._edges_table} e ON e.parent_id = w.concept_id
|
|
784
|
-
WHERE (
|
|
816
|
+
WHERE (CAST(:max_depth AS INTEGER) IS NULL OR w.depth < :max_depth){source_filter}
|
|
785
817
|
)
|
|
786
818
|
SELECT DISTINCT concept_id
|
|
787
819
|
FROM walk
|
|
788
|
-
WHERE concept_id !=
|
|
820
|
+
WHERE concept_id != :concept_id
|
|
789
821
|
"""
|
|
790
822
|
|
|
791
|
-
async with self.
|
|
792
|
-
|
|
823
|
+
async with self._engine.connect() as conn:
|
|
824
|
+
result = await conn.execute(text(query), params)
|
|
825
|
+
rows = result.mappings().all()
|
|
793
826
|
|
|
794
827
|
return [r["concept_id"] for r in rows]
|
|
795
828
|
|
|
796
|
-
def normalize_sync(
|
|
797
|
-
self,
|
|
798
|
-
strings: Sequence[str],
|
|
799
|
-
top_k: int = 25,
|
|
800
|
-
prefer_ttys: list[str] | None = None,
|
|
801
|
-
filter_sources: list[str] | None = None,
|
|
802
|
-
exclude_sources: list[str] | None = None,
|
|
803
|
-
allow_partial: bool = True,
|
|
804
|
-
min_coverage: float = 0.6,
|
|
805
|
-
min_word_hits: int | None = None,
|
|
806
|
-
coverage_weight: int = 25,
|
|
807
|
-
) -> pl.DataFrame:
|
|
808
|
-
"""
|
|
809
|
-
Synchronous wrapper around normalize().
|
|
810
|
-
|
|
811
|
-
Requires the normalizer to be created with create_sync() factory method.
|
|
812
|
-
"""
|
|
813
|
-
if self._loop is None:
|
|
814
|
-
raise RuntimeError("normalize_sync() requires normalizer created with create_sync()")
|
|
815
|
-
|
|
816
|
-
return self._loop.run_until_complete(
|
|
817
|
-
self.normalize(
|
|
818
|
-
strings=strings,
|
|
819
|
-
top_k=top_k,
|
|
820
|
-
prefer_ttys=prefer_ttys,
|
|
821
|
-
filter_sources=filter_sources,
|
|
822
|
-
exclude_sources=exclude_sources,
|
|
823
|
-
allow_partial=allow_partial,
|
|
824
|
-
min_coverage=min_coverage,
|
|
825
|
-
min_word_hits=min_word_hits,
|
|
826
|
-
coverage_weight=coverage_weight,
|
|
827
|
-
)
|
|
828
|
-
)
|
|
829
|
-
|
|
830
829
|
async def close(self) -> None:
|
|
831
830
|
"""
|
|
832
|
-
Close the
|
|
831
|
+
Close the engine and any owned resources.
|
|
833
832
|
|
|
834
|
-
Note: Only call this if you want to close the
|
|
833
|
+
Note: Only call this if you want to close the engine. If the engine
|
|
835
834
|
is managed externally, the caller should close it instead.
|
|
836
835
|
"""
|
|
837
|
-
await self.
|
|
836
|
+
await self._engine.dispose()
|
|
838
837
|
if self._owned_resource is not None:
|
|
839
838
|
await self._owned_resource.close()
|
|
840
|
-
|
|
841
|
-
def close_sync(self) -> None:
|
|
842
|
-
"""
|
|
843
|
-
Synchronously close the connection pool and event loop.
|
|
844
|
-
|
|
845
|
-
Use this when the normalizer was created with create_sync().
|
|
846
|
-
"""
|
|
847
|
-
if self._loop is None:
|
|
848
|
-
raise RuntimeError("close_sync() requires normalizer created with create_sync()")
|
|
849
|
-
|
|
850
|
-
self._loop.run_until_complete(self._pool.close())
|
|
851
|
-
self._loop.close()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|