norm_toolkit 1.1.0__tar.gz → 1.2.0__tar.gz
This diff shows the changes between two publicly released versions of a package, as those versions appear in the supported public registry they were published to. It is provided for informational purposes only.
- {norm_toolkit-1.1.0 → norm_toolkit-1.2.0}/PKG-INFO +1 -1
- {norm_toolkit-1.1.0 → norm_toolkit-1.2.0}/pyproject.toml +1 -1
- {norm_toolkit-1.1.0 → norm_toolkit-1.2.0}/src/norm_toolkit/normalizer.py +20 -2
- {norm_toolkit-1.1.0 → norm_toolkit-1.2.0}/src/norm_toolkit/normalizer_postgres.py +23 -3
- {norm_toolkit-1.1.0 → norm_toolkit-1.2.0}/README.md +0 -0
- {norm_toolkit-1.1.0 → norm_toolkit-1.2.0}/src/norm_toolkit/__init__.py +0 -0
- {norm_toolkit-1.1.0 → norm_toolkit-1.2.0}/src/norm_toolkit/build_merged.py +0 -0
- {norm_toolkit-1.1.0 → norm_toolkit-1.2.0}/src/norm_toolkit/build_ontology.py +0 -0
- {norm_toolkit-1.1.0 → norm_toolkit-1.2.0}/src/norm_toolkit/build_umls.py +0 -0
- {norm_toolkit-1.1.0 → norm_toolkit-1.2.0}/src/norm_toolkit/constants.py +0 -0
- {norm_toolkit-1.1.0 → norm_toolkit-1.2.0}/src/norm_toolkit/models.py +0 -0
- {norm_toolkit-1.1.0 → norm_toolkit-1.2.0}/src/norm_toolkit/utils.py +0 -0
|
@@ -324,6 +324,7 @@ LEFT JOIN agg ON agg.Q = aq.Q;
|
|
|
324
324
|
def normalize(
|
|
325
325
|
self,
|
|
326
326
|
strings: Sequence[str],
|
|
327
|
+
synonyms: Mapping[str, Sequence[str]] | None = None,
|
|
327
328
|
top_k: int = 25,
|
|
328
329
|
prefer_ttys: list[str] | None = None,
|
|
329
330
|
filter_sources: list[str] | None = None,
|
|
@@ -338,6 +339,10 @@ LEFT JOIN agg ON agg.Q = aq.Q;
|
|
|
338
339
|
|
|
339
340
|
Args:
|
|
340
341
|
strings: Input strings to normalize
|
|
342
|
+
synonyms: Optional mapping of input strings to their synonyms.
|
|
343
|
+
Synonyms are normalized and used alongside the main string
|
|
344
|
+
to improve matching. Results are still keyed by the original
|
|
345
|
+
input string.
|
|
341
346
|
top_k: Maximum number of results per query
|
|
342
347
|
prefer_ttys: Term types to prefer (e.g., ["PT", "MH"])
|
|
343
348
|
filter_sources: Restrict to these sources (include only)
|
|
@@ -348,7 +353,8 @@ LEFT JOIN agg ON agg.Q = aq.Q;
|
|
|
348
353
|
coverage_weight: Weight for coverage in scoring
|
|
349
354
|
|
|
350
355
|
Returns:
|
|
351
|
-
DataFrame with columns: input_string, hits (list of match structs)
|
|
356
|
+
DataFrame with columns: input_string, hits (list of match structs),
|
|
357
|
+
and synonyms (list of strings) if a non-empty synonyms mapping was provided.
|
|
352
358
|
"""
|
|
353
359
|
# Apply defaults
|
|
354
360
|
if prefer_ttys is None:
|
|
@@ -358,9 +364,14 @@ LEFT JOIN agg ON agg.Q = aq.Q;
|
|
|
358
364
|
q_to_nstrs: dict[str, list[str]] = {}
|
|
359
365
|
for s in strings:
|
|
360
366
|
nstrs = list(lvg_normalize(s) or [])
|
|
367
|
+
# Add normalized forms of synonyms
|
|
368
|
+
if synonyms and s in synonyms:
|
|
369
|
+
for syn in synonyms[s]:
|
|
370
|
+
syn_nstrs = list(lvg_normalize(syn) or [])
|
|
371
|
+
nstrs.extend(syn_nstrs)
|
|
361
372
|
q_to_nstrs[s] = nstrs
|
|
362
373
|
|
|
363
|
-
|
|
374
|
+
result = self._lookup(
|
|
364
375
|
q_to_nstrs=q_to_nstrs,
|
|
365
376
|
all_queries=list(strings),
|
|
366
377
|
prefer_ttys=prefer_ttys,
|
|
@@ -373,6 +384,13 @@ LEFT JOIN agg ON agg.Q = aq.Q;
|
|
|
373
384
|
coverage_weight=coverage_weight,
|
|
374
385
|
)
|
|
375
386
|
|
|
387
|
+
# Add synonyms column if synonyms were provided
|
|
388
|
+
if synonyms:
|
|
389
|
+
syn_list = [list(synonyms.get(s, [])) for s in strings]
|
|
390
|
+
result = result.with_columns(pl.Series("synonyms", syn_list))
|
|
391
|
+
|
|
392
|
+
return result
|
|
393
|
+
|
|
376
394
|
def concept_info(
|
|
377
395
|
self,
|
|
378
396
|
concept_ids: Sequence[str],
|
|
@@ -110,6 +110,7 @@ class PostgresNormalizer:
|
|
|
110
110
|
async def normalize(
|
|
111
111
|
self,
|
|
112
112
|
strings: Sequence[str],
|
|
113
|
+
synonyms: Mapping[str, Sequence[str]] | None = None,
|
|
113
114
|
top_k: int = 25,
|
|
114
115
|
prefer_ttys: list[str] | None = None,
|
|
115
116
|
filter_sources: list[str] | None = None,
|
|
@@ -124,6 +125,10 @@ class PostgresNormalizer:
|
|
|
124
125
|
|
|
125
126
|
Args:
|
|
126
127
|
strings: Input strings to normalize
|
|
128
|
+
synonyms: Optional mapping of input strings to their synonyms.
|
|
129
|
+
Synonyms are normalized and used alongside the main string
|
|
130
|
+
to improve matching. Results are still keyed by the original
|
|
131
|
+
input string.
|
|
127
132
|
top_k: Maximum number of results per query
|
|
128
133
|
prefer_ttys: Term types to prefer (e.g., ["PT", "MH"])
|
|
129
134
|
filter_sources: Restrict to these sources (include only)
|
|
@@ -134,7 +139,8 @@ class PostgresNormalizer:
|
|
|
134
139
|
coverage_weight: Weight for coverage in scoring
|
|
135
140
|
|
|
136
141
|
Returns:
|
|
137
|
-
DataFrame with columns: input_string, hits (list of match structs)
|
|
142
|
+
DataFrame with columns: input_string, hits (list of match structs),
|
|
143
|
+
and synonyms (list of strings) if a non-empty synonyms mapping was provided.
|
|
138
144
|
"""
|
|
139
145
|
await self._ensure_initialized()
|
|
140
146
|
|
|
@@ -145,9 +151,14 @@ class PostgresNormalizer:
|
|
|
145
151
|
q_to_nstrs: dict[str, list[str]] = {}
|
|
146
152
|
for s in strings:
|
|
147
153
|
nstrs = list(lvg_normalize(s) or [])
|
|
154
|
+
# Add normalized forms of synonyms
|
|
155
|
+
if synonyms and s in synonyms:
|
|
156
|
+
for syn in synonyms[s]:
|
|
157
|
+
syn_nstrs = list(lvg_normalize(syn) or [])
|
|
158
|
+
nstrs.extend(syn_nstrs)
|
|
148
159
|
q_to_nstrs[s] = nstrs
|
|
149
160
|
|
|
150
|
-
|
|
161
|
+
result = await self._lookup(
|
|
151
162
|
q_to_nstrs=q_to_nstrs,
|
|
152
163
|
all_queries=list(strings),
|
|
153
164
|
prefer_ttys=prefer_ttys,
|
|
@@ -160,6 +171,13 @@ class PostgresNormalizer:
|
|
|
160
171
|
coverage_weight=coverage_weight,
|
|
161
172
|
)
|
|
162
173
|
|
|
174
|
+
# Add synonyms column if synonyms were provided
|
|
175
|
+
if synonyms:
|
|
176
|
+
syn_list = [list(synonyms.get(s, [])) for s in strings]
|
|
177
|
+
result = result.with_columns(pl.Series("synonyms", syn_list))
|
|
178
|
+
|
|
179
|
+
return result
|
|
180
|
+
|
|
163
181
|
async def _lookup(
|
|
164
182
|
self,
|
|
165
183
|
q_to_nstrs: Mapping[str, Sequence[str]],
|
|
@@ -804,11 +822,13 @@ ORDER BY t.concept_id, t.type_tree, t.type_id;
|
|
|
804
822
|
|
|
805
823
|
# PostgreSQL recursive CTE with named parameters
|
|
806
824
|
# Use CAST() instead of :: to avoid conflicts with SQLAlchemy named params
|
|
825
|
+
# UNION (not UNION ALL) deduplicates on (concept_id, depth) during recursion
|
|
826
|
+
# DISTINCT in the output is needed since the same concept can be reached at different depths
|
|
807
827
|
query = f"""
|
|
808
828
|
WITH RECURSIVE walk(concept_id, depth) AS (
|
|
809
829
|
SELECT CAST(:concept_id AS VARCHAR), 0
|
|
810
830
|
|
|
811
|
-
UNION
|
|
831
|
+
UNION
|
|
812
832
|
|
|
813
833
|
SELECT e.child_id, w.depth + 1
|
|
814
834
|
FROM walk w
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|