norm_toolkit 1.2.0__tar.gz → 1.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {norm_toolkit-1.2.0 → norm_toolkit-1.3.0}/PKG-INFO +1 -1
- {norm_toolkit-1.2.0 → norm_toolkit-1.3.0}/pyproject.toml +1 -1
- {norm_toolkit-1.2.0 → norm_toolkit-1.3.0}/src/norm_toolkit/constants.py +3 -0
- {norm_toolkit-1.2.0 → norm_toolkit-1.3.0}/src/norm_toolkit/normalizer_postgres.py +56 -1
- {norm_toolkit-1.2.0 → norm_toolkit-1.3.0}/README.md +0 -0
- {norm_toolkit-1.2.0 → norm_toolkit-1.3.0}/src/norm_toolkit/__init__.py +0 -0
- {norm_toolkit-1.2.0 → norm_toolkit-1.3.0}/src/norm_toolkit/build_merged.py +0 -0
- {norm_toolkit-1.2.0 → norm_toolkit-1.3.0}/src/norm_toolkit/build_ontology.py +0 -0
- {norm_toolkit-1.2.0 → norm_toolkit-1.3.0}/src/norm_toolkit/build_umls.py +0 -0
- {norm_toolkit-1.2.0 → norm_toolkit-1.3.0}/src/norm_toolkit/models.py +0 -0
- {norm_toolkit-1.2.0 → norm_toolkit-1.3.0}/src/norm_toolkit/normalizer.py +0 -0
- {norm_toolkit-1.2.0 → norm_toolkit-1.3.0}/src/norm_toolkit/utils.py +0 -0
|
@@ -171,10 +171,13 @@ class PostgresNormalizer:
|
|
|
171
171
|
coverage_weight=coverage_weight,
|
|
172
172
|
)
|
|
173
173
|
|
|
174
|
+
# Enrich hits with concept info (pref_name, description, synonyms)
|
|
175
|
+
result = await self._enrich_hits_with_concept_info(result, prefer_ttys)
|
|
176
|
+
|
|
174
177
|
# Add synonyms column if synonyms were provided
|
|
175
178
|
if synonyms:
|
|
176
179
|
syn_list = [list(synonyms.get(s, [])) for s in strings]
|
|
177
|
-
result = result.with_columns(pl.Series("
|
|
180
|
+
result = result.with_columns(pl.Series("input_synonyms", syn_list))
|
|
178
181
|
|
|
179
182
|
return result
|
|
180
183
|
|
|
@@ -476,6 +479,58 @@ LEFT JOIN agg ON agg.Q = aq.Q;
|
|
|
476
479
|
|
|
477
480
|
return pl.DataFrame(data).cast({"hits": pl.List(HIT_STRUCT_TYPE)})
|
|
478
481
|
|
|
482
|
+
async def _enrich_hits_with_concept_info(
    self,
    result: pl.DataFrame,
    prefer_ttys: list[str] | None,
) -> pl.DataFrame:
    """Enrich hits with pref_name, description, and synonyms from concept_info.

    Every hit in the ``hits`` column gains three keys: ``pref_name``,
    ``description`` and ``synonyms``. Hits whose ``global_identifier`` is
    missing, empty, or absent from the lookup result get ``None``, ``None``
    and ``[]`` respectively.

    Args:
        result: Frame with an ``input_string`` column and a ``hits`` column
            whose cells are lists of hit mappings (or null).
        prefer_ttys: Passed through unchanged to ``self.concept_info``.

    Returns:
        A frame rebuilt from ``input_string`` and the enriched ``hits``,
        with ``hits`` cast to ``pl.List(HIT_STRUCT_TYPE)``. A frame with no
        rows is returned as-is.
    """
    # Rebuilding a frame from an empty list of rows would produce a frame
    # without a "hits" column to cast, so hand an empty input straight back.
    if result.height == 0:
        return result

    rows = list(result.iter_rows(named=True))

    # Collect the distinct, usable concept ids referenced by any hit.
    # Null hits and null/empty identifiers are skipped so they are never
    # sent to the concept lookup.
    all_concept_ids: set[str] = set()
    for row in rows:
        for hit in row["hits"] or []:
            if not hit:
                continue
            cid = hit.get("global_identifier")
            if cid:
                all_concept_ids.add(cid)

    # Only query when there is something to look up; an empty mapping makes
    # every hit fall through to the "no info" defaults below.
    if all_concept_ids:
        concept_infos = await self.concept_info(
            list(all_concept_ids), prefer_ttys=prefer_ttys
        )
    else:
        concept_infos = {}

    enriched_data = []
    for row in rows:
        enriched_hits = []
        for hit in row["hits"] or []:
            if hit is None:
                # A null entry carries nothing to enrich; dict(None) would raise.
                continue
            enriched_hit = dict(hit)
            cid = hit.get("global_identifier")
            if cid and cid in concept_infos:
                info = concept_infos[cid]
                enriched_hit["pref_name"] = info.preferred_name
                enriched_hit["description"] = info.description
                enriched_hit["synonyms"] = info.synonyms or []
            else:
                enriched_hit["pref_name"] = None
                enriched_hit["description"] = None
                enriched_hit["synonyms"] = []
            enriched_hits.append(enriched_hit)
        enriched_data.append(
            {"input_string": row["input_string"], "hits": enriched_hits}
        )

    return pl.DataFrame(enriched_data).cast({"hits": pl.List(HIT_STRUCT_TYPE)})
|
|
533
|
+
|
|
479
534
|
async def concept_info(
|
|
480
535
|
self,
|
|
481
536
|
concept_ids: Sequence[str],
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|