norm_toolkit 1.2.0__tar.gz → 1.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: norm_toolkit
3
- Version: 1.2.0
3
+ Version: 1.3.0
4
4
  Summary: Toolkit to normalize text to UMLS / ontologies
5
5
  Author: Haydn Jones
6
6
  Author-email: Haydn Jones <haydnjonest@gmail.com>
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "norm_toolkit"
3
- version = "1.2.0"
3
+ version = "1.3.0"
4
4
  description = "Toolkit to normalize text to UMLS / ontologies"
5
5
  readme = "README.md"
6
6
  authors = [{ name = "Haydn Jones", email = "haydnjonest@gmail.com" }]
@@ -38,6 +38,9 @@ HIT_STRUCT_TYPE = pl.Struct(
38
38
  "score": pl.Int64,
39
39
  "total_score": pl.Int64,
40
40
  "match_type": pl.Utf8,
41
+ "pref_name": pl.Utf8,
42
+ "description": pl.Utf8,
43
+ "synonyms": pl.List(pl.Utf8),
41
44
  }
42
45
  )
43
46
 
@@ -171,10 +171,13 @@ class PostgresNormalizer:
171
171
  coverage_weight=coverage_weight,
172
172
  )
173
173
 
174
+ # Enrich hits with concept info (pref_name, description, synonyms)
175
+ result = await self._enrich_hits_with_concept_info(result, prefer_ttys)
176
+
174
177
  # Add synonyms column if synonyms were provided
175
178
  if synonyms:
176
179
  syn_list = [list(synonyms.get(s, [])) for s in strings]
177
- result = result.with_columns(pl.Series("synonyms", syn_list))
180
+ result = result.with_columns(pl.Series("input_synonyms", syn_list))
178
181
 
179
182
  return result
180
183
 
@@ -476,6 +479,58 @@ LEFT JOIN agg ON agg.Q = aq.Q;
476
479
 
477
480
  return pl.DataFrame(data).cast({"hits": pl.List(HIT_STRUCT_TYPE)})
478
481
 
482
async def _enrich_hits_with_concept_info(
    self,
    result: pl.DataFrame,
    prefer_ttys: list[str] | None,
) -> pl.DataFrame:
    """Enrich hits with pref_name, description, and synonyms from concept_info.

    Every hit in the ``hits`` column of ``result`` is copied and given three
    extra keys: ``pref_name``, ``description`` and ``synonyms``. The values
    come from ``self.concept_info``, looked up by the hit's
    ``global_identifier``. Hits without a usable identifier, or whose
    identifier is absent from the lookup result, get ``None`` / ``None`` /
    ``[]``.

    Args:
        result: Frame with an ``input_string`` column and a ``hits`` column
            holding a list of hit structs per row.
        prefer_ttys: Forwarded unchanged to ``self.concept_info``.

    Returns:
        A new frame built from ``input_string`` and the enriched ``hits``
        only, with ``hits`` cast to ``pl.List(HIT_STRUCT_TYPE)``.
    """
    # Collect the unique identifiers to look up. Falsy identifiers (None, "")
    # are skipped here because the enrichment loop below ignores them too;
    # sending them to concept_info would only add a useless lookup key.
    all_concept_ids: set[str] = set()
    for hits in result["hits"].to_list():
        for hit in hits or []:
            cid = hit.get("global_identifier") if hit else None
            if cid:
                all_concept_ids.add(cid)

    # With nothing to look up, skip the call and use an empty mapping so the
    # single loop below fills in the empty defaults for every hit.
    if all_concept_ids:
        concept_infos = await self.concept_info(list(all_concept_ids), prefer_ttys=prefer_ttys)
    else:
        concept_infos = {}

    enriched_data = []
    for row in result.iter_rows(named=True):
        enriched_hits = []
        for hit in row["hits"] or []:
            enriched_hit = dict(hit)
            cid = hit.get("global_identifier")
            if cid and cid in concept_infos:
                info = concept_infos[cid]
                enriched_hit["pref_name"] = info.preferred_name
                enriched_hit["description"] = info.description
                # Normalise a missing synonym list to an empty list.
                enriched_hit["synonyms"] = info.synonyms or []
            else:
                enriched_hit["pref_name"] = None
                enriched_hit["description"] = None
                enriched_hit["synonyms"] = []
            enriched_hits.append(enriched_hit)
        enriched_data.append({"input_string": row["input_string"], "hits": enriched_hits})

    return pl.DataFrame(enriched_data).cast({"hits": pl.List(HIT_STRUCT_TYPE)})
533
+
479
534
  async def concept_info(
480
535
  self,
481
536
  concept_ids: Sequence[str],
File without changes