norm_toolkit 1.9.0__tar.gz → 1.9.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/PKG-INFO +1 -1
  2. {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/pyproject.toml +1 -1
  3. {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v2/clickhouse_backend.py +24 -0
  4. {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v2/duckdb_backend.py +23 -0
  5. {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v2/normalizer_base.py +89 -0
  6. {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v2/normalizer_utils.py +43 -0
  7. {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v2/schema.py +4 -1
  8. {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/README.md +0 -0
  9. {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/__init__.py +0 -0
  10. {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v1/__init__.py +0 -0
  11. {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v1/build_merged.py +0 -0
  12. {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v1/build_ontology.py +0 -0
  13. {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v1/build_umls.py +0 -0
  14. {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v1/constants.py +0 -0
  15. {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v1/models.py +0 -0
  16. {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v1/normalizer.py +0 -0
  17. {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v1/normalizer_cache.py +0 -0
  18. {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v1/normalizer_postgres.py +0 -0
  19. {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v1/normalizer_utils.py +0 -0
  20. {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v1/utils.py +0 -0
  21. {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v2/__init__.py +0 -0
  22. {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v2/build_merged.py +0 -0
  23. {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v2/clickhouse_common.py +0 -0
  24. {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v2/clickhouse_upload.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: norm_toolkit
3
- Version: 1.9.0
3
+ Version: 1.9.1
4
4
  Summary: Toolkit to normalize text to UMLS / ontologies
5
5
  Author: Haydn Jones
6
6
  Author-email: Haydn Jones <haydnjonest@gmail.com>
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "norm_toolkit"
3
- version = "1.9.0"
3
+ version = "1.9.1"
4
4
  description = "Toolkit to normalize text to UMLS / ontologies"
5
5
  readme = "README.md"
6
6
  authors = [{ name = "Haydn Jones", email = "haydnjonest@gmail.com" }]
@@ -38,6 +38,7 @@ from .normalizer_utils import (
38
38
  build_lookup_hit_columns,
39
39
  build_lookup_scored_cte,
40
40
  build_narrower_concepts_sql,
41
+ build_narrower_identifiers_map_sql,
41
42
  build_ontology_filter_clauses,
42
43
  build_query_rows,
43
44
  build_semantic_types_sql,
@@ -542,6 +543,29 @@ class ClickHouseNormalizer(BaseNormalizer):
542
543
  )
543
544
  return [row[0] for row in _query_rows(self.client, sql)]
544
545
 
546
def _narrower_identifier_map(
    self,
    concept_ids: Sequence[int],
    max_depth: int | None,
    filter_ontologies: list[str] | None,
) -> pl.DataFrame:
    """Resolve descendant identifiers for many root concepts in one ClickHouse query.

    Args:
        concept_ids: Root concept IDs. Duplicates are dropped, keeping
            first-seen order.
        max_depth: Depth limit forwarded to the SQL builder; None adds no
            depth filter.
        filter_ontologies: Ontology names rendered into an IN-list for the
            SQL builder. None or an empty list adds no ontology filter.

    Returns:
        Frame with `root_id` (UInt64) and `identifier` (Utf8) columns. The
        frame is empty but still typed when there are no roots or the query
        returns no rows.
    """
    frame_schema = pl.Schema({"root_id": pl.UInt64, "identifier": pl.Utf8})

    # dict.fromkeys de-duplicates while preserving the caller's ordering.
    distinct_roots = list(dict.fromkeys(concept_ids))
    if not distinct_roots:
        return pl.DataFrame(schema=frame_schema)

    # The roots are shipped inline as a one-column relation of UInt64 values.
    root_tuples = [(root,) for root in distinct_roots]
    roots_relation = _values_table_sql((("concept_id", "UInt64"),), root_tuples)

    query = build_narrower_identifiers_map_sql(
        edges_table=self._table_ref(EDGES_TABLE),
        concepts_table=self._table_ref(CONCEPTS_TABLE),
        roots_source_sql=roots_relation,
        max_depth=max_depth,
        ontology_in_list=_values_as_in_list(filter_ontologies) if filter_ontologies else None,
    )

    fetched = _query_rows(self.client, query)
    if fetched:
        column_names = list(frame_schema.keys())
        return pl.DataFrame(fetched, schema=column_names, orient="row").cast(frame_schema)
    return pl.DataFrame(schema=frame_schema)
568
+
545
569
  def close(self) -> None:
546
570
  self.client.close()
547
571
 
@@ -28,6 +28,7 @@ from .normalizer_utils import (
28
28
  build_lookup_hit_columns,
29
29
  build_lookup_scored_cte,
30
30
  build_narrower_concepts_sql,
31
+ build_narrower_identifiers_map_sql,
31
32
  build_ontology_filter_clauses,
32
33
  build_query_rows,
33
34
  build_semantic_types_sql,
@@ -262,6 +263,28 @@ class DuckDBNormalizer(BaseNormalizer):
262
263
  )
263
264
  return [row[0] for row in self.con.execute(sql).fetchall()]
264
265
 
266
def _narrower_identifier_map(
    self,
    concept_ids: Sequence[int],
    max_depth: int | None,
    filter_ontologies: list[str] | None,
) -> pl.DataFrame:
    """Resolve descendant identifiers for many root concepts in one DuckDB query.

    Args:
        concept_ids: Root concept IDs. Duplicates are dropped, keeping
            first-seen order.
        max_depth: Depth limit forwarded to the SQL builder; None adds no
            depth filter.
        filter_ontologies: Ontology names rendered into an IN-list for the
            SQL builder. None or an empty list adds no ontology filter.

    Returns:
        Frame with `root_id` and `identifier` columns, cast to
        (CONCEPT_ID_DTYPE, Utf8). Returned empty when no roots are given.
    """
    frame_schema = pl.Schema({"root_id": CONCEPT_ID_DTYPE, "identifier": pl.Utf8})
    if not concept_ids:
        return pl.DataFrame(schema=frame_schema)

    # dict.fromkeys de-duplicates while preserving the caller's ordering.
    distinct_roots = list(dict.fromkeys(concept_ids))
    roots_frame = pl.DataFrame(
        {"concept_id": distinct_roots},
        schema={"concept_id": CONCEPT_ID_DTYPE},
    )

    query = build_narrower_identifiers_map_sql(
        edges_table=EDGES_TABLE,
        concepts_table=CONCEPTS_TABLE,
        roots_source_sql="SELECT concept_id FROM idroots",
        max_depth=max_depth,
        ontology_in_list=self._values_as_in_list(filter_ontologies) if filter_ontologies else None,
    )

    # The query reads the roots through the name "idroots", so the frame is
    # registered under that name for the duration of the execution.
    with self._registered_arrow_table("idroots", roots_frame):
        fetched = self.con.execute(query).pl()
        return fetched.cast(frame_schema)
287
+
265
288
  def close(self) -> None:
266
289
  """Close the database connection."""
267
290
  self.con.close()
@@ -63,6 +63,19 @@ class BaseNormalizer(ABC):
63
63
  ) -> Iterable[int]:
64
64
  """Walk the hierarchy edges and return descendant concept IDs (excluding the root)."""
65
65
 
66
@abstractmethod
def _narrower_identifier_map(
    self,
    concept_ids: Sequence[int],
    max_depth: int | None,
    filter_ontologies: list[str] | None,
) -> pl.DataFrame:
    """Expand many root concepts to their descendants' identifiers in a single query.

    Backends walk the hierarchy edges starting from every root at once and
    resolve the reached concepts to source-local identifiers inside the
    same query.

    Returns a frame with `root_id` and `identifier` columns: one row per
    (root gid, descendant identifier) pair. The root itself is excluded.
    """
78
+
66
79
  @abstractmethod
67
80
  def _concept_id_scope(self, concept_ids: Sequence[int]) -> AbstractContextManager[None]:
68
81
  """Create a scoped idmap table/view for concept metadata lookups."""
@@ -117,6 +130,9 @@ class BaseNormalizer(ABC):
117
130
  filter_tui_descendants_of: list[str] | None = None,
118
131
  allow_partial: bool = True,
119
132
  enrich_hits: bool = True,
133
+ expand_narrower: bool = False,
134
+ expand_max_depth: int | None = None,
135
+ expand_filter_ontologies: list[str] | None = None,
120
136
  ) -> pl.DataFrame:
121
137
  """
122
138
  Normalize input strings to ranked concepts.
@@ -138,6 +154,15 @@ class BaseNormalizer(ABC):
138
154
  enrich_hits: Populate pref_name, description, and synonyms for each hit.
139
155
  Disable this for faster bulk ID/rank normalization; metadata fields
140
156
  remain null in the returned hit structs.
157
+ expand_narrower: For each hit, walk the edges hierarchy and populate
158
+ the hit's `narrower` field with the source-local identifiers of
159
+ its descendant concepts (across all ontologies). Left null when
160
+ disabled or when the database has no edges.
161
+ expand_max_depth: Maximum hierarchy depth to expand (1 = direct
162
+ children only, None = all descendants). Only used when
163
+ expand_narrower is True.
164
+ expand_filter_ontologies: Only follow edges from these ontologies when
165
+ expanding. None follows every ontology's edges.
141
166
 
142
167
  Returns:
143
168
  DataFrame with columns: input_string, hits (list of match structs),
@@ -168,6 +193,9 @@ class BaseNormalizer(ABC):
168
193
  if enrich_hits:
169
194
  result = self._enrich_hits_with_concept_info(result)
170
195
 
196
+ if expand_narrower and self._has_edges:
197
+ result = self._expand_hits_with_narrower(result, expand_max_depth, expand_filter_ontologies)
198
+
171
199
  result = result.with_columns(pl.Series("input_string", strings_list))
172
200
  if synonyms is not None:
173
201
  result = result.with_columns(pl.Series("synonyms", syn_list, dtype=pl.List(pl.Utf8)))
@@ -241,6 +269,67 @@ class BaseNormalizer(ABC):
241
269
 
242
270
  return info_df.cast(schema)
243
271
 
272
def _expand_hits_with_narrower(
    self,
    result: pl.DataFrame,
    max_depth: int | None,
    filter_ontologies: list[str] | None,
) -> pl.DataFrame:
    """Fill the `narrower` field of every hit with its descendants' identifiers.

    Args:
        result: Frame with an `input_string` column and a `hits` list-of-struct
            column.
        max_depth: Depth limit passed through to the hierarchy walk.
        filter_ontologies: Ontology filter passed through to the hierarchy walk.

    Returns:
        `result` unchanged when it is empty or contains no hit with a gid.
        Otherwise a frame with `input_string` and `hits` columns, in the
        original row and hit order, where each hit's `narrower` is a list
        (empty when no descendants were found).
    """
    if result.is_empty():
        return result

    indexed = result.with_row_index("__row_idx")

    # Flatten to one row per hit and drop rows that carry no gid.
    flat_hits = indexed.explode("hits").unnest("hits")
    flat_hits = flat_hits.filter(pl.col("gid").is_not_null())

    # Remember each hit's position within its input row so the original
    # ranking can be restored after the join below.
    hit_position = pl.int_range(pl.len()).over("__row_idx").alias("__hit_idx")
    flat_hits = flat_hits.with_columns(hit_position)
    if flat_hits.is_empty():
        return result

    distinct_gids = flat_hits.get_column("gid").unique(maintain_order=True).to_list()
    lookup = self._narrower_identifiers_frame(distinct_gids, max_depth, filter_ontologies)

    # Swap the existing `narrower` column for the looked-up one; hits with no
    # match in the lookup get an empty list instead of null.
    with_narrower = flat_hits.drop("narrower").join(lookup, on="gid", how="left")
    with_narrower = with_narrower.with_columns(
        pl.col("narrower").fill_null([]).cast(pl.List(pl.Utf8))
    )
    with_narrower = with_narrower.sort(["__row_idx", "__hit_idx"])

    # Re-pack the flattened hits into one list of structs per input row.
    regrouped = with_narrower.group_by("__row_idx", maintain_order=True).agg(
        pl.struct(_HIT_FIELD_NAMES).alias("hits")
    )

    # Left-join onto the full row index so inputs without hits are kept and
    # receive an empty hits list.
    rebuilt = indexed.select("__row_idx", "input_string").join(
        regrouped, on="__row_idx", how="left"
    )
    rebuilt = rebuilt.sort("__row_idx").drop("__row_idx")
    return rebuilt.with_columns(pl.col("hits").fill_null([]).cast(pl.List(HIT_STRUCT_TYPE)))
310
+
311
def _narrower_identifiers_frame(
    self,
    root_ids: Sequence[int],
    max_depth: int | None,
    filter_ontologies: list[str] | None,
) -> pl.DataFrame:
    """Build a (gid, narrower) frame listing each root's descendant identifiers.

    Args:
        root_ids: Root gids to expand.
        max_depth: Depth limit passed to the backend's identifier map query.
        filter_ontologies: Ontology filter passed to the backend's identifier
            map query.

    Returns:
        One row per root that has descendants: `gid` (CONCEPT_ID_DTYPE) and
        `narrower`, the root's descendant identifiers in sorted order. A
        typed empty frame when there are no roots or no descendants at all.
    """
    frame_schema = {"gid": CONCEPT_ID_DTYPE, "narrower": pl.List(pl.Utf8)}
    if not root_ids:
        return pl.DataFrame(schema=frame_schema)

    pairs = self._narrower_identifier_map(list(root_ids), max_depth, filter_ontologies)
    if pairs.is_empty():
        return pl.DataFrame(schema=frame_schema)

    # Collapse the (root_id, identifier) pairs into one sorted list per root.
    sorted_identifiers = pl.col("identifier").sort().alias("narrower")
    per_root = pairs.group_by("root_id").agg(sorted_identifiers)
    per_root = per_root.rename({"root_id": "gid"})
    return per_root.with_columns(pl.col("gid").cast(CONCEPT_ID_DTYPE))
332
+
244
333
  def _resolve_identifiers(self, identifiers: Sequence[str]) -> dict[str, int]:
245
334
  """
246
335
  Map source-local identifiers to internal gids via the concepts table.
@@ -544,3 +544,46 @@ def build_narrower_concepts_sql(
544
544
  WHERE concept_id != {root_literal}
545
545
  """
546
546
  ).strip()
547
+
548
+
549
+ def build_narrower_identifiers_map_sql(
550
+ *,
551
+ edges_table: str,
552
+ concepts_table: str,
553
+ roots_source_sql: str,
554
+ max_depth: int | None,
555
+ ontology_in_list: str | None,
556
+ ) -> str:
557
+ """
558
+ Build a recursive descendant walk seeded from many roots at once, resolving
559
+ descendants to their source-local identifiers in the same query.
560
+
561
+ `roots_source_sql` must be a relation yielding a `concept_id` column for the
562
+ starting concepts. The result has (root_id, identifier) rows mapping each
563
+ root gid to its descendants' identifiers, excluding the root itself.
564
+ """
565
+ recurse_filters: list[str] = []
566
+ if max_depth is not None:
567
+ recurse_filters.append(f"walk.depth < {int(max_depth)}")
568
+ if ontology_in_list:
569
+ recurse_filters.append(f"e.ontology IN ({ontology_in_list})")
570
+ recurse_where = f"WHERE {' AND '.join(recurse_filters)}" if recurse_filters else ""
571
+ return dedent(
572
+ f"""
573
+ WITH RECURSIVE walk AS (
574
+ SELECT concept_id AS root_id, concept_id, 0 AS depth
575
+ FROM ({roots_source_sql}) AS roots
576
+ UNION ALL
577
+ SELECT walk.root_id, e.child_id AS concept_id, walk.depth + 1 AS depth
578
+ FROM walk
579
+ JOIN {edges_table} AS e
580
+ ON e.parent_id = walk.concept_id
581
+ {recurse_where}
582
+ )
583
+ SELECT DISTINCT walk.root_id, c.identifier
584
+ FROM walk
585
+ JOIN {concepts_table} AS c
586
+ ON c.concept_id = walk.concept_id
587
+ WHERE walk.concept_id != walk.root_id
588
+ """
589
+ ).strip()
@@ -176,7 +176,9 @@ DEFAULT_ONTOLOGY_SYNONYM_RANK = 1
176
176
 
177
177
  # Polars struct type for normalized hits. The first nine fields are produced by
178
178
  # the lookup query (see LOOKUP_HIT_FIELDS in normalizer_utils); pref_name,
179
- # description, and synonyms are filled afterwards by hit enrichment.
179
+ # description, and synonyms are filled afterwards by hit enrichment. `narrower`
180
+ # is filled only when normalize(expand_narrower=True) walks the edges hierarchy;
181
+ # it stays null otherwise.
180
182
  HIT_STRUCT_TYPE = pl.Struct(
181
183
  {
182
184
  "gid": CONCEPT_ID_DTYPE,
@@ -191,6 +193,7 @@ HIT_STRUCT_TYPE = pl.Struct(
191
193
  "pref_name": pl.Utf8,
192
194
  "description": pl.Utf8,
193
195
  "synonyms": pl.List(pl.Utf8),
196
+ "narrower": pl.List(pl.Utf8),
194
197
  }
195
198
  )
196
199
 
File without changes