norm_toolkit 1.9.1__tar.gz → 1.9.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/PKG-INFO +1 -1
  2. {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/pyproject.toml +1 -1
  3. {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/clickhouse_backend.py +1 -1
  4. {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/duckdb_backend.py +1 -1
  5. {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/normalizer_base.py +10 -8
  6. {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/normalizer_utils.py +3 -3
  7. {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/schema.py +10 -1
  8. {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/README.md +0 -0
  9. {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/__init__.py +0 -0
  10. {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/__init__.py +0 -0
  11. {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/build_merged.py +0 -0
  12. {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/build_ontology.py +0 -0
  13. {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/build_umls.py +0 -0
  14. {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/constants.py +0 -0
  15. {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/models.py +0 -0
  16. {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/normalizer.py +0 -0
  17. {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/normalizer_cache.py +0 -0
  18. {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/normalizer_postgres.py +0 -0
  19. {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/normalizer_utils.py +0 -0
  20. {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/utils.py +0 -0
  21. {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/__init__.py +0 -0
  22. {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/build_merged.py +0 -0
  23. {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/clickhouse_common.py +0 -0
  24. {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/clickhouse_upload.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: norm_toolkit
3
- Version: 1.9.1
3
+ Version: 1.9.2
4
4
  Summary: Toolkit to normalize text to UMLS / ontologies
5
5
  Author: Haydn Jones
6
6
  Author-email: Haydn Jones <haydnjonest@gmail.com>
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "norm_toolkit"
3
- version = "1.9.1"
3
+ version = "1.9.2"
4
4
  description = "Toolkit to normalize text to UMLS / ontologies"
5
5
  readme = "README.md"
6
6
  authors = [{ name = "Haydn Jones", email = "haydnjonest@gmail.com" }]
@@ -549,7 +549,7 @@ class ClickHouseNormalizer(BaseNormalizer):
549
549
  max_depth: int | None,
550
550
  filter_ontologies: list[str] | None,
551
551
  ) -> pl.DataFrame:
552
- schema = pl.Schema({"root_id": pl.UInt64, "identifier": pl.Utf8})
552
+ schema = pl.Schema({"root_id": pl.UInt64, "gid": pl.UInt64, "identifier": pl.Utf8})
553
553
  unique_ids = list(dict.fromkeys(concept_ids))
554
554
  if not unique_ids:
555
555
  return pl.DataFrame(schema=schema)
@@ -269,7 +269,7 @@ class DuckDBNormalizer(BaseNormalizer):
269
269
  max_depth: int | None,
270
270
  filter_ontologies: list[str] | None,
271
271
  ) -> pl.DataFrame:
272
- schema = pl.Schema({"root_id": CONCEPT_ID_DTYPE, "identifier": pl.Utf8})
272
+ schema = pl.Schema({"root_id": CONCEPT_ID_DTYPE, "gid": CONCEPT_ID_DTYPE, "identifier": pl.Utf8})
273
273
  if not concept_ids:
274
274
  return pl.DataFrame(schema=schema)
275
275
  roots_df = pl.DataFrame(
@@ -16,7 +16,7 @@ from .normalizer_utils import (
16
16
  build_normalized_query_map,
17
17
  canonicalize_semantic_type_ids,
18
18
  )
19
- from .schema import CONCEPT_ID_DTYPE, HIT_STRUCT_TYPE, ConceptInfo, SemanticType
19
+ from .schema import CONCEPT_ID_DTYPE, HIT_STRUCT_TYPE, NARROWER_STRUCT_TYPE, ConceptInfo, SemanticType
20
20
 
21
21
  _HIT_FIELD_NAMES = tuple(HIT_STRUCT_TYPE.to_schema())
22
22
 
@@ -72,8 +72,9 @@ class BaseNormalizer(ABC):
72
72
  ) -> pl.DataFrame:
73
73
  """Walk the edges from many roots at once, resolving identifiers in-query.
74
74
 
75
- Returns a frame with `root_id` and `identifier` columns mapping each root
76
- gid to its descendants' source-local identifiers (excluding the root).
75
+ Returns a frame with `root_id`, `gid`, and `identifier` columns mapping
76
+ each root gid to its descendants' gids and source-local identifiers
77
+ (excluding the root).
77
78
  """
78
79
 
79
80
  @abstractmethod
@@ -294,7 +295,7 @@ class BaseNormalizer(ABC):
294
295
  expanded_hit_rows = (
295
296
  hit_rows.drop("narrower")
296
297
  .join(narrower_df, on="gid", how="left")
297
- .with_columns(pl.col("narrower").fill_null([]).cast(pl.List(pl.Utf8)))
298
+ .with_columns(pl.col("narrower").fill_null([]).cast(pl.List(NARROWER_STRUCT_TYPE)))
298
299
  .sort(["__row_idx", "__hit_idx"])
299
300
  )
300
301
  expanded_hits = expanded_hit_rows.group_by("__row_idx", maintain_order=True).agg(
@@ -314,8 +315,8 @@ class BaseNormalizer(ABC):
314
315
  max_depth: int | None,
315
316
  filter_ontologies: list[str] | None,
316
317
  ) -> pl.DataFrame:
317
- """Return (gid, narrower) rows mapping each root gid to descendant identifiers."""
318
- empty = pl.DataFrame(schema={"gid": CONCEPT_ID_DTYPE, "narrower": pl.List(pl.Utf8)})
318
+ """Return (gid, narrower) rows mapping each root gid to descendant concepts."""
319
+ empty = pl.DataFrame(schema={"gid": CONCEPT_ID_DTYPE, "narrower": pl.List(NARROWER_STRUCT_TYPE)})
319
320
  if not root_ids:
320
321
  return empty
321
322
 
@@ -324,8 +325,9 @@ class BaseNormalizer(ABC):
324
325
  return empty
325
326
 
326
327
  return (
327
- map_df.group_by("root_id")
328
- .agg(pl.col("identifier").sort().alias("narrower"))
328
+ map_df.sort("root_id", "identifier")
329
+ .group_by("root_id", maintain_order=True)
330
+ .agg(pl.struct("gid", "identifier").alias("narrower"))
329
331
  .rename({"root_id": "gid"})
330
332
  .with_columns(pl.col("gid").cast(CONCEPT_ID_DTYPE))
331
333
  )
@@ -559,8 +559,8 @@ def build_narrower_identifiers_map_sql(
559
559
  descendants to their source-local identifiers in the same query.
560
560
 
561
561
  `roots_source_sql` must be a relation yielding a `concept_id` column for the
562
- starting concepts. The result has (root_id, identifier) rows mapping each
563
- root gid to its descendants' identifiers, excluding the root itself.
562
+ starting concepts. The result has (root_id, gid, identifier) rows mapping each
563
+ root gid to its descendants' gids and identifiers, excluding the root itself.
564
564
  """
565
565
  recurse_filters: list[str] = []
566
566
  if max_depth is not None:
@@ -580,7 +580,7 @@ def build_narrower_identifiers_map_sql(
580
580
  ON e.parent_id = walk.concept_id
581
581
  {recurse_where}
582
582
  )
583
- SELECT DISTINCT walk.root_id, c.identifier
583
+ SELECT DISTINCT walk.root_id, walk.concept_id AS gid, c.identifier
584
584
  FROM walk
585
585
  JOIN {concepts_table} AS c
586
586
  ON c.concept_id = walk.concept_id
@@ -174,6 +174,15 @@ ONTOLOGY_PREF_STT = "PF"
174
174
  DEFAULT_ONTOLOGY_PREF_RANK = 3
175
175
  DEFAULT_ONTOLOGY_SYNONYM_RANK = 1
176
176
 
177
+ # Struct for a single expanded descendant concept, carrying both the internal
178
+ # dense gid and the source-local identifier.
179
+ NARROWER_STRUCT_TYPE = pl.Struct(
180
+ {
181
+ "gid": CONCEPT_ID_DTYPE,
182
+ "identifier": pl.Utf8,
183
+ }
184
+ )
185
+
177
186
  # Polars struct type for normalized hits. The first nine fields are produced by
178
187
  # the lookup query (see LOOKUP_HIT_FIELDS in normalizer_utils); pref_name,
179
188
  # description, and synonyms are filled afterwards by hit enrichment. `narrower`
@@ -193,7 +202,7 @@ HIT_STRUCT_TYPE = pl.Struct(
193
202
  "pref_name": pl.Utf8,
194
203
  "description": pl.Utf8,
195
204
  "synonyms": pl.List(pl.Utf8),
196
- "narrower": pl.List(pl.Utf8),
205
+ "narrower": pl.List(NARROWER_STRUCT_TYPE),
197
206
  }
198
207
  )
199
208
 
File without changes