norm_toolkit 1.9.1__tar.gz → 1.9.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/PKG-INFO +1 -1
- {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/pyproject.toml +1 -1
- {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/clickhouse_backend.py +1 -1
- {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/duckdb_backend.py +1 -1
- {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/normalizer_base.py +10 -8
- {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/normalizer_utils.py +3 -3
- {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/schema.py +10 -1
- {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/README.md +0 -0
- {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/__init__.py +0 -0
- {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/__init__.py +0 -0
- {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/build_merged.py +0 -0
- {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/build_ontology.py +0 -0
- {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/build_umls.py +0 -0
- {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/constants.py +0 -0
- {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/models.py +0 -0
- {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/normalizer.py +0 -0
- {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/normalizer_cache.py +0 -0
- {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/normalizer_postgres.py +0 -0
- {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/normalizer_utils.py +0 -0
- {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/utils.py +0 -0
- {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/__init__.py +0 -0
- {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/build_merged.py +0 -0
- {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/clickhouse_common.py +0 -0
- {norm_toolkit-1.9.1 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/clickhouse_upload.py +0 -0
|
@@ -549,7 +549,7 @@ class ClickHouseNormalizer(BaseNormalizer):
|
|
|
549
549
|
max_depth: int | None,
|
|
550
550
|
filter_ontologies: list[str] | None,
|
|
551
551
|
) -> pl.DataFrame:
|
|
552
|
-
schema = pl.Schema({"root_id": pl.UInt64, "identifier": pl.Utf8})
|
|
552
|
+
schema = pl.Schema({"root_id": pl.UInt64, "gid": pl.UInt64, "identifier": pl.Utf8})
|
|
553
553
|
unique_ids = list(dict.fromkeys(concept_ids))
|
|
554
554
|
if not unique_ids:
|
|
555
555
|
return pl.DataFrame(schema=schema)
|
|
@@ -269,7 +269,7 @@ class DuckDBNormalizer(BaseNormalizer):
|
|
|
269
269
|
max_depth: int | None,
|
|
270
270
|
filter_ontologies: list[str] | None,
|
|
271
271
|
) -> pl.DataFrame:
|
|
272
|
-
schema = pl.Schema({"root_id": CONCEPT_ID_DTYPE, "identifier": pl.Utf8})
|
|
272
|
+
schema = pl.Schema({"root_id": CONCEPT_ID_DTYPE, "gid": CONCEPT_ID_DTYPE, "identifier": pl.Utf8})
|
|
273
273
|
if not concept_ids:
|
|
274
274
|
return pl.DataFrame(schema=schema)
|
|
275
275
|
roots_df = pl.DataFrame(
|
|
@@ -16,7 +16,7 @@ from .normalizer_utils import (
|
|
|
16
16
|
build_normalized_query_map,
|
|
17
17
|
canonicalize_semantic_type_ids,
|
|
18
18
|
)
|
|
19
|
-
from .schema import CONCEPT_ID_DTYPE, HIT_STRUCT_TYPE, ConceptInfo, SemanticType
|
|
19
|
+
from .schema import CONCEPT_ID_DTYPE, HIT_STRUCT_TYPE, NARROWER_STRUCT_TYPE, ConceptInfo, SemanticType
|
|
20
20
|
|
|
21
21
|
_HIT_FIELD_NAMES = tuple(HIT_STRUCT_TYPE.to_schema())
|
|
22
22
|
|
|
@@ -72,8 +72,9 @@ class BaseNormalizer(ABC):
|
|
|
72
72
|
) -> pl.DataFrame:
|
|
73
73
|
"""Walk the edges from many roots at once, resolving identifiers in-query.
|
|
74
74
|
|
|
75
|
-
Returns a frame with `root_id` and `identifier` columns mapping
|
|
76
|
-
gid to its descendants' source-local identifiers
|
|
75
|
+
Returns a frame with `root_id`, `gid`, and `identifier` columns mapping
|
|
76
|
+
each root gid to its descendants' gids and source-local identifiers
|
|
77
|
+
(excluding the root).
|
|
77
78
|
"""
|
|
78
79
|
|
|
79
80
|
@abstractmethod
|
|
@@ -294,7 +295,7 @@ class BaseNormalizer(ABC):
|
|
|
294
295
|
expanded_hit_rows = (
|
|
295
296
|
hit_rows.drop("narrower")
|
|
296
297
|
.join(narrower_df, on="gid", how="left")
|
|
297
|
-
.with_columns(pl.col("narrower").fill_null([]).cast(pl.List(pl.Utf8)))
|
|
298
|
+
.with_columns(pl.col("narrower").fill_null([]).cast(pl.List(NARROWER_STRUCT_TYPE)))
|
|
298
299
|
.sort(["__row_idx", "__hit_idx"])
|
|
299
300
|
)
|
|
300
301
|
expanded_hits = expanded_hit_rows.group_by("__row_idx", maintain_order=True).agg(
|
|
@@ -314,8 +315,8 @@ class BaseNormalizer(ABC):
|
|
|
314
315
|
max_depth: int | None,
|
|
315
316
|
filter_ontologies: list[str] | None,
|
|
316
317
|
) -> pl.DataFrame:
|
|
317
|
-
"""Return (gid, narrower) rows mapping each root gid to descendant identifiers."""
|
|
318
|
-
empty = pl.DataFrame(schema={"gid": CONCEPT_ID_DTYPE, "narrower": pl.List(pl.Utf8)})
|
|
318
|
+
"""Return (gid, narrower) rows mapping each root gid to descendant concepts."""
|
|
319
|
+
empty = pl.DataFrame(schema={"gid": CONCEPT_ID_DTYPE, "narrower": pl.List(NARROWER_STRUCT_TYPE)})
|
|
319
320
|
if not root_ids:
|
|
320
321
|
return empty
|
|
321
322
|
|
|
@@ -324,8 +325,9 @@ class BaseNormalizer(ABC):
|
|
|
324
325
|
return empty
|
|
325
326
|
|
|
326
327
|
return (
|
|
327
|
-
map_df.
|
|
328
|
-
.
|
|
328
|
+
map_df.sort("root_id", "identifier")
|
|
329
|
+
.group_by("root_id", maintain_order=True)
|
|
330
|
+
.agg(pl.struct("gid", "identifier").alias("narrower"))
|
|
329
331
|
.rename({"root_id": "gid"})
|
|
330
332
|
.with_columns(pl.col("gid").cast(CONCEPT_ID_DTYPE))
|
|
331
333
|
)
|
|
@@ -559,8 +559,8 @@ def build_narrower_identifiers_map_sql(
|
|
|
559
559
|
descendants to their source-local identifiers in the same query.
|
|
560
560
|
|
|
561
561
|
`roots_source_sql` must be a relation yielding a `concept_id` column for the
|
|
562
|
-
starting concepts. The result has (root_id, identifier) rows mapping each
|
|
563
|
-
root gid to its descendants' identifiers, excluding the root itself.
|
|
562
|
+
starting concepts. The result has (root_id, gid, identifier) rows mapping each
|
|
563
|
+
root gid to its descendants' gids and identifiers, excluding the root itself.
|
|
564
564
|
"""
|
|
565
565
|
recurse_filters: list[str] = []
|
|
566
566
|
if max_depth is not None:
|
|
@@ -580,7 +580,7 @@ def build_narrower_identifiers_map_sql(
|
|
|
580
580
|
ON e.parent_id = walk.concept_id
|
|
581
581
|
{recurse_where}
|
|
582
582
|
)
|
|
583
|
-
SELECT DISTINCT walk.root_id, c.identifier
|
|
583
|
+
SELECT DISTINCT walk.root_id, walk.concept_id AS gid, c.identifier
|
|
584
584
|
FROM walk
|
|
585
585
|
JOIN {concepts_table} AS c
|
|
586
586
|
ON c.concept_id = walk.concept_id
|
|
@@ -174,6 +174,15 @@ ONTOLOGY_PREF_STT = "PF"
|
|
|
174
174
|
DEFAULT_ONTOLOGY_PREF_RANK = 3
|
|
175
175
|
DEFAULT_ONTOLOGY_SYNONYM_RANK = 1
|
|
176
176
|
|
|
177
|
+
# Struct for a single expanded descendant concept, carrying both the internal
|
|
178
|
+
# dense gid and the source-local identifier.
|
|
179
|
+
NARROWER_STRUCT_TYPE = pl.Struct(
|
|
180
|
+
{
|
|
181
|
+
"gid": CONCEPT_ID_DTYPE,
|
|
182
|
+
"identifier": pl.Utf8,
|
|
183
|
+
}
|
|
184
|
+
)
|
|
185
|
+
|
|
177
186
|
# Polars struct type for normalized hits. The first nine fields are produced by
|
|
178
187
|
# the lookup query (see LOOKUP_HIT_FIELDS in normalizer_utils); pref_name,
|
|
179
188
|
# description, and synonyms are filled afterwards by hit enrichment. `narrower`
|
|
@@ -193,7 +202,7 @@ HIT_STRUCT_TYPE = pl.Struct(
|
|
|
193
202
|
"pref_name": pl.Utf8,
|
|
194
203
|
"description": pl.Utf8,
|
|
195
204
|
"synonyms": pl.List(pl.Utf8),
|
|
196
|
-
"narrower": pl.List(pl.Utf8),
|
|
205
|
+
"narrower": pl.List(NARROWER_STRUCT_TYPE),
|
|
197
206
|
}
|
|
198
207
|
)
|
|
199
208
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|