norm_toolkit 1.9.0__tar.gz → 1.9.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/PKG-INFO +1 -1
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/pyproject.toml +1 -1
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/clickhouse_backend.py +24 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/duckdb_backend.py +23 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/normalizer_base.py +92 -1
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/normalizer_utils.py +43 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/schema.py +13 -1
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/README.md +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/src/norm_toolkit/__init__.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/__init__.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/build_merged.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/build_ontology.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/build_umls.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/constants.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/models.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/normalizer.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/normalizer_cache.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/normalizer_postgres.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/normalizer_utils.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/src/norm_toolkit/v1/utils.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/__init__.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/build_merged.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/clickhouse_common.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.2}/src/norm_toolkit/v2/clickhouse_upload.py +0 -0
|
@@ -38,6 +38,7 @@ from .normalizer_utils import (
|
|
|
38
38
|
build_lookup_hit_columns,
|
|
39
39
|
build_lookup_scored_cte,
|
|
40
40
|
build_narrower_concepts_sql,
|
|
41
|
+
build_narrower_identifiers_map_sql,
|
|
41
42
|
build_ontology_filter_clauses,
|
|
42
43
|
build_query_rows,
|
|
43
44
|
build_semantic_types_sql,
|
|
@@ -542,6 +543,29 @@ class ClickHouseNormalizer(BaseNormalizer):
|
|
|
542
543
|
)
|
|
543
544
|
return [row[0] for row in _query_rows(self.client, sql)]
|
|
544
545
|
|
|
546
|
+
def _narrower_identifier_map(
    self,
    concept_ids: Sequence[int],
    max_depth: int | None,
    filter_ontologies: list[str] | None,
) -> pl.DataFrame:
    """Resolve the descendants of many root concepts with one ClickHouse query.

    Args:
        concept_ids: Root concept gids. Duplicates are collapsed, keeping the
            first occurrence of each id.
        max_depth: Depth limit handed to the SQL builder (None = no limit).
        filter_ontologies: Ontologies handed to the SQL builder as an IN-list;
            a falsy value (None or empty list) disables the filter.

    Returns:
        A frame with `root_id` and `gid` (UInt64) plus `identifier` (Utf8)
        columns. It is empty, with that same schema, when no roots are given
        or the query yields no rows.
    """
    frame_schema = pl.Schema({"root_id": pl.UInt64, "gid": pl.UInt64, "identifier": pl.Utf8})

    roots = list(dict.fromkeys(concept_ids))
    if not roots:
        return pl.DataFrame(schema=frame_schema)

    # The roots travel inside the statement as a single-column relation.
    inline_roots_sql = _values_table_sql(
        (("concept_id", "UInt64"),),
        [(root,) for root in roots],
    )
    ontology_filter = _values_as_in_list(filter_ontologies) if filter_ontologies else None
    query = build_narrower_identifiers_map_sql(
        edges_table=self._table_ref(EDGES_TABLE),
        concepts_table=self._table_ref(CONCEPTS_TABLE),
        roots_source_sql=inline_roots_sql,
        max_depth=max_depth,
        ontology_in_list=ontology_filter,
    )

    fetched = _query_rows(self.client, query)
    if fetched:
        column_names = list(frame_schema.keys())
        return pl.DataFrame(fetched, schema=column_names, orient="row").cast(frame_schema)
    # No rows: still hand back a correctly typed, empty frame.
    return pl.DataFrame(schema=frame_schema)
|
|
568
|
+
|
|
545
569
|
def close(self) -> None:
|
|
546
570
|
self.client.close()
|
|
547
571
|
|
|
@@ -28,6 +28,7 @@ from .normalizer_utils import (
|
|
|
28
28
|
build_lookup_hit_columns,
|
|
29
29
|
build_lookup_scored_cte,
|
|
30
30
|
build_narrower_concepts_sql,
|
|
31
|
+
build_narrower_identifiers_map_sql,
|
|
31
32
|
build_ontology_filter_clauses,
|
|
32
33
|
build_query_rows,
|
|
33
34
|
build_semantic_types_sql,
|
|
@@ -262,6 +263,28 @@ class DuckDBNormalizer(BaseNormalizer):
|
|
|
262
263
|
)
|
|
263
264
|
return [row[0] for row in self.con.execute(sql).fetchall()]
|
|
264
265
|
|
|
266
|
+
def _narrower_identifier_map(
    self,
    concept_ids: Sequence[int],
    max_depth: int | None,
    filter_ontologies: list[str] | None,
) -> pl.DataFrame:
    """Resolve the descendants of many root concepts with one DuckDB query.

    Args:
        concept_ids: Root concept gids. Duplicates are collapsed, keeping the
            first occurrence of each id.
        max_depth: Depth limit handed to the SQL builder (None = no limit).
        filter_ontologies: Ontologies handed to the SQL builder as an IN-list;
            a falsy value (None or empty list) disables the filter.

    Returns:
        A frame with `root_id`, `gid` (both CONCEPT_ID_DTYPE) and `identifier`
        (Utf8) columns; empty with that schema when no roots are given.
    """
    out_schema = pl.Schema({"root_id": CONCEPT_ID_DTYPE, "gid": CONCEPT_ID_DTYPE, "identifier": pl.Utf8})
    if not concept_ids:
        return pl.DataFrame(schema=out_schema)

    distinct_roots = list(dict.fromkeys(concept_ids))
    roots_frame = pl.DataFrame(
        {"concept_id": distinct_roots},
        schema={"concept_id": CONCEPT_ID_DTYPE},
    )
    ontology_filter = self._values_as_in_list(filter_ontologies) if filter_ontologies else None
    query = build_narrower_identifiers_map_sql(
        edges_table=EDGES_TABLE,
        concepts_table=CONCEPTS_TABLE,
        roots_source_sql="SELECT concept_id FROM idroots",
        max_depth=max_depth,
        ontology_in_list=ontology_filter,
    )

    # The query reads its roots from `idroots`, which is only registered for
    # the duration of this context, so execute and materialise inside it.
    with self._registered_arrow_table("idroots", roots_frame):
        fetched = self.con.execute(query).pl()
        return fetched.cast(out_schema)
|
|
287
|
+
|
|
265
288
|
def close(self) -> None:
|
|
266
289
|
"""Close the database connection."""
|
|
267
290
|
self.con.close()
|
|
@@ -16,7 +16,7 @@ from .normalizer_utils import (
|
|
|
16
16
|
build_normalized_query_map,
|
|
17
17
|
canonicalize_semantic_type_ids,
|
|
18
18
|
)
|
|
19
|
-
from .schema import CONCEPT_ID_DTYPE, HIT_STRUCT_TYPE, ConceptInfo, SemanticType
|
|
19
|
+
from .schema import CONCEPT_ID_DTYPE, HIT_STRUCT_TYPE, NARROWER_STRUCT_TYPE, ConceptInfo, SemanticType
|
|
20
20
|
|
|
21
21
|
_HIT_FIELD_NAMES = tuple(HIT_STRUCT_TYPE.to_schema())
|
|
22
22
|
|
|
@@ -63,6 +63,20 @@ class BaseNormalizer(ABC):
|
|
|
63
63
|
) -> Iterable[int]:
|
|
64
64
|
"""Walk the hierarchy edges and return descendant concept IDs (excluding the root)."""
|
|
65
65
|
|
|
66
|
+
@abstractmethod
def _narrower_identifier_map(
    self,
    concept_ids: Sequence[int],
    max_depth: int | None,
    filter_ontologies: list[str] | None,
) -> pl.DataFrame:
    """Expand many root concepts to their descendants in one backend query.

    Backends walk the hierarchy edges starting from every id in
    `concept_ids` and resolve each descendant's source-local identifier
    as part of the same query.

    Args:
        concept_ids: Root concept gids to start the walk from.
        max_depth: Maximum number of edge hops to follow; None means unlimited.
        filter_ontologies: Restrict the walk to edges of these ontologies;
            None follows every ontology's edges.

    Returns:
        A frame with `root_id`, `gid` and `identifier` columns, one row per
        (root, descendant) pair. The root itself is never listed as its own
        descendant.
    """
|
|
79
|
+
|
|
66
80
|
@abstractmethod
|
|
67
81
|
def _concept_id_scope(self, concept_ids: Sequence[int]) -> AbstractContextManager[None]:
|
|
68
82
|
"""Create a scoped idmap table/view for concept metadata lookups."""
|
|
@@ -117,6 +131,9 @@ class BaseNormalizer(ABC):
|
|
|
117
131
|
filter_tui_descendants_of: list[str] | None = None,
|
|
118
132
|
allow_partial: bool = True,
|
|
119
133
|
enrich_hits: bool = True,
|
|
134
|
+
expand_narrower: bool = False,
|
|
135
|
+
expand_max_depth: int | None = None,
|
|
136
|
+
expand_filter_ontologies: list[str] | None = None,
|
|
120
137
|
) -> pl.DataFrame:
|
|
121
138
|
"""
|
|
122
139
|
Normalize input strings to ranked concepts.
|
|
@@ -138,6 +155,15 @@ class BaseNormalizer(ABC):
|
|
|
138
155
|
enrich_hits: Populate pref_name, description, and synonyms for each hit.
|
|
139
156
|
Disable this for faster bulk ID/rank normalization; metadata fields
|
|
140
157
|
remain null in the returned hit structs.
|
|
158
|
+
expand_narrower: For each hit, walk the edges hierarchy and populate
|
|
159
|
+
the hit's `narrower` field with the source-local identifiers of
|
|
160
|
+
its descendant concepts (across all ontologies). Left null when
|
|
161
|
+
disabled or when the database has no edges.
|
|
162
|
+
expand_max_depth: Maximum hierarchy depth to expand (1 = direct
|
|
163
|
+
children only, None = all descendants). Only used when
|
|
164
|
+
expand_narrower is True.
|
|
165
|
+
expand_filter_ontologies: Only follow edges from these ontologies when
|
|
166
|
+
expanding. None follows every ontology's edges.
|
|
141
167
|
|
|
142
168
|
Returns:
|
|
143
169
|
DataFrame with columns: input_string, hits (list of match structs),
|
|
@@ -168,6 +194,9 @@ class BaseNormalizer(ABC):
|
|
|
168
194
|
if enrich_hits:
|
|
169
195
|
result = self._enrich_hits_with_concept_info(result)
|
|
170
196
|
|
|
197
|
+
if expand_narrower and self._has_edges:
|
|
198
|
+
result = self._expand_hits_with_narrower(result, expand_max_depth, expand_filter_ontologies)
|
|
199
|
+
|
|
171
200
|
result = result.with_columns(pl.Series("input_string", strings_list))
|
|
172
201
|
if synonyms is not None:
|
|
173
202
|
result = result.with_columns(pl.Series("synonyms", syn_list, dtype=pl.List(pl.Utf8)))
|
|
@@ -241,6 +270,68 @@ class BaseNormalizer(ABC):
|
|
|
241
270
|
|
|
242
271
|
return info_df.cast(schema)
|
|
243
272
|
|
|
273
|
+
def _expand_hits_with_narrower(
    self,
    result: pl.DataFrame,
    max_depth: int | None,
    filter_ontologies: list[str] | None,
) -> pl.DataFrame:
    """Attach descendant identifiers to every hit as its `narrower` list.

    Args:
        result: Normalization output with `input_string` and `hits` columns.
        max_depth: Depth limit forwarded to the descendant lookup.
        filter_ontologies: Ontology filter forwarded to the descendant lookup.

    Returns:
        `result` itself when it is empty or holds no hit with a non-null gid.
        Otherwise a frame with `input_string` and `hits` in the original row
        order, where each hit's `narrower` is a (possibly empty) list and rows
        without hits carry an empty `hits` list.
    """
    if result.is_empty():
        return result

    # Tag rows so they can be reassembled in order after the explode/join.
    indexed = result.with_row_index("__row_idx")

    # One row per hit; rows whose `hits` list was empty explode to a null hit
    # and are dropped by the gid filter.
    flat_hits = indexed.explode("hits").unnest("hits")
    flat_hits = flat_hits.filter(pl.col("gid").is_not_null())
    flat_hits = flat_hits.with_columns(
        pl.int_range(pl.len()).over("__row_idx").alias("__hit_idx")
    )
    if flat_hits.is_empty():
        return result

    distinct_gids = flat_hits.get_column("gid").unique(maintain_order=True).to_list()
    descendants = self._narrower_identifiers_frame(distinct_gids, max_depth, filter_ontologies)

    # Hits without descendants get an empty list instead of null.
    narrower_filled = pl.col("narrower").fill_null([]).cast(pl.List(NARROWER_STRUCT_TYPE))
    with_narrower = (
        flat_hits.drop("narrower")
        .join(descendants, on="gid", how="left")
        .with_columns(narrower_filled)
        .sort(["__row_idx", "__hit_idx"])
    )

    # Fold the flat hit rows back into one struct list per input row.
    regrouped = with_narrower.group_by("__row_idx", maintain_order=True).agg(
        pl.struct(_HIT_FIELD_NAMES).alias("hits")
    )

    hits_filled = pl.col("hits").fill_null([]).cast(pl.List(HIT_STRUCT_TYPE))
    rebuilt = indexed.select("__row_idx", "input_string").join(
        regrouped, on="__row_idx", how="left"
    )
    return rebuilt.sort("__row_idx").drop("__row_idx").with_columns(hits_filled)
|
|
311
|
+
|
|
312
|
+
def _narrower_identifiers_frame(
    self,
    root_ids: Sequence[int],
    max_depth: int | None,
    filter_ontologies: list[str] | None,
) -> pl.DataFrame:
    """Return (gid, narrower) rows mapping each root gid to its descendants.

    Args:
        root_ids: Root concept gids to expand.
        max_depth: Depth limit forwarded to the backend walk (None = no limit).
        filter_ontologies: Ontology filter forwarded to the backend walk.

    Returns:
        A frame with a `gid` column (CONCEPT_ID_DTYPE) holding the root and a
        `narrower` column (List(NARROWER_STRUCT_TYPE)) holding its descendants
        ordered by identifier, then gid. Roots without descendants have no
        row. The schema is the same whether or not any rows are returned.
    """
    narrower_dtype = pl.List(NARROWER_STRUCT_TYPE)
    empty = pl.DataFrame(schema={"gid": CONCEPT_ID_DTYPE, "narrower": narrower_dtype})
    if not root_ids:
        return empty

    map_df = self._narrower_identifier_map(list(root_ids), max_depth, filter_ontologies)
    if map_df.is_empty():
        return empty

    return (
        # `gid` breaks ties between descendants sharing an identifier, so the
        # order inside each `narrower` list is reproducible across runs.
        map_df.sort("root_id", "identifier", "gid")
        .group_by("root_id", maintain_order=True)
        .agg(pl.struct("gid", "identifier").alias("narrower"))
        .rename({"root_id": "gid"})
        .with_columns(
            pl.col("gid").cast(CONCEPT_ID_DTYPE),
            # Backends may report gids in their own integer width; normalise
            # so this path matches the schema of the empty frame above.
            pl.col("narrower").cast(narrower_dtype),
        )
    )
|
|
334
|
+
|
|
244
335
|
def _resolve_identifiers(self, identifiers: Sequence[str]) -> dict[str, int]:
|
|
245
336
|
"""
|
|
246
337
|
Map source-local identifiers to internal gids via the concepts table.
|
|
@@ -544,3 +544,46 @@ def build_narrower_concepts_sql(
|
|
|
544
544
|
WHERE concept_id != {root_literal}
|
|
545
545
|
"""
|
|
546
546
|
).strip()
|
|
547
|
+
|
|
548
|
+
|
|
549
|
+
def build_narrower_identifiers_map_sql(
|
|
550
|
+
*,
|
|
551
|
+
edges_table: str,
|
|
552
|
+
concepts_table: str,
|
|
553
|
+
roots_source_sql: str,
|
|
554
|
+
max_depth: int | None,
|
|
555
|
+
ontology_in_list: str | None,
|
|
556
|
+
) -> str:
|
|
557
|
+
"""
|
|
558
|
+
Build a recursive descendant walk seeded from many roots at once, resolving
|
|
559
|
+
descendants to their source-local identifiers in the same query.
|
|
560
|
+
|
|
561
|
+
`roots_source_sql` must be a relation yielding a `concept_id` column for the
|
|
562
|
+
starting concepts. The result has (root_id, gid, identifier) rows mapping each
|
|
563
|
+
root gid to its descendants' gids and identifiers, excluding the root itself.
|
|
564
|
+
"""
|
|
565
|
+
recurse_filters: list[str] = []
|
|
566
|
+
if max_depth is not None:
|
|
567
|
+
recurse_filters.append(f"walk.depth < {int(max_depth)}")
|
|
568
|
+
if ontology_in_list:
|
|
569
|
+
recurse_filters.append(f"e.ontology IN ({ontology_in_list})")
|
|
570
|
+
recurse_where = f"WHERE {' AND '.join(recurse_filters)}" if recurse_filters else ""
|
|
571
|
+
return dedent(
|
|
572
|
+
f"""
|
|
573
|
+
WITH RECURSIVE walk AS (
|
|
574
|
+
SELECT concept_id AS root_id, concept_id, 0 AS depth
|
|
575
|
+
FROM ({roots_source_sql}) AS roots
|
|
576
|
+
UNION ALL
|
|
577
|
+
SELECT walk.root_id, e.child_id AS concept_id, walk.depth + 1 AS depth
|
|
578
|
+
FROM walk
|
|
579
|
+
JOIN {edges_table} AS e
|
|
580
|
+
ON e.parent_id = walk.concept_id
|
|
581
|
+
{recurse_where}
|
|
582
|
+
)
|
|
583
|
+
SELECT DISTINCT walk.root_id, walk.concept_id AS gid, c.identifier
|
|
584
|
+
FROM walk
|
|
585
|
+
JOIN {concepts_table} AS c
|
|
586
|
+
ON c.concept_id = walk.concept_id
|
|
587
|
+
WHERE walk.concept_id != walk.root_id
|
|
588
|
+
"""
|
|
589
|
+
).strip()
|
|
@@ -174,9 +174,20 @@ ONTOLOGY_PREF_STT = "PF"
|
|
|
174
174
|
DEFAULT_ONTOLOGY_PREF_RANK = 3
|
|
175
175
|
DEFAULT_ONTOLOGY_SYNONYM_RANK = 1
|
|
176
176
|
|
|
177
|
+
# One expanded descendant concept: its internal dense gid paired with the
# source-local identifier it is known by.
NARROWER_STRUCT_TYPE = pl.Struct(
    [
        pl.Field("gid", CONCEPT_ID_DTYPE),
        pl.Field("identifier", pl.Utf8),
    ]
)
|
|
185
|
+
|
|
177
186
|
# Polars struct type for normalized hits. The first nine fields are produced by
|
|
178
187
|
# the lookup query (see LOOKUP_HIT_FIELDS in normalizer_utils); pref_name,
|
|
179
|
-
# description, and synonyms are filled afterwards by hit enrichment.
|
|
188
|
+
# description, and synonyms are filled afterwards by hit enrichment. `narrower`
|
|
189
|
+
# is filled only when normalize(expand_narrower=True) walks the edges hierarchy;
|
|
190
|
+
# it stays null otherwise.
|
|
180
191
|
HIT_STRUCT_TYPE = pl.Struct(
|
|
181
192
|
{
|
|
182
193
|
"gid": CONCEPT_ID_DTYPE,
|
|
@@ -191,6 +202,7 @@ HIT_STRUCT_TYPE = pl.Struct(
|
|
|
191
202
|
"pref_name": pl.Utf8,
|
|
192
203
|
"description": pl.Utf8,
|
|
193
204
|
"synonyms": pl.List(pl.Utf8),
|
|
205
|
+
"narrower": pl.List(NARROWER_STRUCT_TYPE),
|
|
194
206
|
}
|
|
195
207
|
)
|
|
196
208
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|