norm_toolkit 1.9.0__tar.gz → 1.9.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/PKG-INFO +1 -1
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/pyproject.toml +1 -1
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v2/clickhouse_backend.py +24 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v2/duckdb_backend.py +23 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v2/normalizer_base.py +89 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v2/normalizer_utils.py +43 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v2/schema.py +4 -1
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/README.md +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/__init__.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v1/__init__.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v1/build_merged.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v1/build_ontology.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v1/build_umls.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v1/constants.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v1/models.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v1/normalizer.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v1/normalizer_cache.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v1/normalizer_postgres.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v1/normalizer_utils.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v1/utils.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v2/__init__.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v2/build_merged.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v2/clickhouse_common.py +0 -0
- {norm_toolkit-1.9.0 → norm_toolkit-1.9.1}/src/norm_toolkit/v2/clickhouse_upload.py +0 -0
|
@@ -38,6 +38,7 @@ from .normalizer_utils import (
|
|
|
38
38
|
build_lookup_hit_columns,
|
|
39
39
|
build_lookup_scored_cte,
|
|
40
40
|
build_narrower_concepts_sql,
|
|
41
|
+
build_narrower_identifiers_map_sql,
|
|
41
42
|
build_ontology_filter_clauses,
|
|
42
43
|
build_query_rows,
|
|
43
44
|
build_semantic_types_sql,
|
|
@@ -542,6 +543,29 @@ class ClickHouseNormalizer(BaseNormalizer):
|
|
|
542
543
|
)
|
|
543
544
|
return [row[0] for row in _query_rows(self.client, sql)]
|
|
544
545
|
|
|
546
|
+
def _narrower_identifier_map(
    self,
    concept_ids: Sequence[int],
    max_depth: int | None,
    filter_ontologies: list[str] | None,
) -> pl.DataFrame:
    """Resolve descendant identifiers for many root concepts in one ClickHouse query.

    Args:
        concept_ids: Root concept ids to expand. Duplicates are removed while
            keeping first-seen order before the query is built.
        max_depth: Forwarded to `build_narrower_identifiers_map_sql`; None
            means no depth bound is added to the query.
        filter_ontologies: When non-empty, rendered through
            `_values_as_in_list` and forwarded as the ontology filter;
            otherwise no ontology filter is passed.

    Returns:
        A frame with columns `root_id` (UInt64) and `identifier` (Utf8).
        The frame is empty (but typed) when there are no roots or the query
        yields no rows.
    """
    # NOTE(review): the DuckDB backend types `root_id` as CONCEPT_ID_DTYPE while
    # this one hard-codes UInt64 -- confirm the two are meant to agree.
    out_schema = pl.Schema({"root_id": pl.UInt64, "identifier": pl.Utf8})

    # dict.fromkeys de-duplicates without disturbing the caller's ordering.
    seed_rows = [(root,) for root in dict.fromkeys(concept_ids)]
    if not seed_rows:
        return pl.DataFrame(schema=out_schema)

    roots_relation = _values_table_sql((("concept_id", "UInt64"),), seed_rows)
    edges_ref = self._table_ref(EDGES_TABLE)
    concepts_ref = self._table_ref(CONCEPTS_TABLE)
    if filter_ontologies:
        ontology_in_list = _values_as_in_list(filter_ontologies)
    else:
        ontology_in_list = None

    sql = build_narrower_identifiers_map_sql(
        edges_table=edges_ref,
        concepts_table=concepts_ref,
        roots_source_sql=roots_relation,
        max_depth=max_depth,
        ontology_in_list=ontology_in_list,
    )

    fetched = _query_rows(self.client, sql)
    if fetched:
        # Rows arrive positionally, so name them from the schema and then
        # coerce to the declared dtypes.
        return pl.DataFrame(fetched, schema=list(out_schema.keys()), orient="row").cast(out_schema)
    return pl.DataFrame(schema=out_schema)
|
|
568
|
+
|
|
545
569
|
def close(self) -> None:
|
|
546
570
|
self.client.close()
|
|
547
571
|
|
|
@@ -28,6 +28,7 @@ from .normalizer_utils import (
|
|
|
28
28
|
build_lookup_hit_columns,
|
|
29
29
|
build_lookup_scored_cte,
|
|
30
30
|
build_narrower_concepts_sql,
|
|
31
|
+
build_narrower_identifiers_map_sql,
|
|
31
32
|
build_ontology_filter_clauses,
|
|
32
33
|
build_query_rows,
|
|
33
34
|
build_semantic_types_sql,
|
|
@@ -262,6 +263,28 @@ class DuckDBNormalizer(BaseNormalizer):
|
|
|
262
263
|
)
|
|
263
264
|
return [row[0] for row in self.con.execute(sql).fetchall()]
|
|
264
265
|
|
|
266
|
+
def _narrower_identifier_map(
    self,
    concept_ids: Sequence[int],
    max_depth: int | None,
    filter_ontologies: list[str] | None,
) -> pl.DataFrame:
    """Resolve descendant identifiers for many root concepts in one DuckDB query.

    Args:
        concept_ids: Root concept ids to expand. Duplicates are removed while
            keeping first-seen order before they are registered as a table.
        max_depth: Forwarded to `build_narrower_identifiers_map_sql`; None
            means no depth bound is added to the query.
        filter_ontologies: When non-empty, rendered through
            `self._values_as_in_list` and forwarded as the ontology filter;
            otherwise no ontology filter is passed.

    Returns:
        A frame with columns `root_id` (CONCEPT_ID_DTYPE) and `identifier`
        (Utf8); empty but typed when `concept_ids` is empty.
    """
    out_schema = pl.Schema({"root_id": CONCEPT_ID_DTYPE, "identifier": pl.Utf8})
    if not concept_ids:
        return pl.DataFrame(schema=out_schema)

    # The roots are exposed to the query under the name "idroots" for the
    # duration of the `with` block below.
    deduped_roots = list(dict.fromkeys(concept_ids))
    roots_df = pl.DataFrame(
        {"concept_id": deduped_roots},
        schema={"concept_id": CONCEPT_ID_DTYPE},
    )

    if filter_ontologies:
        ontology_in_list = self._values_as_in_list(filter_ontologies)
    else:
        ontology_in_list = None

    sql = build_narrower_identifiers_map_sql(
        edges_table=EDGES_TABLE,
        concepts_table=CONCEPTS_TABLE,
        roots_source_sql="SELECT concept_id FROM idroots",
        max_depth=max_depth,
        ontology_in_list=ontology_in_list,
    )

    with self._registered_arrow_table("idroots", roots_df):
        fetched = self.con.execute(sql).pl()
        return fetched.cast(out_schema)
|
|
287
|
+
|
|
265
288
|
def close(self) -> None:
|
|
266
289
|
"""Close the database connection."""
|
|
267
290
|
self.con.close()
|
|
@@ -63,6 +63,19 @@ class BaseNormalizer(ABC):
|
|
|
63
63
|
) -> Iterable[int]:
|
|
64
64
|
"""Walk the hierarchy edges and return descendant concept IDs (excluding the root)."""
|
|
65
65
|
|
|
66
|
+
@abstractmethod
def _narrower_identifier_map(
    self,
    concept_ids: Sequence[int],
    max_depth: int | None,
    filter_ontologies: list[str] | None,
) -> pl.DataFrame:
    """Expand many root concepts at once and resolve identifiers inside the query.

    Backends implement this as a single multi-root walk over the hierarchy
    edges.

    Args:
        concept_ids: Root gids to start the walk from.
        max_depth: Depth bound for the walk, or None for no bound.
        filter_ontologies: Ontologies to restrict the walk to, or None.

    Returns:
        A frame with `root_id` and `identifier` columns: one row per
        (root gid, descendant source-local identifier) pair. The root itself
        is not listed among its own descendants.
    """
|
|
78
|
+
|
|
66
79
|
@abstractmethod
|
|
67
80
|
def _concept_id_scope(self, concept_ids: Sequence[int]) -> AbstractContextManager[None]:
|
|
68
81
|
"""Create a scoped idmap table/view for concept metadata lookups."""
|
|
@@ -117,6 +130,9 @@ class BaseNormalizer(ABC):
|
|
|
117
130
|
filter_tui_descendants_of: list[str] | None = None,
|
|
118
131
|
allow_partial: bool = True,
|
|
119
132
|
enrich_hits: bool = True,
|
|
133
|
+
expand_narrower: bool = False,
|
|
134
|
+
expand_max_depth: int | None = None,
|
|
135
|
+
expand_filter_ontologies: list[str] | None = None,
|
|
120
136
|
) -> pl.DataFrame:
|
|
121
137
|
"""
|
|
122
138
|
Normalize input strings to ranked concepts.
|
|
@@ -138,6 +154,15 @@ class BaseNormalizer(ABC):
|
|
|
138
154
|
enrich_hits: Populate pref_name, description, and synonyms for each hit.
|
|
139
155
|
Disable this for faster bulk ID/rank normalization; metadata fields
|
|
140
156
|
remain null in the returned hit structs.
|
|
157
|
+
expand_narrower: For each hit, walk the edges hierarchy and populate
|
|
158
|
+
the hit's `narrower` field with the source-local identifiers of
|
|
159
|
+
its descendant concepts (across all ontologies). Left null when
|
|
160
|
+
disabled or when the database has no edges.
|
|
161
|
+
expand_max_depth: Maximum hierarchy depth to expand (1 = direct
|
|
162
|
+
children only, None = all descendants). Only used when
|
|
163
|
+
expand_narrower is True.
|
|
164
|
+
expand_filter_ontologies: Only follow edges from these ontologies when
|
|
165
|
+
expanding. None follows every ontology's edges.
|
|
141
166
|
|
|
142
167
|
Returns:
|
|
143
168
|
DataFrame with columns: input_string, hits (list of match structs),
|
|
@@ -168,6 +193,9 @@ class BaseNormalizer(ABC):
|
|
|
168
193
|
if enrich_hits:
|
|
169
194
|
result = self._enrich_hits_with_concept_info(result)
|
|
170
195
|
|
|
196
|
+
if expand_narrower and self._has_edges:
|
|
197
|
+
result = self._expand_hits_with_narrower(result, expand_max_depth, expand_filter_ontologies)
|
|
198
|
+
|
|
171
199
|
result = result.with_columns(pl.Series("input_string", strings_list))
|
|
172
200
|
if synonyms is not None:
|
|
173
201
|
result = result.with_columns(pl.Series("synonyms", syn_list, dtype=pl.List(pl.Utf8)))
|
|
@@ -241,6 +269,67 @@ class BaseNormalizer(ABC):
|
|
|
241
269
|
|
|
242
270
|
return info_df.cast(schema)
|
|
243
271
|
|
|
272
|
+
def _expand_hits_with_narrower(
    self,
    result: pl.DataFrame,
    max_depth: int | None,
    filter_ontologies: list[str] | None,
) -> pl.DataFrame:
    """Fill the `narrower` field of every hit with its descendants' identifiers.

    Args:
        result: Frame holding an `input_string` column and a `hits` list-of-struct
            column.
        max_depth: Passed through to `_narrower_identifiers_frame`.
        filter_ontologies: Passed through to `_narrower_identifiers_frame`.

    Returns:
        `result` unchanged when it is empty or contains no hit with a non-null
        `gid`. Otherwise a frame with `input_string` and a rebuilt `hits`
        column in which each hit's `narrower` is a list of identifiers
        (an empty list when the hit has no descendants).
    """
    if result.is_empty():
        return result

    # Tag every input row so hits can be regrouped after being flattened.
    indexed = result.with_row_index("__row_idx")

    flat_hits = indexed.explode("hits").unnest("hits")
    flat_hits = flat_hits.filter(pl.col("gid").is_not_null())
    # Record each hit's position within its row; the join below is followed by
    # an explicit sort on this so the original hit ranking is restored.
    flat_hits = flat_hits.with_columns(
        pl.int_range(pl.len()).over("__row_idx").alias("__hit_idx")
    )
    if flat_hits.is_empty():
        return result

    distinct_gids = flat_hits.get_column("gid").unique(maintain_order=True).to_list()
    descendants = self._narrower_identifiers_frame(distinct_gids, max_depth, filter_ontologies)

    # Swap the placeholder `narrower` column for the freshly computed one.
    with_narrower = flat_hits.drop("narrower").join(descendants, on="gid", how="left")
    with_narrower = with_narrower.with_columns(
        pl.col("narrower").fill_null([]).cast(pl.List(pl.Utf8))
    )
    with_narrower = with_narrower.sort(["__row_idx", "__hit_idx"])

    regrouped = with_narrower.group_by("__row_idx", maintain_order=True).agg(
        pl.struct(_HIT_FIELD_NAMES).alias("hits")
    )

    # NOTE(review): only `input_string` and `hits` survive this step; any other
    # column present on `result` is dropped -- confirm callers never pass more.
    rebuilt = indexed.select("__row_idx", "input_string").join(
        regrouped, on="__row_idx", how="left"
    )
    rebuilt = rebuilt.sort("__row_idx").drop("__row_idx")
    # Rows that had no usable hits come back null from the left join.
    return rebuilt.with_columns(
        pl.col("hits").fill_null([]).cast(pl.List(HIT_STRUCT_TYPE))
    )
|
|
310
|
+
|
|
311
|
+
def _narrower_identifiers_frame(
    self,
    root_ids: Sequence[int],
    max_depth: int | None,
    filter_ontologies: list[str] | None,
) -> pl.DataFrame:
    """Build a (gid, narrower) frame: one row per root with its descendant identifiers.

    Args:
        root_ids: Root gids to expand.
        max_depth: Passed through to `_narrower_identifier_map`.
        filter_ontologies: Passed through to `_narrower_identifier_map`.

    Returns:
        A frame with `gid` (CONCEPT_ID_DTYPE) and `narrower` (list of Utf8,
        sorted within each root). Empty but typed when there are no roots or
        the backend returns no (root, identifier) pairs.
    """
    pairs = None
    if root_ids:
        pairs = self._narrower_identifier_map(list(root_ids), max_depth, filter_ontologies)

    if pairs is None or pairs.is_empty():
        return pl.DataFrame(schema={"gid": CONCEPT_ID_DTYPE, "narrower": pl.List(pl.Utf8)})

    # Collapse the long (root_id, identifier) pairs into one sorted list per root.
    per_root = pairs.group_by("root_id").agg(pl.col("identifier").sort().alias("narrower"))
    per_root = per_root.rename({"root_id": "gid"})
    # Backends may type the root column differently; align it with the hit gid dtype.
    return per_root.with_columns(pl.col("gid").cast(CONCEPT_ID_DTYPE))
|
|
332
|
+
|
|
244
333
|
def _resolve_identifiers(self, identifiers: Sequence[str]) -> dict[str, int]:
|
|
245
334
|
"""
|
|
246
335
|
Map source-local identifiers to internal gids via the concepts table.
|
|
@@ -544,3 +544,46 @@ def build_narrower_concepts_sql(
|
|
|
544
544
|
WHERE concept_id != {root_literal}
|
|
545
545
|
"""
|
|
546
546
|
).strip()
|
|
547
|
+
|
|
548
|
+
|
|
549
|
+
def build_narrower_identifiers_map_sql(
    *,
    edges_table: str,
    concepts_table: str,
    roots_source_sql: str,
    max_depth: int | None,
    ontology_in_list: str | None,
) -> str:
    """Render a multi-root recursive descendant walk that also resolves identifiers.

    Args:
        edges_table: Table reference providing `parent_id`, `child_id` and
            `ontology` columns.
        concepts_table: Table reference providing `concept_id` and
            `identifier` columns.
        roots_source_sql: A relation (wrapped in parentheses by this function)
            that yields a `concept_id` column with the starting concepts.
        max_depth: When not None, recursion only continues from rows whose
            depth is below this value (roots are depth 0).
        ontology_in_list: Pre-rendered contents of an SQL `IN (...)` list; when
            truthy, only edges whose `ontology` is in the list are followed.

    Returns:
        SQL text selecting distinct `(root_id, identifier)` rows, one per
        root/descendant pair, with the root itself excluded.
    """
    # NOTE(review): the walk uses UNION ALL with no visited-set, so with
    # max_depth=None it presumably relies on the edges being acyclic -- confirm.
    depth_clause = None if max_depth is None else f"walk.depth < {int(max_depth)}"
    ontology_clause = f"e.ontology IN ({ontology_in_list})" if ontology_in_list else None
    conditions = [clause for clause in (depth_clause, ontology_clause) if clause is not None]
    recurse_where = ("WHERE " + " AND ".join(conditions)) if conditions else ""

    query = f"""
        WITH RECURSIVE walk AS (
            SELECT concept_id AS root_id, concept_id, 0 AS depth
            FROM ({roots_source_sql}) AS roots
            UNION ALL
            SELECT walk.root_id, e.child_id AS concept_id, walk.depth + 1 AS depth
            FROM walk
            JOIN {edges_table} AS e
                ON e.parent_id = walk.concept_id
            {recurse_where}
        )
        SELECT DISTINCT walk.root_id, c.identifier
        FROM walk
        JOIN {concepts_table} AS c
            ON c.concept_id = walk.concept_id
        WHERE walk.concept_id != walk.root_id
        """
    return dedent(query).strip()
|
|
@@ -176,7 +176,9 @@ DEFAULT_ONTOLOGY_SYNONYM_RANK = 1
|
|
|
176
176
|
|
|
177
177
|
# Polars struct type for normalized hits. The first nine fields are produced by
|
|
178
178
|
# the lookup query (see LOOKUP_HIT_FIELDS in normalizer_utils); pref_name,
|
|
179
|
-
# description, and synonyms are filled afterwards by hit enrichment.
|
|
179
|
+
# description, and synonyms are filled afterwards by hit enrichment. `narrower`
|
|
180
|
+
# is filled only when normalize(expand_narrower=True) walks the edges hierarchy;
|
|
181
|
+
# it stays null otherwise.
|
|
180
182
|
HIT_STRUCT_TYPE = pl.Struct(
|
|
181
183
|
{
|
|
182
184
|
"gid": CONCEPT_ID_DTYPE,
|
|
@@ -191,6 +193,7 @@ HIT_STRUCT_TYPE = pl.Struct(
|
|
|
191
193
|
"pref_name": pl.Utf8,
|
|
192
194
|
"description": pl.Utf8,
|
|
193
195
|
"synonyms": pl.List(pl.Utf8),
|
|
196
|
+
"narrower": pl.List(pl.Utf8),
|
|
194
197
|
}
|
|
195
198
|
)
|
|
196
199
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|