PyPI - norm_toolkit - Versions diffs - 1.0.2__tar.gz → 1.1.0__tar.gz - Mend

norm_toolkit 1.0.2 (tar.gz) → 1.1.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

{norm_toolkit-1.0.2 → norm_toolkit-1.1.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: norm_toolkit
-Version: 1.0.2
+Version: 1.1.0
 Summary: Toolkit to normalize text to UMLS / ontologies
 Author: Haydn Jones
 Author-email: Haydn Jones <haydnjonest@gmail.com>
@@ -10,6 +10,7 @@ Requires-Dist: lvg-norm>=1.1.0
 Requires-Dist: polars[rt64]>=1.36.1
 Requires-Dist: pyarrow>=20.0.0
 Requires-Dist: pydantic>=2.12.5
+Requires-Dist: sqlalchemy>=2.0.0
 Requires-Dist: tqdm>=4.67.1
 Requires-Python: >=3.12
 Description-Content-Type: text/markdown

{norm_toolkit-1.0.2 → norm_toolkit-1.1.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "norm_toolkit"
-version = "1.0.2"
+version = "1.1.0"
 description = "Toolkit to normalize text to UMLS / ontologies"
 readme = "README.md"
 authors = [{ name = "Haydn Jones", email = "haydnjonest@gmail.com" }]
@@ -12,6 +12,7 @@ dependencies = [
     "polars[rt64]>=1.36.1",
     "pyarrow>=20.0.0",
     "pydantic>=2.12.5",
+    "sqlalchemy>=2.0.0",
     "tqdm>=4.67.1",
 ]

{norm_toolkit-1.0.2 → norm_toolkit-1.1.0}/src/norm_toolkit/normalizer_postgres.py RENAMED Viewed

@@ -7,18 +7,17 @@ built by build_umls_duckdb, build_ontology_duckdb, or build_merged_duckdb.
 from __future__ import annotations
-import asyncio
 import json
 from collections.abc import Mapping, Sequence
 from typing import Any
-import asyncpg
 import polars as pl
 from lvg_norm import lvg_normalize
+from sqlalchemy import text
+from sqlalchemy.ext.asyncio import AsyncEngine
 from norm_toolkit.constants import (
     ATOMS_TABLE,
-    CONCEPTS_TABLE,
     DEFAULT_PREFER_TTYS,
     DEFS_TABLE,
     EDGES_TABLE,
@@ -37,7 +36,7 @@ from norm_toolkit.models import ConceptInfo, SemanticType
 class PostgresNormalizer:
     """
-    Async normalizer using PostgreSQL via asyncpg.
+    Async normalizer using PostgreSQL via SQLAlchemy.
     Optimized for small batch processing (1-5 strings at a time).
     Uses VALUES clauses instead of temp tables for efficiency with small batches.
@@ -45,15 +44,15 @@ class PostgresNormalizer:
     def __init__(
         self,
-        pool: asyncpg.Pool,
+        engine: AsyncEngine,
         schema: str = "public",
         owned_resource: Any | None = None,
     ) -> None:
         """
-        Initialize the normalizer with an external connection pool.
+        Initialize the normalizer with an SQLAlchemy AsyncEngine.
         Args:
-            pool: asyncpg connection pool (caller manages lifecycle)
+            engine: SQLAlchemy AsyncEngine (caller manages lifecycle)
             schema: PostgreSQL schema where tables are located (default: "public")
             owned_resource: Optional resource with async close() method to clean up
                 when this normalizer is closed (e.g., AlloyDB AsyncConnector)
@@ -62,9 +61,8 @@ class PostgresNormalizer:
             After creating the normalizer, call `await normalizer.initialize()`
             to detect database capabilities before using other methods.
         """
-        self._pool = pool
+        self._engine = engine
         self._schema = schema
-        self._loop: asyncio.AbstractEventLoop | None = None
         self._owned_resource = owned_resource
         self._has_types = False
         self._has_defs = False
@@ -77,48 +75,14 @@ class PostgresNormalizer:
         self._ns_table = f"{prefix}{NS_TABLE}"
         self._nw_table = f"{prefix}{NW_TABLE}"
         self._atoms_table = f"{prefix}{ATOMS_TABLE}"
-        self._concepts_table = f"{prefix}{CONCEPTS_TABLE}"
         self._types_table = f"{prefix}{TYPES_TABLE}"
         self._defs_table = f"{prefix}{DEFS_TABLE}"
         self._edges_table = f"{prefix}{EDGES_TABLE}"
-    @classmethod
-    def create_sync(cls, dsn: str, schema: str = "public", min_size: int = 1, max_size: int = 10) -> PostgresNormalizer:
-        """
-        Create a normalizer synchronously with its own event loop.
-        Use this factory method for sync-only usage. The normalizer will manage
-        its own event loop and pool, allowing you to call normalize_sync().
-        Args:
-            dsn: PostgreSQL connection string (e.g., "postgresql://user:pass@host:5432/db")
-            schema: PostgreSQL schema where tables are located (default: "public")
-            min_size: Minimum pool connections
-            max_size: Maximum pool connections
-        Example:
-            >>> normalizer = PostgresNormalizer.create_sync("postgresql://...")
-            >>> result = normalizer.normalize_sync(["diabetes"])
-            >>> normalizer.close_sync()
-        """
-        loop = asyncio.new_event_loop()
-        async def _create():
-            pool = await asyncpg.create_pool(dsn, min_size=min_size, max_size=max_size)
-            return pool
-        pool = loop.run_until_complete(_create())
-        instance = cls(pool, schema=schema)
-        instance._loop = loop
-        loop.run_until_complete(instance.initialize())
-        return instance
-    async def initialize(self) -> None:
-        """
-        Detect database capabilities.
-        Must be called after __init__ before using normalize/concept_info methods.
-        """
+    async def _ensure_initialized(self) -> None:
+        """Lazily initialize on first use."""
+        if self._initialized:
+            return
         self._has_types = await self._table_has_rows(self._types_table)
         self._has_defs = await self._table_has_rows(self._defs_table)
         self._has_edges = await self._table_has_rows(self._edges_table)
@@ -128,18 +92,18 @@ class PostgresNormalizer:
     async def _table_has_rows(self, table: str) -> bool:
         """Check if a table exists and has rows."""
         try:
-            async with self._pool.acquire() as con:
-                result = await con.fetchval(f"SELECT 1 FROM {table} LIMIT 1")
-                return result is not None
+            async with self._engine.connect() as conn:
+                result = await conn.execute(text(f"SELECT 1 FROM {table} LIMIT 1"))
+                return result.scalar() is not None
         except Exception:
             return False
     async def _column_has_values(self, table: str, column: str) -> bool:
         """Check if a column has any non-null values."""
         try:
-            async with self._pool.acquire() as con:
-                result = await con.fetchval(f"SELECT 1 FROM {table} WHERE {column} IS NOT NULL LIMIT 1")
-                return result is not None
+            async with self._engine.connect() as conn:
+                result = await conn.execute(text(f"SELECT 1 FROM {table} WHERE {column} IS NOT NULL LIMIT 1"))
+                return result.scalar() is not None
         except Exception:
             return False
@@ -172,6 +136,8 @@ class PostgresNormalizer:
         Returns:
             DataFrame with columns: input_string, hits (list of match structs)
         """
+        await self._ensure_initialized()
         if prefer_ttys is None:
             prefer_ttys = DEFAULT_PREFER_TTYS
@@ -223,15 +189,18 @@ class PostgresNormalizer:
                 {"hits": pl.List(HIT_STRUCT_TYPE)}
             )
-        # Build parameters and VALUES clauses
-        params: list[str] = []
+        # Build parameters and VALUES clauses using named parameters
+        params: dict[str, Any] = {}
+        param_idx = 0
         # qmap VALUES clause
         qmap_placeholders = []
         for q, nstr in qmap_rows:
-            idx = len(params)
-            params.extend([q, nstr])
-            qmap_placeholders.append(f"(${idx + 1}, ${idx + 2})")
+            q_key, nstr_key = f"p{param_idx}", f"p{param_idx + 1}"
+            params[q_key] = q
+            params[nstr_key] = nstr
+            qmap_placeholders.append(f"(:{q_key}, :{nstr_key})")
+            param_idx += 2
         qmap_values = ", ".join(qmap_placeholders)
         # qwords VALUES clause (for partial path)
@@ -240,36 +209,58 @@ class PostgresNormalizer:
             qwords_rows = [(q, n, w) for q, n in qmap_rows for w in dict.fromkeys(n.split()) if w]
             qwords_placeholders = []
             for q, nstr, nwd in qwords_rows:
-                idx = len(params)
-                params.extend([q, nstr, nwd])
-                qwords_placeholders.append(f"(${idx + 1}, ${idx + 2}, ${idx + 3})")
+                q_key, nstr_key, nwd_key = f"p{param_idx}", f"p{param_idx + 1}", f"p{param_idx + 2}"
+                params[q_key] = q
+                params[nstr_key] = nstr
+                params[nwd_key] = nwd
+                qwords_placeholders.append(f"(:{q_key}, :{nstr_key}, :{nwd_key})")
+                param_idx += 3
             qwords_values = ", ".join(qwords_placeholders)
         # allq VALUES clause (preserve order)
         allq_placeholders = []
         for q in all_queries:
-            idx = len(params)
-            params.append(q)
-            allq_placeholders.append(f"(${idx + 1})")
+            q_key = f"p{param_idx}"
+            params[q_key] = q
+            allq_placeholders.append(f"(:{q_key})")
+            param_idx += 1
         allq_values = ", ".join(allq_placeholders)
-        # Build preference clauses
+        # Build preference clauses (parameterized to prevent SQL injection)
         tty_join = ""
         tty_bump_expr = "0"
         if prefer_ttys:
-            tty_vals = ", ".join(f"('{t}')" for t in prefer_ttys)
+            tty_placeholders = []
+            for tty in prefer_ttys:
+                key = f"p{param_idx}"
+                params[key] = tty
+                tty_placeholders.append(f"(:{key})")
+                param_idx += 1
+            tty_vals = ", ".join(tty_placeholders)
             tty_join = f"LEFT JOIN (VALUES {tty_vals}) AS pt(tty) ON a.name_type = pt.tty"
             tty_bump_expr = "CASE WHEN pt.tty IS NULL THEN 0 ELSE 1 END"
-        # Source filtering
+        # Source filtering (parameterized to prevent SQL injection)
         source_filter_exprs = []
         nw_filter_clauses = []
         if filter_sources:
-            filt_vals = ", ".join(f"'{src}'" for src in filter_sources)
+            filt_placeholders = []
+            for src in filter_sources:
+                key = f"p{param_idx}"
+                params[key] = src
+                filt_placeholders.append(f":{key}")
+                param_idx += 1
+            filt_vals = ", ".join(filt_placeholders)
             source_filter_exprs.append(f"a.source IN ({filt_vals})")
             nw_filter_clauses.append(f"nw.source IN ({filt_vals})")
         if exclude_sources:
-            excl_vals = ", ".join(f"'{src}'" for src in exclude_sources)
+            excl_placeholders = []
+            for src in exclude_sources:
+                key = f"p{param_idx}"
+                params[key] = src
+                excl_placeholders.append(f":{key}")
+                param_idx += 1
+            excl_vals = ", ".join(excl_placeholders)
             source_filter_exprs.append(f"a.source NOT IN ({excl_vals})")
             nw_filter_clauses.append(f"nw.source NOT IN ({excl_vals})")
         nw_filter_clause = (" AND " + " AND ".join(nw_filter_clauses)) if nw_filter_clauses else ""
@@ -447,15 +438,22 @@ FROM allq aq
 LEFT JOIN agg ON agg.Q = aq.Q;
 """
-        async with self._pool.acquire() as con:
-            rows = await con.fetch(sql, *params)
+        async with self._engine.connect() as conn:
+            result = await conn.execute(text(sql), params)
+            rows = result.mappings().all()
-        # Parse JSON results into Polars DataFrame
+        # Parse results into Polars DataFrame
+        # Note: asyncpg auto-deserializes JSON, so hits may already be a list
         data = []
         for row in rows:
             input_string = row["input_string"]
-            hits_json = row["hits"]
-            hits = json.loads(hits_json) if hits_json else []
+            hits_raw = row["hits"]
+            if hits_raw is None:
+                hits = []
+            elif isinstance(hits_raw, list):
+                hits = hits_raw  # Already deserialized by asyncpg
+            else:
+                hits = json.loads(hits_raw)  # String, needs parsing
             data.append({"input_string": input_string, "hits": hits})
         return pl.DataFrame(data).cast({"hits": pl.List(HIT_STRUCT_TYPE)})
@@ -477,6 +475,8 @@ LEFT JOIN agg ON agg.Q = aq.Q;
         Returns:
             Dict mapping concept_id to ConceptInfo
         """
+        await self._ensure_initialized()
         if not concept_ids:
             return {}
@@ -500,20 +500,28 @@ LEFT JOIN agg ON agg.Q = aq.Q;
                 semantic_types=[],
             )
-        # Build idmap VALUES clause
-        params: list[str] = []
+        # Build idmap VALUES clause using named parameters
+        params: dict[str, Any] = {}
+        param_idx = 0
         idmap_placeholders = []
         for cid in id_list:
-            idx = len(params)
-            params.append(cid)
-            idmap_placeholders.append(f"(${idx + 1})")
+            key = f"p{param_idx}"
+            params[key] = cid
+            idmap_placeholders.append(f"(:{key})")
+            param_idx += 1
         idmap_values = ", ".join(idmap_placeholders)
         # Build preference clauses
         tty_join = ""
         tty_bump = "0"
         if prefer_ttys:
-            tty_vals = ", ".join(f"('{t}')" for t in prefer_ttys)
+            tty_placeholders = []
+            for tty in prefer_ttys:
+                key = f"p{param_idx}"
+                params[key] = tty
+                tty_placeholders.append(f"(:{key})")
+                param_idx += 1
+            tty_vals = ", ".join(tty_placeholders)
             tty_join = f"LEFT JOIN (VALUES {tty_vals}) AS pt(tty) ON a.name_type = pt.tty"
             tty_bump = "CASE WHEN pt.tty IS NULL THEN 0 ELSE 1 END"
@@ -591,8 +599,9 @@ LEFT JOIN syn_agg  sa ON sa.concept_id = c.concept_id
 ORDER BY c.concept_id;
 """
-        async with self._pool.acquire() as con:
-            rows = await con.fetch(sql, *params)
+        async with self._engine.connect() as conn:
+            result = await conn.execute(text(sql), params)
+            rows = result.mappings().all()
         for row in rows:
             cid = row["concept_id"]
@@ -625,18 +634,26 @@ ORDER BY c.concept_id;
         prefer_def_sources: list[str] | None,
     ) -> None:
         """Populate definitions for concepts."""
-        params: list[str] = []
+        params: dict[str, Any] = {}
+        param_idx = 0
         idmap_placeholders = []
         for cid in id_list:
-            idx = len(params)
-            params.append(cid)
-            idmap_placeholders.append(f"(${idx + 1})")
+            key = f"p{param_idx}"
+            params[key] = cid
+            idmap_placeholders.append(f"(:{key})")
+            param_idx += 1
         idmap_values = ", ".join(idmap_placeholders)
         def_pref_join = ""
         def_pref_bump = "0"
         if prefer_def_sources:
-            def_vals = ", ".join(f"('{src}')" for src in prefer_def_sources)
+            def_placeholders = []
+            for src in prefer_def_sources:
+                key = f"p{param_idx}"
+                params[key] = src
+                def_placeholders.append(f"(:{key})")
+                param_idx += 1
+            def_vals = ", ".join(def_placeholders)
             def_pref_join = f"LEFT JOIN (VALUES {def_vals}) AS pds(sab) ON d.source = pds.sab"
             def_pref_bump = "CASE WHEN pds.sab IS NULL THEN 0 ELSE 1 END"
@@ -665,8 +682,9 @@ FROM def_best
 WHERE drn = 1;
 """
-        async with self._pool.acquire() as con:
-            rows = await con.fetch(sql, *params)
+        async with self._engine.connect() as conn:
+            result = await conn.execute(text(sql), params)
+            rows = result.mappings().all()
         for row in rows:
             cid = row["concept_id"]
@@ -680,12 +698,12 @@ WHERE drn = 1;
         id_list: list[str],
     ) -> None:
         """Populate semantic types for concepts."""
-        params: list[str] = []
+        params: dict[str, Any] = {}
         idmap_placeholders = []
-        for cid in id_list:
-            idx = len(params)
-            params.append(cid)
-            idmap_placeholders.append(f"(${idx + 1})")
+        for i, cid in enumerate(id_list):
+            key = f"p{i}"
+            params[key] = cid
+            idmap_placeholders.append(f"(:{key})")
         idmap_values = ", ".join(idmap_placeholders)
         sql = f"""
@@ -696,8 +714,9 @@ JOIN idmap c ON c.concept_id = t.concept_id
 ORDER BY t.concept_id, t.type_tree, t.type_id;
 """
-        async with self._pool.acquire() as con:
-            rows = await con.fetch(sql, *params)
+        async with self._engine.connect() as conn:
+            result = await conn.execute(text(sql), params)
+            rows = result.mappings().all()
         for row in rows:
             cid = row["concept_id"]
@@ -713,17 +732,19 @@ ORDER BY t.concept_id, t.type_tree, t.type_id;
         Returns dict mapping concept_id to list of {"tui": ..., "sty": ...}
         """
+        await self._ensure_initialized()
         if not self._has_types or not concept_ids:
             return {cid: [] for cid in concept_ids}
         id_list = list(dict.fromkeys(concept_ids))
-        params: list[str] = []
+        params: dict[str, Any] = {}
         idmap_placeholders = []
-        for cid in id_list:
-            idx = len(params)
-            params.append(cid)
-            idmap_placeholders.append(f"(${idx + 1})")
+        for i, cid in enumerate(id_list):
+            key = f"p{i}"
+            params[key] = cid
+            idmap_placeholders.append(f"(:{key})")
         idmap_values = ", ".join(idmap_placeholders)
         sql = f"""
@@ -734,8 +755,9 @@ JOIN idmap c ON c.concept_id = t.concept_id
 ORDER BY t.concept_id, t.type_tree, t.type_id;
 """
-        async with self._pool.acquire() as con:
-            rows = await con.fetch(sql, *params)
+        async with self._engine.connect() as conn:
+            result = await conn.execute(text(sql), params)
+            rows = result.mappings().all()
         res: dict[str, list[dict[str, str]]] = {cid: [] for cid in id_list}
         for row in rows:
@@ -762,90 +784,55 @@ ORDER BY t.concept_id, t.type_tree, t.type_id;
         Returns:
             List of descendant concept IDs (excludes the starting concept)
         """
+        await self._ensure_initialized()
         if not self._has_edges:
             return []
+        params: dict[str, Any] = {"concept_id": concept_id, "max_depth": max_depth}
         # Build source filter clause
         source_filter = ""
         if filter_sources:
-            sources_sql = ", ".join(f"'{src}'" for src in filter_sources)
+            src_placeholders = []
+            for i, src in enumerate(filter_sources):
+                key = f"src{i}"
+                params[key] = src
+                src_placeholders.append(f":{key}")
+            sources_sql = ", ".join(src_placeholders)
             source_filter = f" AND e.source IN ({sources_sql})"
-        # PostgreSQL recursive CTE
+        # PostgreSQL recursive CTE with named parameters
+        # Use CAST() instead of :: to avoid conflicts with SQLAlchemy named params
         query = f"""
 WITH RECURSIVE walk(concept_id, depth) AS (
-    SELECT $1::VARCHAR, 0
+    SELECT CAST(:concept_id AS VARCHAR), 0
     UNION ALL
     SELECT e.child_id, w.depth + 1
     FROM walk w
     JOIN {self._edges_table} e ON e.parent_id = w.concept_id
-    WHERE ($2::INTEGER IS NULL OR w.depth < $2){source_filter}
+    WHERE (CAST(:max_depth AS INTEGER) IS NULL OR w.depth < :max_depth){source_filter}
 )
 SELECT DISTINCT concept_id
 FROM walk
-WHERE concept_id != $1
+WHERE concept_id != :concept_id
 """
-        async with self._pool.acquire() as con:
-            rows = await con.fetch(query, concept_id, max_depth)
+        async with self._engine.connect() as conn:
+            result = await conn.execute(text(query), params)
+            rows = result.mappings().all()
         return [r["concept_id"] for r in rows]
-    def normalize_sync(
-        self,
-        strings: Sequence[str],
-        top_k: int = 25,
-        prefer_ttys: list[str] | None = None,
-        filter_sources: list[str] | None = None,
-        exclude_sources: list[str] | None = None,
-        allow_partial: bool = True,
-        min_coverage: float = 0.6,
-        min_word_hits: int | None = None,
-        coverage_weight: int = 25,
-    ) -> pl.DataFrame:
-        """
-        Synchronous wrapper around normalize().
-        Requires the normalizer to be created with create_sync() factory method.
-        """
-        if self._loop is None:
-            raise RuntimeError("normalize_sync() requires normalizer created with create_sync()")
-        return self._loop.run_until_complete(
-            self.normalize(
-                strings=strings,
-                top_k=top_k,
-                prefer_ttys=prefer_ttys,
-                filter_sources=filter_sources,
-                exclude_sources=exclude_sources,
-                allow_partial=allow_partial,
-                min_coverage=min_coverage,
-                min_word_hits=min_word_hits,
-                coverage_weight=coverage_weight,
-            )
-        )
     async def close(self) -> None:
         """
-        Close the connection pool and any owned resources.
+        Close the engine and any owned resources.
-        Note: Only call this if you want to close the pool. If the pool
+        Note: Only call this if you want to close the engine. If the engine
         is managed externally, the caller should close it instead.
         """
-        await self._pool.close()
+        await self._engine.dispose()
         if self._owned_resource is not None:
             await self._owned_resource.close()
-    def close_sync(self) -> None:
-        """
-        Synchronously close the connection pool and event loop.
-        Use this when the normalizer was created with create_sync().
-        """
-        if self._loop is None:
-            raise RuntimeError("close_sync() requires normalizer created with create_sync()")
-        self._loop.run_until_complete(self._pool.close())
-        self._loop.close()

{norm_toolkit-1.0.2 → norm_toolkit-1.1.0}/README.md RENAMED Viewed

File without changes

{norm_toolkit-1.0.2 → norm_toolkit-1.1.0}/src/norm_toolkit/__init__.py RENAMED Viewed

File without changes

{norm_toolkit-1.0.2 → norm_toolkit-1.1.0}/src/norm_toolkit/build_merged.py RENAMED Viewed

File without changes

{norm_toolkit-1.0.2 → norm_toolkit-1.1.0}/src/norm_toolkit/build_ontology.py RENAMED Viewed

File without changes

{norm_toolkit-1.0.2 → norm_toolkit-1.1.0}/src/norm_toolkit/build_umls.py RENAMED Viewed

File without changes

{norm_toolkit-1.0.2 → norm_toolkit-1.1.0}/src/norm_toolkit/constants.py RENAMED Viewed

File without changes

{norm_toolkit-1.0.2 → norm_toolkit-1.1.0}/src/norm_toolkit/models.py RENAMED Viewed

File without changes

{norm_toolkit-1.0.2 → norm_toolkit-1.1.0}/src/norm_toolkit/normalizer.py RENAMED Viewed

File without changes

{norm_toolkit-1.0.2 → norm_toolkit-1.1.0}/src/norm_toolkit/utils.py RENAMED Viewed

File without changes

norm_toolkit 1.0.2__tar.gz → 1.1.0__tar.gz

norm_toolkit 1.0.2 (tar.gz) → 1.1.0 (tar.gz)