PyPI - orm-loader - Versions diffs - 0.3.0__tar.gz - Mend

orm-loader 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

orm_loader-0.3.0/PKG-INFO +162 -0
orm_loader-0.3.0/README.md +149 -0
orm_loader-0.3.0/pyproject.toml +37 -0
orm_loader-0.3.0/src/orm_loader/__init__.py +0 -0
orm_loader-0.3.0/src/orm_loader/helpers/__init__.py +23 -0
orm_loader-0.3.0/src/orm_loader/helpers/bootstrap.py +13 -0
orm_loader-0.3.0/src/orm_loader/helpers/bulk.py +90 -0
orm_loader-0.3.0/src/orm_loader/helpers/discovery.py +11 -0
orm_loader-0.3.0/src/orm_loader/helpers/errors.py +6 -0
orm_loader-0.3.0/src/orm_loader/helpers/logging.py +90 -0
orm_loader-0.3.0/src/orm_loader/helpers/metadata.py +15 -0
orm_loader-0.3.0/src/orm_loader/helpers/sqlite.py +32 -0
orm_loader-0.3.0/src/orm_loader/loaders/__init__.py +14 -0
orm_loader-0.3.0/src/orm_loader/loaders/data_classes.py +147 -0
orm_loader-0.3.0/src/orm_loader/loaders/loader_interface.py +274 -0
orm_loader-0.3.0/src/orm_loader/loaders/loading_helpers.py +136 -0
orm_loader-0.3.0/src/orm_loader/py.typed +0 -0
orm_loader-0.3.0/src/orm_loader/registry/__init__.py +67 -0
orm_loader-0.3.0/src/orm_loader/registry/registry.py +167 -0
orm_loader-0.3.0/src/orm_loader/registry/validation.py +230 -0
orm_loader-0.3.0/src/orm_loader/registry/validation_presets.py +14 -0
orm_loader-0.3.0/src/orm_loader/registry/validation_report.py +88 -0
orm_loader-0.3.0/src/orm_loader/registry/validation_runner.py +36 -0
orm_loader-0.3.0/src/orm_loader/tables/__init__.py +25 -0
orm_loader-0.3.0/src/orm_loader/tables/base/__init__.py +15 -0
orm_loader-0.3.0/src/orm_loader/tables/base/allocators.py +22 -0
orm_loader-0.3.0/src/orm_loader/tables/base/loadable_table.py +420 -0
orm_loader-0.3.0/src/orm_loader/tables/base/orm_table.py +76 -0
orm_loader-0.3.0/src/orm_loader/tables/base/serialisable_table.py +48 -0
orm_loader-0.3.0/src/orm_loader/tables/base/typing.py +129 -0
orm_loader-0.3.0/src/orm_loader/tables/data/__init__.py +7 -0
orm_loader-0.3.0/src/orm_loader/tables/data/converters.py +106 -0
orm_loader-0.3.0/src/orm_loader/tables/data/data_type_management.py +110 -0

orm_loader-0.3.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,162 @@
+Metadata-Version: 2.3
+Name: orm-loader
+Version: 0.3.0
+Summary: Generic base classes to handle ORM functionality for multiple downstream datamodels
+Author: gkennos
+Author-email: gkennos <georgina.kennedy@unsw.edu.au>
+Requires-Dist: chardet>=5.2.0
+Requires-Dist: pandas>=2.3.3
+Requires-Dist: pyarrow>=23.0.0
+Requires-Dist: sqlalchemy>=2.0.45
+Requires-Python: >=3.12
+Description-Content-Type: text/markdown
+## orm-loader
+A lightweight, reusable foundation for building and validating SQLAlchemy-based clinical (and non-clinical) data models.
+This library provides general-purpose ORM infrastructure that sits below any specific data model (OMOP, PCORnet, custom CDMs, etc.), focusing on:
+* declarative base configuration
+* bulk ingestion patterns
+* file-based validation & loading
+* table introspection
+* model-agnostic validation scaffolding
+* safe, database-portable operational helpers
+It intentionally contains no domain logic and no assumptions about a specific schema.
+### What this library provides:
+This library provides a small set of composable building blocks for defining, loading, inspecting, and validating SQLAlchemy-based data models.
+All components are model-agnostic and can be selectively combined in downstream libraries.
+1. A minimal, opinionated ORM table base
+ORMTableBase provides structural introspection utilities for SQLAlchemy-mapped tables, without imposing any domain semantics.
+It supports:
+* mapper access and inspection
+* primary key discovery
+* required (non-nullable) column detection
+* consistent primary key handling across models
+* simple ID allocation helpers for sequence-less databases
+```python
+from orm_loader.tables import ORMTableBase
+class MyTable(ORMTableBase, Base):
+    __tablename__ = "my_table"
+```
+This base is intended to be inherited by all ORM tables, either directly or via higher-level mixins.
+2. CSV-based ingestion mixins
+CSVLoadableTableInterface adds opt-in CSV loading support for ORM tables using pandas, with a focus on correctness and scalability.
+Features include:
+* chunked loading for large files
+* optional per-table normalisation logic
+* optional deduplication against existing database rows
+* safe bulk inserts using SQLAlchemy sessions
+```python
+class MyTable(CSVLoadableTableInterface, ORMTableBase, Base):
+    __tablename__ = "my_table"
+```
+Downstream models may override:
+* normalise_dataframe(...)
+* dedupe_dataframe(...)
+* csv_columns()
+to implement table-specific ingestion policies.
+3. Structured serialisation and hashing
+SerialisableTableInterface adds lightweight, explicit serialisation helpers for ORM rows.
+It supports:
+* conversion to dictionaries
+* JSON serialisation
+* stable row-level fingerprints
+* iterator-style access to field/value pairs
+```python
+row = session.get(MyTable, 1)
+row.to_dict()
+row.to_json()
+row.fingerprint()
+```
+This is useful for:
+* debugging
+* auditing
+* reproducibility checks
+* downstream APIs or exports
+4. Model registry and validation scaffolding
+The library includes model-agnostic validation infrastructure, designed to compare ORM models against external specifications.
+This includes:
+* a model registry
+* table and field descriptors
+* validator contracts
+* a validation runner
+* structured validation reports
+Specifications can be loaded from CSV today, with support for other formats (e.g. LinkML) planned.
+```python
+registry = ModelRegistry(model_version="1.0")
+registry.load_table_specs(table_csv, field_csv)
+registry.register_models([MyTable])
+runner = ValidationRunner(validators=always_on_validators())
+report = runner.run(registry)
+```
+Validation output is available as:
+* human-readable text
+* structured dictionaries
+* JSON (for CI/CD integration)
+* exit codes suitable for pipelines
+5. Database bootstrap helpers
+The library provides lightweight helpers for schema creation and bootstrapping, without imposing a migration strategy.
+```python
+from orm_loader.helpers.metadata import Base
+from orm_loader.helpers.bootstrap import bootstrap
+bootstrap(engine, create=True)
+```
+6. Safe bulk-loading utilities
+A reusable context manager simplifies trusted bulk ingestion workflows:
+* temporarily disables foreign key checks where supported
+* suppresses autoflush for performance
+* ensures reliable rollback on failure
+## Summary
+This library intentionally focuses on infrastructure, not semantics.
+It provides:
+* reusable ORM mixins
+* safe ingestion patterns
+* validation scaffolding
+* database-portable utilities
+while leaving domain rules, business logic, and schema semantics to downstream libraries.
+This makes it suitable as a shared foundation for:
+* clinical data models
+* research data marts
+* registry schemas
+* synthetic data pipelines

orm_loader-0.3.0/README.md ADDED Viewed

@@ -0,0 +1,149 @@
+## orm-loader
+A lightweight, reusable foundation for building and validating SQLAlchemy-based clinical (and non-clinical) data models.
+This library provides general-purpose ORM infrastructure that sits below any specific data model (OMOP, PCORnet, custom CDMs, etc.), focusing on:
+* declarative base configuration
+* bulk ingestion patterns
+* file-based validation & loading
+* table introspection
+* model-agnostic validation scaffolding
+* safe, database-portable operational helpers
+It intentionally contains no domain logic and no assumptions about a specific schema.
+### What this library provides:
+This library provides a small set of composable building blocks for defining, loading, inspecting, and validating SQLAlchemy-based data models.
+All components are model-agnostic and can be selectively combined in downstream libraries.
+1. A minimal, opinionated ORM table base
+ORMTableBase provides structural introspection utilities for SQLAlchemy-mapped tables, without imposing any domain semantics.
+It supports:
+* mapper access and inspection
+* primary key discovery
+* required (non-nullable) column detection
+* consistent primary key handling across models
+* simple ID allocation helpers for sequence-less databases
+```python
+from orm_loader.tables import ORMTableBase
+class MyTable(ORMTableBase, Base):
+    __tablename__ = "my_table"
+```
+This base is intended to be inherited by all ORM tables, either directly or via higher-level mixins.
+2. CSV-based ingestion mixins
+CSVLoadableTableInterface adds opt-in CSV loading support for ORM tables using pandas, with a focus on correctness and scalability.
+Features include:
+* chunked loading for large files
+* optional per-table normalisation logic
+* optional deduplication against existing database rows
+* safe bulk inserts using SQLAlchemy sessions
+```python
+class MyTable(CSVLoadableTableInterface, ORMTableBase, Base):
+    __tablename__ = "my_table"
+```
+Downstream models may override:
+* normalise_dataframe(...)
+* dedupe_dataframe(...)
+* csv_columns()
+to implement table-specific ingestion policies.
+3. Structured serialisation and hashing
+SerialisableTableInterface adds lightweight, explicit serialisation helpers for ORM rows.
+It supports:
+* conversion to dictionaries
+* JSON serialisation
+* stable row-level fingerprints
+* iterator-style access to field/value pairs
+```python
+row = session.get(MyTable, 1)
+row.to_dict()
+row.to_json()
+row.fingerprint()
+```
+This is useful for:
+* debugging
+* auditing
+* reproducibility checks
+* downstream APIs or exports
+4. Model registry and validation scaffolding
+The library includes model-agnostic validation infrastructure, designed to compare ORM models against external specifications.
+This includes:
+* a model registry
+* table and field descriptors
+* validator contracts
+* a validation runner
+* structured validation reports
+Specifications can be loaded from CSV today, with support for other formats (e.g. LinkML) planned.
+```python
+registry = ModelRegistry(model_version="1.0")
+registry.load_table_specs(table_csv, field_csv)
+registry.register_models([MyTable])
+runner = ValidationRunner(validators=always_on_validators())
+report = runner.run(registry)
+```
+Validation output is available as:
+* human-readable text
+* structured dictionaries
+* JSON (for CI/CD integration)
+* exit codes suitable for pipelines
+5. Database bootstrap helpers
+The library provides lightweight helpers for schema creation and bootstrapping, without imposing a migration strategy.
+```python
+from orm_loader.helpers.metadata import Base
+from orm_loader.helpers.bootstrap import bootstrap
+bootstrap(engine, create=True)
+```
+6. Safe bulk-loading utilities
+A reusable context manager simplifies trusted bulk ingestion workflows:
+* temporarily disables foreign key checks where supported
+* suppresses autoflush for performance
+* ensures reliable rollback on failure
+## Summary
+This library intentionally focuses on infrastructure, not semantics.
+It provides:
+* reusable ORM mixins
+* safe ingestion patterns
+* validation scaffolding
+* database-portable utilities
+while leaving domain rules, business logic, and schema semantics to downstream libraries.
+This makes it suitable as a shared foundation for:
+* clinical data models
+* research data marts
+* registry schemas
+* synthetic data pipelines

orm_loader-0.3.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,37 @@
+[project]
+name = "orm-loader"
+version = "0.3.0"
+description = "Generic base classes to handle ORM functionality for multiple downstream datamodels"
+readme = "README.md"
+authors = [
+    { name = "gkennos", email = "georgina.kennedy@unsw.edu.au" }
+]
+requires-python = ">=3.12"
+dependencies = [
+    "chardet>=5.2.0",
+    "pandas>=2.3.3",
+    "pyarrow>=23.0.0",
+    "sqlalchemy>=2.0.45",
+]
+[build-system]
+requires = ["uv_build>=0.9.2,<0.10.0"]
+build-backend = "uv_build"
+[dependency-groups]
+dev = [
+    "mypy>=1.19.1",
+    "pytest>=9.0.2",
+    "ruff>=0.14.11",
+]
+[tool.setuptools]
+packages = ["orm_loader"]
+[tool.ruff]
+line-length = 100
+# Matches `requires-python = ">=3.12"` in [project]; 3.11 is not a supported runtime.
+target-version = "py312"
+[tool.mypy]
+# Type-check against the oldest interpreter the package supports (>=3.12).
+python_version = "3.12"
+strict = true

orm_loader-0.3.0/src/orm_loader/__init__.py ADDED Viewed

File without changes

orm_loader-0.3.0/src/orm_loader/helpers/__init__.py ADDED Viewed

@@ -0,0 +1,23 @@
+from .errors import IngestError, ValidationError
+from .logging import get_logger, configure_logging
+from .bootstrap import bootstrap, create_db
+from .sqlite import enable_sqlite_foreign_keys, explain_sqlite_fk_error
+from .bulk import bulk_load_context, engine_with_replica_role
+from .metadata import Base
+from .discovery import get_model_by_tablename
+__all__ = [
+    "IngestError",
+    "ValidationError",
+    "get_logger",
+    "configure_logging",
+    "bootstrap",
+    "create_db",
+    "enable_sqlite_foreign_keys",
+    "explain_sqlite_fk_error",
+    "bulk_load_context",
+    "engine_with_replica_role",
+    "Base",
+    "get_model_by_tablename",
+]

orm_loader-0.3.0/src/orm_loader/helpers/bootstrap.py ADDED Viewed

@@ -0,0 +1,13 @@
+from .metadata import Base
+import logging
+logger = logging.getLogger(__name__)
+def create_db(engine):
+    """Create every table registered on ``Base.metadata`` using ``engine``."""
+    logger.debug("Creating database schema")
+    schema = Base.metadata
+    schema.create_all(engine)
+def bootstrap(engine, *, create: bool = True):
+    """
+    Log the bootstrap request and, unless ``create`` is false, build the schema.
+    Args:
+        engine: Passed through unchanged to ``create_db``.
+        create: When false, nothing is done beyond the log line.
+    """
+    logger.info("Bootstrapping schema (create=%s)", create)
+    if not create:
+        return
+    create_db(engine)

orm_loader-0.3.0/src/orm_loader/helpers/bulk.py ADDED Viewed

@@ -0,0 +1,90 @@
+from contextlib import contextmanager
+from sqlalchemy import text, Engine
+from sqlalchemy.orm import Session
+import sqlalchemy as sa
+from .logging import get_logger
+logger = get_logger(__name__)
+@contextmanager
+def bulk_load_context(
+    session: Session,
+    *,
+    disable_fk: bool = True,
+    no_autoflush: bool = True,
+):
+    """
+    Run a trusted bulk load on ``session`` with relaxed integrity checking.
+    Args:
+        session: Session the load runs on. The dialect name of its bind selects
+            which statement (if any) is used to switch foreign keys off.
+        disable_fk: When true, foreign key enforcement is switched off for the
+            duration of the block on PostgreSQL (``session_replication_role``)
+            and SQLite (``PRAGMA foreign_keys``). Other dialects are left
+            untouched and a warning is logged.
+        no_autoflush: When true, the block runs inside ``session.no_autoflush``.
+    Raises:
+        Exception: Anything raised inside the block is re-raised after
+            ``session.rollback()`` has been called.
+    """
+    # (disable, restore) statements per supported dialect name.
+    fk_statements = {
+        "postgresql": (
+            "SET session_replication_role = replica",
+            "SET session_replication_role = DEFAULT",
+        ),
+        "sqlite": (
+            "PRAGMA foreign_keys = OFF",
+            "PRAGMA foreign_keys = ON",
+        ),
+    }
+    engine = session.get_bind()
+    dialect = engine.dialect.name
+    restore_sql = None
+    try:
+        if disable_fk:
+            if dialect in fk_statements:
+                disable_sql, restore_candidate = fk_statements[dialect]
+                session.execute(text(disable_sql))
+                # Arm the restore step only once the disable statement succeeded.
+                restore_sql = restore_candidate
+                logger.info("Disabled foreign key checks for bulk load")
+            else:
+                # Nothing was switched off, so do not claim that it was.
+                logger.warning(
+                    "Foreign key checks not disabled: unsupported dialect %r",
+                    dialect,
+                )
+        if no_autoflush:
+            with session.no_autoflush:
+                yield
+        else:
+            yield
+    except Exception:
+        session.rollback()
+        raise
+    finally:
+        if restore_sql is not None:
+            session.execute(text(restore_sql))
+            logger.info("Re-enabled foreign key checks after bulk load")
+@contextmanager
+def engine_with_replica_role(engine: Engine):
+    """
+    Force ``session_replication_role = replica`` on connections opened by ``engine``.
+    Engine-scoped counterpart to ``bulk_load_context``, which is session
+    scoped. PostgreSQL only.
+    While the block is active, a ``connect`` listener issues the ``SET`` on
+    every newly opened DBAPI connection. On exit the listener is removed, the
+    pool is disposed so connections that still carry the replica role are not
+    handed out again, and the default role is verified on a fresh connection.
+    Yields:
+        The same ``engine`` that was passed in.
+    Raises:
+        RuntimeError: If ``SHOW session_replication_role`` does not report
+            ``origin`` after the restore.
+    """
+    def _set_replica_role(dbapi_conn, _):
+        cur = dbapi_conn.cursor()
+        cur.execute("SET session_replication_role = replica")
+        cur.close()
+    # Registered with listen() rather than the decorator so the exact same
+    # callable can be handed to remove() on exit.
+    sa.event.listen(engine, "connect", _set_replica_role)
+    try:
+        yield engine
+    finally:
+        # Without this the listener outlives the block and keeps switching
+        # every new connection to the replica role.
+        sa.event.remove(engine, "connect", _set_replica_role)
+        # Connections pooled while the listener was active still have the
+        # replica role set; discard them instead of reusing them.
+        engine.dispose()
+        # Explicitly restore and verify on a fresh connection
+        with engine.connect() as conn:
+            conn = conn.execution_options(isolation_level="AUTOCOMMIT")
+            conn.execute(text("SET session_replication_role = DEFAULT"))
+            role = conn.execute(
+                text("SHOW session_replication_role")
+            ).scalar()
+            if role != "origin":
+                raise RuntimeError(
+                    "Failed to restore session_replication_role"
+                )
+        logger.info("session_replication_role restored to DEFAULT")

orm_loader-0.3.0/src/orm_loader/helpers/discovery.py ADDED Viewed

@@ -0,0 +1,11 @@
+from typing import Type
+from .metadata import Base
+def get_model_by_tablename(tablename: str, base: Type[Base] | None = None) -> Type | None:
+    """
+    Find the class under ``base`` whose ``__tablename__`` equals ``tablename``.
+    The lookup name is lower-cased and stripped first; ``__tablename__`` values
+    are compared exactly as declared. The whole subclass tree is searched
+    breadth-first, so direct subclasses are still preferred and models declared
+    on an intermediate base class are found as well.
+    Args:
+        tablename: Table name to look for.
+        base: Root class of the search; defaults to ``Base``.
+    Returns:
+        The first matching class, or ``None`` when no class matches.
+    """
+    from collections import deque
+    tablename = tablename.lower().strip()
+    if base is None:
+        base = Base
+    pending = deque(base.__subclasses__())
+    visited = set()
+    while pending:
+        cls = pending.popleft()
+        # With multiple inheritance a class can be reachable by several paths.
+        if cls in visited:
+            continue
+        visited.add(cls)
+        if getattr(cls, "__tablename__", None) == tablename:
+            return cls
+        pending.extend(cls.__subclasses__())
+    return None

orm_loader-0.3.0/src/orm_loader/helpers/errors.py ADDED Viewed

@@ -0,0 +1,6 @@
+class IngestError(Exception):
+    """
+    Raised when ingestion fails for structural or runtime reasons.
+    Derives directly from ``Exception`` and adds no attributes of its own.
+    """
+class ValidationError(Exception):
+    """
+    Raised when schema or specification validation fails.
+    Derives directly from ``Exception`` and adds no attributes of its own.
+    """

orm_loader-0.3.0/src/orm_loader/helpers/logging.py ADDED Viewed

@@ -0,0 +1,90 @@
+from __future__ import annotations
+import logging
+from typing import Literal, Optional
+import re
+# Key names whose values should not appear in log output. RedactingFormatter
+# joins these into the alternation of its redaction regex.
+SENSITIVE_KEYS = {
+    "password",
+    "passwd",
+    "secret",
+    "token",
+    "key",
+    "dsn",
+    "uri",
+    "url",
+}
+# Root name of the logger hierarchy returned by get_logger().
+# NOTE(review): the distribution is named "orm_loader" but the namespace is
+# "sql_loader" — confirm this is intentional before relying on either name.
+LOGGING_NAMESPACE = "sql_loader"
+def _coerce_log_level(level: int | str) -> int:
+    """
+    Normalise a log level given as an int, a numeric string or a level name.
+    Args:
+        level: Integers are returned unchanged. Strings are stripped and
+            upper-cased, then read as a number if all digits, otherwise looked
+            up in ``logging.getLevelNamesMapping()``.
+    Returns:
+        The numeric logging level.
+    Raises:
+        ValueError: If a string is neither numeric nor a known level name.
+        TypeError: If ``level`` is neither ``int`` nor ``str``.
+    """
+    if isinstance(level, int):
+        return level
+    if not isinstance(level, str):
+        raise TypeError(f"Invalid log level type: {type(level)}")
+    token = level.strip().upper()
+    if token.isdigit():
+        return int(token)
+    resolved = logging.getLevelNamesMapping().get(token)
+    if resolved is None:
+        raise ValueError(f"Invalid log level: {level!r}")
+    return resolved
+def get_logger(name: Optional[str] = None) -> logging.Logger:
+    """
+    Return a logger inside the ``LOGGING_NAMESPACE`` hierarchy.
+    Args:
+        name: Suffix appended to the namespace with a dot; ``None`` selects
+            the namespace logger itself.
+    Examples:
+        get_logger() -> sql_loader
+        get_logger("loadable_table") -> sql_loader.loadable_table
+    """
+    if name is None:
+        return logging.getLogger(LOGGING_NAMESPACE)
+    return logging.getLogger(f"{LOGGING_NAMESPACE}.{name}")
+class RedactingFormatter(logging.Formatter):
+    """
+    Formatter that masks values attached to sensitive keys in the final log line.
+    After normal formatting, each ``key=value`` or ``key: value`` pair whose key
+    is one of ``SENSITIVE_KEYS`` (whole word, any letter case) is rewritten to
+    ``key=<REDACTED>``. The value is taken to run up to the next whitespace,
+    comma or semicolon.
+    """
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Raw strings need single backslashes: a doubled backslash reaches the
+        # regex engine as a literal backslash, and the pattern then never
+        # matches ordinary log text.
+        keys = "|".join(re.escape(k) for k in sorted(SENSITIVE_KEYS))
+        self._pattern = re.compile(
+            r"(?i)\b(" + keys + r")\b\s*[:=]\s*[^\s,;]+"
+        )
+    def format(self, record):
+        """Format ``record`` and return the text with sensitive values masked."""
+        msg = super().format(record)
+        # Group 1 re-emits the matched key with its original casing.
+        return self._pattern.sub(r"\1=<REDACTED>", msg)
+def configure_logging(
+    *,
+    level: int | str = logging.INFO,
+    handler: Optional[logging.Handler] = None,
+    format: Optional[str] = None,
+    propagate: bool = True,
+    redact: bool = True,
+) -> None:
+    """
+    Enable logging output for the ``LOGGING_NAMESPACE`` logger hierarchy.
+    Safe to call multiple times: the handler is attached only if the logger
+    does not already hold a handler of exactly the same class.
+    Args:
+        level: Level for the namespace logger; int, numeric string or level name.
+        handler: Handler to attach; a ``logging.StreamHandler`` when omitted.
+        format: Format string for the handler's formatter; a timestamped
+            default is used when omitted.
+        propagate: Value assigned to the namespace logger's ``propagate`` flag.
+        redact: Use ``RedactingFormatter`` when true, ``logging.Formatter`` otherwise.
+    Raises:
+        ValueError: If ``level`` is a string that is not a valid level.
+        TypeError: If ``level`` is neither ``int`` nor ``str``.
+    """
+    logger = get_logger()
+    logger.setLevel(_coerce_log_level(level))
+    if handler is None:
+        handler = logging.StreamHandler()
+    if format is None:
+        format = "%(asctime)s | %(levelname)-8s | %(name)s | %(message)s"
+    formatter_cls = RedactingFormatter if redact else logging.Formatter
+    handler.setFormatter(formatter_cls(format))
+    # Compare exact classes: with isinstance(), an already attached FileHandler
+    # (a StreamHandler subclass) would block adding a console StreamHandler.
+    if not any(type(h) is type(handler) for h in logger.handlers):
+        logger.addHandler(handler)
+    logger.propagate = propagate
+# Import-time side effect: give the namespace logger a do-nothing handler so
+# the library emits no output until configure_logging() (or the application)
+# attaches a real handler.
+logging.getLogger(LOGGING_NAMESPACE).addHandler(logging.NullHandler())

orm_loader-0.3.0/src/orm_loader/helpers/metadata.py ADDED Viewed

@@ -0,0 +1,15 @@
+from sqlalchemy import MetaData
+from sqlalchemy.orm import DeclarativeBase
+# Name templates for indexes (ix), unique (uq), check (ck), foreign key (fk)
+# and primary key (pk) constraints, passed to MetaData as its naming convention.
+NAMING_CONVENTIONS = {
+    "ix": "ix_%(column_0_label)s",
+    "uq": "uq_%(table_name)s_%(column_0_name)s",
+    "ck": "ck_%(table_name)s_%(constraint_name)s",
+    "fk": "fk_%(table_name)s_%(column_0_name)s_%(referred_table_name)s",
+    "pk": "pk_%(table_name)s",
+}
+# Single MetaData instance for the package; Base assigns it as its metadata.
+metadata = MetaData(naming_convention=NAMING_CONVENTIONS)
+class Base(DeclarativeBase):
+    """Declarative base bound to the module-level ``metadata`` object."""
+    # Reuse the module-level MetaData so the naming convention applies to
+    # every table declared on this base.
+    metadata = metadata

orm_loader-0.3.0/src/orm_loader/helpers/sqlite.py ADDED Viewed

@@ -0,0 +1,32 @@
+from sqlalchemy import event, text
+from sqlalchemy.engine import Engine
+from sqlalchemy.exc import IntegrityError
+import logging
+logger = logging.getLogger(__name__)
+@event.listens_for(Engine, "connect")
+def enable_sqlite_foreign_keys(dbapi_connection, connection_record):
+    """
+    Turn on foreign key enforcement for each new SQLite DBAPI connection.
+    Registered on the ``Engine`` class, so it runs for every engine's
+    connections; connections whose driver class is not defined in a
+    ``sqlite3*`` module are left alone.
+    Args:
+        dbapi_connection: The raw DBAPI connection that was just opened.
+        connection_record: Pool record for the connection (unused).
+    """
+    if not dbapi_connection.__class__.__module__.startswith("sqlite3"):
+        return
+    logger.debug("Enabling SQLite foreign key enforcement")
+    cursor = dbapi_connection.cursor()
+    try:
+        # SQLite ships with enforcement off and the setting is per connection;
+        # this is the statement that actually enables it.
+        cursor.execute("PRAGMA foreign_keys = ON;")
+        # Defers FK checks to COMMIT. SQLite switches this back off at each
+        # COMMIT/ROLLBACK, so it only covers the first transaction.
+        cursor.execute("PRAGMA defer_foreign_keys = ON;")
+    finally:
+        cursor.close()
+def explain_sqlite_fk_error(session, exc: IntegrityError, raise_error: bool = True):
+    """
+    Log the rows reported by SQLite's ``PRAGMA foreign_key_check``.
+    Args:
+        session: Session whose bind identifies the dialect and supplies the
+            connection used for the check.
+        exc: The integrity error being explained.
+        raise_error: Re-raise ``exc`` after logging when true. Ignored for
+            non-SQLite binds, where ``exc`` is always re-raised immediately.
+    Raises:
+        IntegrityError: ``exc`` itself, under the conditions above.
+    """
+    bind = session.get_bind()
+    if bind.dialect.name != "sqlite":
+        raise exc
+    with bind.connect() as conn:
+        violations = conn.execute(text("PRAGMA foreign_key_check")).fetchall()
+    # Iterating an empty result is a no-op, so no separate emptiness check.
+    for violation in violations:
+        logger.error(
+            "FK violation: table=%s rowid=%s references=%s fk_index=%s",
+            violation[0], violation[1], violation[2], violation[3]
+        )
+    if not raise_error:
+        return
+    raise exc

orm_loader-0.3.0/src/orm_loader/loaders/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+from .loader_interface import LoaderInterface, PandasLoader, ParquetLoader
+from .data_classes import LoaderContext, TableCastingStats
+from .loading_helpers import infer_delim, infer_encoding, quick_load_pg
+__all__ = [
+    "LoaderInterface",
+    "LoaderContext",
+    "PandasLoader",
+    "TableCastingStats",
+    "infer_delim",
+    "infer_encoding",
+    "quick_load_pg",
+    "ParquetLoader",
+]