data-transfer-lib 0.1.2__tar.gz → 2.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_transfer_lib-2.0.0/PKG-INFO +31 -0
- data_transfer_lib-2.0.0/data_transfer_lib/__init__.py +144 -0
- data_transfer_lib-2.0.0/data_transfer_lib/_bootstrap.py +63 -0
- data_transfer_lib-2.0.0/data_transfer_lib/api/__init__.py +7 -0
- data_transfer_lib-2.0.0/data_transfer_lib/api/engine_pipeline.py +57 -0
- data_transfer_lib-2.0.0/data_transfer_lib/api/reader.py +118 -0
- data_transfer_lib-2.0.0/data_transfer_lib/api/writer.py +132 -0
- data_transfer_lib-2.0.0/data_transfer_lib/connections/__init__.py +7 -0
- data_transfer_lib-2.0.0/data_transfer_lib/connections/clickhouse.py +78 -0
- data_transfer_lib-2.0.0/data_transfer_lib/connections/kafka.py +69 -0
- data_transfer_lib-2.0.0/data_transfer_lib/connections/postgres.py +78 -0
- data_transfer_lib-2.0.0/data_transfer_lib/core/__init__.py +1 -0
- data_transfer_lib-2.0.0/data_transfer_lib/core/connection.py +33 -0
- data_transfer_lib-2.0.0/data_transfer_lib/core/dq.py +66 -0
- data_transfer_lib-2.0.0/data_transfer_lib/core/exceptions.py +275 -0
- data_transfer_lib-2.0.0/data_transfer_lib/core/io.py +166 -0
- data_transfer_lib-2.0.0/data_transfer_lib/core/lifecycle.py +159 -0
- data_transfer_lib-2.0.0/data_transfer_lib/core/logging.py +25 -0
- data_transfer_lib-2.0.0/data_transfer_lib/core/metrics.py +39 -0
- data_transfer_lib-2.0.0/data_transfer_lib/extensions/__init__.py +5 -0
- data_transfer_lib-2.0.0/data_transfer_lib/extensions/logging.py +89 -0
- data_transfer_lib-2.0.0/data_transfer_lib/registry.py +263 -0
- data_transfer_lib-2.0.0/data_transfer_lib/schema/__init__.py +51 -0
- data_transfer_lib-2.0.0/data_transfer_lib/schema/canonical.py +160 -0
- data_transfer_lib-2.0.0/data_transfer_lib/schema/inspector.py +30 -0
- data_transfer_lib-2.0.0/data_transfer_lib/schema/inspectors/__init__.py +15 -0
- data_transfer_lib-2.0.0/data_transfer_lib/schema/inspectors/clickhouse_jdbc.py +89 -0
- data_transfer_lib-2.0.0/data_transfer_lib/schema/inspectors/postgres_jdbc.py +101 -0
- data_transfer_lib-2.0.0/data_transfer_lib/schema/mapper.py +62 -0
- data_transfer_lib-2.0.0/data_transfer_lib/schema/mappers/__init__.py +6 -0
- data_transfer_lib-2.0.0/data_transfer_lib/schema/mappers/clickhouse.py +205 -0
- data_transfer_lib-2.0.0/data_transfer_lib/schema/mappers/kafka.py +36 -0
- data_transfer_lib-2.0.0/data_transfer_lib/schema/mappers/postgres.py +227 -0
- data_transfer_lib-2.0.0/data_transfer_lib/schema/mappers/spark.py +246 -0
- data_transfer_lib-2.0.0/data_transfer_lib/schema/validator.py +339 -0
- data_transfer_lib-2.0.0/data_transfer_lib/strategies/__init__.py +16 -0
- data_transfer_lib-2.0.0/data_transfer_lib/strategies/batch/__init__.py +1 -0
- data_transfer_lib-2.0.0/data_transfer_lib/strategies/batch/clickhouse_jdbc.py +268 -0
- data_transfer_lib-2.0.0/data_transfer_lib/strategies/batch/postgres_jdbc.py +188 -0
- data_transfer_lib-2.0.0/data_transfer_lib/strategies/micro_batch/__init__.py +1 -0
- data_transfer_lib-2.0.0/data_transfer_lib/strategies/micro_batch/clickhouse_jdbc.py +160 -0
- data_transfer_lib-2.0.0/data_transfer_lib/strategies/micro_batch/kafka.py +146 -0
- data_transfer_lib-2.0.0/data_transfer_lib.egg-info/PKG-INFO +31 -0
- data_transfer_lib-2.0.0/data_transfer_lib.egg-info/SOURCES.txt +48 -0
- data_transfer_lib-2.0.0/data_transfer_lib.egg-info/requires.txt +16 -0
- data_transfer_lib-2.0.0/pyproject.toml +44 -0
- data_transfer_lib-0.1.2/PKG-INFO +0 -30
- data_transfer_lib-0.1.2/data_transfer_lib/__init__.py +0 -11
- data_transfer_lib-0.1.2/data_transfer_lib/connections/__init__.py +0 -4
- data_transfer_lib-0.1.2/data_transfer_lib/connections/base.py +0 -39
- data_transfer_lib-0.1.2/data_transfer_lib/connections/clickhouse.py +0 -68
- data_transfer_lib-0.1.2/data_transfer_lib/connections/postgres.py +0 -82
- data_transfer_lib-0.1.2/data_transfer_lib/reader/__init__.py +0 -3
- data_transfer_lib-0.1.2/data_transfer_lib/reader/base.py +0 -25
- data_transfer_lib-0.1.2/data_transfer_lib/reader/reader.py +0 -45
- data_transfer_lib-0.1.2/data_transfer_lib/schema/__init__.py +0 -4
- data_transfer_lib-0.1.2/data_transfer_lib/schema/mapper.py +0 -241
- data_transfer_lib-0.1.2/data_transfer_lib/schema/validator.py +0 -107
- data_transfer_lib-0.1.2/data_transfer_lib/utils/__init__.py +0 -13
- data_transfer_lib-0.1.2/data_transfer_lib/utils/exceptions.py +0 -14
- data_transfer_lib-0.1.2/data_transfer_lib/writer/__init__.py +0 -3
- data_transfer_lib-0.1.2/data_transfer_lib/writer/base.py +0 -27
- data_transfer_lib-0.1.2/data_transfer_lib/writer/writer.py +0 -72
- data_transfer_lib-0.1.2/data_transfer_lib.egg-info/PKG-INFO +0 -30
- data_transfer_lib-0.1.2/data_transfer_lib.egg-info/SOURCES.txt +0 -23
- data_transfer_lib-0.1.2/data_transfer_lib.egg-info/requires.txt +0 -1
- data_transfer_lib-0.1.2/setup.py +0 -29
- {data_transfer_lib-0.1.2 → data_transfer_lib-2.0.0}/README.md +0 -0
- {data_transfer_lib-0.1.2 → data_transfer_lib-2.0.0}/data_transfer_lib.egg-info/dependency_links.txt +0 -0
- {data_transfer_lib-0.1.2 → data_transfer_lib-2.0.0}/data_transfer_lib.egg-info/top_level.txt +0 -0
- {data_transfer_lib-0.1.2 → data_transfer_lib-2.0.0}/setup.cfg +0 -0
--- /dev/null
+++ data_transfer_lib-2.0.0/PKG-INFO
@@ -0,0 +1,31 @@
+Metadata-Version: 2.4
+Name: data-transfer-lib
+Version: 2.0.0
+Summary: Library for data transfer between databases using PySpark
+Author-email: llirikh <zhukov.kg@phystech.edu>
+License: MIT
+Project-URL: Homepage, https://github.com/llirikh/data_transfer_lib
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Topic :: Database
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+Requires-Dist: pyspark>=3.3.0
+Provides-Extra: prometheus
+Requires-Dist: prometheus_client>=0.20; extra == "prometheus"
+Provides-Extra: gx
+Requires-Dist: great_expectations>=0.18; extra == "gx"
+Provides-Extra: yaml
+Requires-Dist: pyyaml>=6.0; extra == "yaml"
+Provides-Extra: pydantic
+Requires-Dist: pydantic>=2.0; extra == "pydantic"
+Provides-Extra: dev
+Requires-Dist: pytest>=8; extra == "dev"
+
+# Project description
+
+Project description will be here
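The metadata above keeps the core dependency set minimal (PySpark only) and gates every integration behind an extra. A small sketch of how a consumer might check the optional dependencies at runtime; the extras names come from the PKG-INFO above, the gate pattern itself is illustrative:

```python
# Install side (standard pip extras syntax, per the Provides-Extra fields):
#   pip install "data-transfer-lib[prometheus,gx]"
import importlib.util

# Runtime gates for the optional integrations declared in PKG-INFO.
HAS_PROMETHEUS = importlib.util.find_spec("prometheus_client") is not None
HAS_GX = importlib.util.find_spec("great_expectations") is not None
```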
--- /dev/null
+++ data_transfer_lib-2.0.0/data_transfer_lib/__init__.py
@@ -0,0 +1,144 @@
+"""data_transfer_lib v2 — public façade.
+
+The new architecture (see ``architecture/`` at the repo root) is being landed
+in phases. Phase 1 ships the skeleton: contracts, registries, lifecycle bus.
+Concrete strategies (Postgres, ClickHouse, Kafka, …) ship in Phase 3+.
+"""
+
+from data_transfer_lib._bootstrap import _load_builtins, _load_entry_points
+
+_load_builtins()
+_load_entry_points()
+
+# Public façade -------------------------------------------------------------
+
+from data_transfer_lib.api.reader import Reader
+from data_transfer_lib.api.writer import Writer
+from data_transfer_lib.api.engine_pipeline import EnginePipeline
+from data_transfer_lib.connections import ClickHouse, Kafka, Postgres
+from data_transfer_lib.core.connection import Connection
+from data_transfer_lib.extensions.logging import LoggingExtension
+from data_transfer_lib.core.io import (
+    Mode,
+    ReadOptions,
+    Source,
+    Target,
+    WriteOptions,
+)
+from data_transfer_lib.core.lifecycle import Extension, hook
+from data_transfer_lib.core.exceptions import (
+    AuthenticationError,
+    BatchReadError,
+    BatchWriteError,
+    ConfigurationError,
+    ConnectionError,
+    ConnectionRefusedError,
+    DataTransferError,
+    DDLExecutionError,
+    DriverError,
+    ExtensionError,
+    ExtraColumnError,
+    HookError,
+    IncompatibleTypeError,
+    InvalidOptionError,
+    MissingColumnError,
+    NativePipelineError,
+    NullabilityError,
+    PipelineDeployError,
+    PipelineTeardownError,
+    PluginRegistrationError,
+    ReadError,
+    SchemaError,
+    SchemaIntrospectionError,
+    SchemaValidationError,
+    StreamingReadError,
+    StreamingWriteError,
+    TimeoutError,
+    TypeMappingError,
+    UnsupportedModeError,
+    WriteError,
+    WriteModeConflictError,
+    ErrorContext,
+    # Deprecation aliases (removed in 2.2)
+    ConnectionException,
+    DataTransferException,
+    SchemaValidationException,
+    TypeMappingException,
+)
+from data_transfer_lib.registry import (
+    register_engine_pipeline,
+    register_extension,
+    register_source,
+    register_target,
+    register_type_mapper,
+    unregister_extension,
+)
+
+__version__ = "2.0.0"
+
+__all__ = [
+    "__version__",
+    # API
+    "Reader",
+    "Writer",
+    "EnginePipeline",
+    "Mode",
+    "ReadOptions",
+    "WriteOptions",
+    "Source",
+    "Target",
+    "Connection",
+    # Connections
+    "Postgres",
+    "ClickHouse",
+    "Kafka",
+    # Lifecycle
+    "Extension",
+    "hook",
+    "LoggingExtension",
+    # Registration
+    "register_source",
+    "register_target",
+    "register_engine_pipeline",
+    "register_type_mapper",
+    "register_extension",
+    "unregister_extension",
+    # Exceptions
+    "DataTransferError",
+    "ErrorContext",
+    "ConfigurationError",
+    "UnsupportedModeError",
+    "InvalidOptionError",
+    "PluginRegistrationError",
+    "ConnectionError",
+    "ConnectionRefusedError",
+    "AuthenticationError",
+    "DriverError",
+    "TimeoutError",
+    "SchemaError",
+    "SchemaIntrospectionError",
+    "SchemaValidationError",
+    "MissingColumnError",
+    "ExtraColumnError",
+    "IncompatibleTypeError",
+    "NullabilityError",
+    "TypeMappingError",
+    "ReadError",
+    "BatchReadError",
+    "StreamingReadError",
+    "WriteError",
+    "BatchWriteError",
+    "StreamingWriteError",
+    "WriteModeConflictError",
+    "NativePipelineError",
+    "DDLExecutionError",
+    "PipelineDeployError",
+    "PipelineTeardownError",
+    "ExtensionError",
+    "HookError",
+    # Deprecated
+    "DataTransferException",
+    "ConnectionException",
+    "SchemaValidationException",
+    "TypeMappingException",
+]
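The façade above is the whole 2.0 entry point: importing the package triggers bootstrap, and everything else dispatches through registries. A minimal transfer sketch against it; the Reader/Writer signatures come from api/reader.py and api/writer.py later in this diff, while the Postgres constructor arguments are assumed (connections/postgres.py is listed but not shown in this excerpt):

```python
# Minimal batch-transfer sketch against the v2 façade (illustrative values).
from pyspark.sql import SparkSession

from data_transfer_lib import ClickHouse, Postgres, Reader, Writer

spark = SparkSession.builder.getOrCreate()

# Postgres kwargs are an assumption; ClickHouse kwargs match the class below.
src = Postgres(host="pg", user="app", password="s3cr3t", database="shop", spark=spark)
dst = ClickHouse(host="ch", user="app", password="s3cr3t", database="shop", spark=spark)

df = Reader(src, "public.orders").read()              # mode defaults to Mode.BATCH
Writer(dst, "orders", write_mode="append").write(df)  # strategy resolved from registry
```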
--- /dev/null
+++ data_transfer_lib-2.0.0/data_transfer_lib/_bootstrap.py
@@ -0,0 +1,63 @@
+"""One-shot bootstrap: imports in-tree strategy modules (registering them
+as a side effect), wires the default ``LoggingExtension``, and walks the
+``data_transfer_lib.connectors`` entry-points group for third-party
+connectors.
+
+Phase 3+: ``strategies/__init__.py`` imports each batch/streaming/engine
+module so their bottom-of-file ``register_*`` calls fire on first import
+of the package.
+"""
+
+from __future__ import annotations
+
+from data_transfer_lib.core.exceptions import PluginRegistrationError
+from data_transfer_lib.core.logging import get_logger
+
+_log = get_logger(__name__)
+
+
+def _load_builtins() -> None:
+    """Import in-tree strategy modules to trigger their registration, then
+    install the default ``LoggingExtension`` if it isn't already on the bus."""
+    try:
+        import data_transfer_lib.strategies  # noqa: F401
+    except ImportError as exc:  # pragma: no cover - dev-only
+        _log.debug("strategies package import failed: %s", exc)
+
+    from data_transfer_lib.extensions.logging import LoggingExtension
+    from data_transfer_lib.registry import HOOK_BUS, register_extension
+
+    default = LoggingExtension.default()
+    if default not in HOOK_BUS.extensions:
+        register_extension(default)
+
+
+def _load_entry_points() -> None:
+    """Walk the ``data_transfer_lib.connectors`` entry-points group."""
+    try:
+        from importlib.metadata import entry_points
+    except ImportError:  # pragma: no cover - py < 3.10 unsupported
+        return
+
+    try:
+        eps = entry_points(group="data_transfer_lib.connectors")
+    except TypeError:  # pragma: no cover - very old Python
+        eps = entry_points().get("data_transfer_lib.connectors", [])  # type: ignore[union-attr]
+
+    for ep in eps:
+        try:
+            register = ep.load()
+        except Exception as exc:  # noqa: BLE001
+            _log.exception("failed to load entry point %s", ep)
+            raise PluginRegistrationError(
+                f"failed to load entry point {ep.name!r}"
+            ) from exc
+        try:
+            register()
+        except Exception as exc:  # noqa: BLE001
+            raise PluginRegistrationError(
+                f"entry point {ep.name!r} raised during register()"
+            ) from exc
+
+
+__all__ = ["_load_builtins", "_load_entry_points"]
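Per `_load_entry_points()`, a third-party connector publishes a zero-argument callable under the `data_transfer_lib.connectors` group; the bootstrap loads it and calls it, wrapping any failure in `PluginRegistrationError`. A hedged sketch of such a plugin follows; every `dtl_mysql` name is invented, and the exact `register_source`/`register_target` call shape is an assumption (registry.py is not shown in this excerpt):

```python
# Hypothetical plugin package "dtl_mysql". The plugin's pyproject.toml would
# declare:
#   [project.entry-points."data_transfer_lib.connectors"]
#   mysql = "dtl_mysql:register"
from data_transfer_lib import register_source, register_target


def register() -> None:
    # Imports deferred so the entry point stays cheap to load.
    from dtl_mysql.connection import MySQL               # hypothetical
    from dtl_mysql.strategies import (                   # hypothetical
        MySQLBatchReader,
        MySQLBatchWriter,
    )

    # Assumed (connection class, mode, strategy) registration shape.
    register_source(MySQL, "batch", MySQLBatchReader)
    register_target(MySQL, "batch", MySQLBatchWriter)
```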
--- /dev/null
+++ data_transfer_lib-2.0.0/data_transfer_lib/api/__init__.py
@@ -0,0 +1,7 @@
+"""Public façade (L6): Reader, Writer, EnginePipeline."""
+
+from data_transfer_lib.api.reader import Reader
+from data_transfer_lib.api.writer import Writer
+from data_transfer_lib.api.engine_pipeline import EnginePipeline
+
+__all__ = ["Reader", "Writer", "EnginePipeline"]
--- /dev/null
+++ data_transfer_lib-2.0.0/data_transfer_lib/api/engine_pipeline.py
@@ -0,0 +1,57 @@
+"""User-facing wrapper around concrete ``EnginePipeline`` strategies.
+
+The base class enumerates **no** specific kinds. Adding a new pipeline is a
+new file under ``strategies/engine/`` that registers itself via
+``register_engine_pipeline(...)``.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Optional
+
+from data_transfer_lib.core.connection import Connection
+from data_transfer_lib.core.io import EnginePipeline as _EnginePipelineBase
+from data_transfer_lib.core.lifecycle import (
+    ON_ENGINE_DEPLOY,
+    ON_ENGINE_TEARDOWN,
+)
+from data_transfer_lib.registry import ENGINE_PIPELINE_REGISTRY, HOOK_BUS
+
+
+class EnginePipeline(_EnginePipelineBase):
+    """User-facing engine pipeline base. Use ``EnginePipeline.create(...)``
+    to construct a concrete pipeline through the registry."""
+
+    @classmethod
+    def create(
+        cls,
+        *,
+        source: Connection,
+        target: Connection,
+        kind: Optional[str] = None,
+        deploy_mode: str = "create_if_missing",
+        **options: Any,
+    ) -> "EnginePipeline":
+        impl_cls = ENGINE_PIPELINE_REGISTRY.get(
+            source_cls=type(source), target_cls=type(target), kind=kind
+        )
+        return impl_cls(
+            source=source, target=target, deploy_mode=deploy_mode, **options
+        )
+
+    def deploy(self) -> None:  # pragma: no cover - default for subclasses
+        raise NotImplementedError
+
+    def teardown(self) -> None:  # pragma: no cover
+        raise NotImplementedError
+
+    # Convenience wrappers that fire lifecycle hooks ------------------------
+
+    def _fire_deploy(self) -> None:
+        HOOK_BUS.fire(ON_ENGINE_DEPLOY, self)
+
+    def _fire_teardown(self) -> None:
+        HOOK_BUS.fire(ON_ENGINE_TEARDOWN, self)
+
+
+__all__ = ["EnginePipeline"]
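Pipeline lookup is keyed on `(source class, target class, kind)`, so callers never name a concrete class. A usage sketch under the assumption that a matching implementation was registered first (for example by a Phase 3+ strategy module); the `"pg_to_ch"` kind string is invented for illustration:

```python
# src and dst are Connection instances (e.g. Postgres, ClickHouse).
pipe = EnginePipeline.create(
    source=src,
    target=dst,
    kind="pg_to_ch",                  # optional; registry may also match on types
    deploy_mode="create_if_missing",
)
pipe.deploy()      # concrete subclasses implement this; base raises NotImplementedError
# ... transfer runs inside the engines while the pipeline is live ...
pipe.teardown()    # subclasses can fire ON_ENGINE_TEARDOWN via _fire_teardown()
```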
--- /dev/null
+++ data_transfer_lib-2.0.0/data_transfer_lib/api/reader.py
@@ -0,0 +1,118 @@
+"""Thin Reader façade — dispatches to a strategy registered in
+``SourceRegistry``."""
+
+from __future__ import annotations
+
+from typing import Any, Mapping, Optional, Sequence, Union
+
+from data_transfer_lib.core.connection import Connection
+from data_transfer_lib.core.exceptions import ErrorContext, HookError
+from data_transfer_lib.core.io import Mode, ReadOptions
+from data_transfer_lib.core.lifecycle import (
+    AFTER_READ,
+    BEFORE_READ,
+    ON_ERROR,
+    Extension,
+    LifecycleHookBus,
+)
+from data_transfer_lib.registry import HOOK_BUS, SOURCE_REGISTRY
+
+
+class Reader:
+    """Read a source via the strategy registered for ``(connection, mode)``.
+
+    Construction validates the (connection, mode) combination via the
+    registry. Schema introspection and the actual read are deferred to
+    ``read()`` so constructors stay cheap.
+    """
+
+    def __init__(
+        self,
+        connection: Connection,
+        source: str,
+        *,
+        mode: Union[Mode, str] = Mode.BATCH,
+        options: Union[ReadOptions, Mapping[str, Any], None] = None,
+        validate_schema: bool = True,
+        extensions: Optional[Sequence[Extension]] = None,
+    ) -> None:
+        self.connection = connection
+        self.source = source
+        self.mode = Mode.coerce(mode)
+        self.options = ReadOptions.coerce(options)
+        self.validate_schema = validate_schema
+        self._strategy_cls = SOURCE_REGISTRY.get(connection, self.mode)
+        self._local_bus = LifecycleHookBus()
+        for ext in extensions or ():
+            self._local_bus.register(ext)
+        self._strategy: Any = None
+        self._source_schema: Any = None
+
+    def introspect(self) -> Any:
+        """Return the canonical source schema. Cached after first call."""
+        if self._source_schema is None:
+            strategy = self._ensure_strategy()
+            self._source_schema = strategy.introspect()
+            if self.options.type_overrides:
+                self._source_schema = _apply_type_overrides(
+                    self._source_schema, self.options.type_overrides
+                )
+        return self._source_schema
+
+    def read(self) -> Any:
+        strategy = self._ensure_strategy()
+        if self.validate_schema:
+            # Surfaces SchemaIntrospectionError eagerly per architecture/04 Ex.8.
+            self.introspect()
+        try:
+            HOOK_BUS.fire(BEFORE_READ, self)
+            self._local_bus.fire(BEFORE_READ, self)
+            df = strategy.read()
+        except HookError:
+            raise
+        except Exception as exc:
+            self._fire_on_error(exc)
+            raise
+        HOOK_BUS.fire(AFTER_READ, self, df)
+        self._local_bus.fire(AFTER_READ, self, df)
+        return df
+
+    def _ensure_strategy(self) -> Any:
+        if self._strategy is None:
+            self._strategy = self._strategy_cls(
+                connection=self.connection,
+                source=self.source,
+                options=self.options,
+            )
+        return self._strategy
+
+    def _fire_on_error(self, exc: BaseException) -> None:
+        ctx = ErrorContext(
+            connection=type(self.connection).__name__,
+            source=self.source,
+            mode=self.mode.value,
+        )
+        try:
+            HOOK_BUS.fire(ON_ERROR, self, exc, ctx)
+            self._local_bus.fire(ON_ERROR, self, exc, ctx)
+        except Exception:  # noqa: BLE001 - on_error swallows itself
+            pass
+
+
+def _apply_type_overrides(schema: Any, overrides: Mapping[str, Any]) -> Any:
+    from data_transfer_lib.schema.canonical import CanonicalField, CanonicalSchema
+
+    return CanonicalSchema(
+        fields=tuple(
+            CanonicalField(
+                name=f.name,
+                type=overrides.get(f.name, f.type),
+                nullable=f.nullable,
+                description=f.description,
+            )
+            for f in schema.fields
+        )
+    )
+
+
+__all__ = ["Reader"]
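Note how the façade layers its option handling: `Mode.coerce` and `ReadOptions.coerce` accept plain strings and mappings, and `type_overrides` rewrites the introspected schema before validation. A sketch of that path; `type_overrides` is the one `ReadOptions` field this excerpt shows, and the canonical type name `"int64"` is assumed:

```python
reader = Reader(
    src,                                             # a Connection instance
    "public.orders",
    mode="batch",                                    # coerced via Mode.coerce
    options={"type_overrides": {"order_id": "int64"}},  # coerced via ReadOptions.coerce
)
schema = reader.introspect()   # canonical schema with overrides applied; cached
df = reader.read()             # fires BEFORE_READ/AFTER_READ on global and local buses
```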
--- /dev/null
+++ data_transfer_lib-2.0.0/data_transfer_lib/api/writer.py
@@ -0,0 +1,132 @@
+"""Thin Writer façade — dispatches to a strategy registered in
+``TargetRegistry``."""
+
+from __future__ import annotations
+
+from typing import Any, Mapping, Optional, Sequence, Union
+
+from data_transfer_lib.core.connection import Connection
+from data_transfer_lib.core.exceptions import (
+    ErrorContext,
+    HookError,
+    SchemaIntrospectionError,
+)
+from data_transfer_lib.core.io import Mode, WriteOptions
+from data_transfer_lib.core.lifecycle import (
+    AFTER_WRITE,
+    BEFORE_WRITE,
+    ON_ERROR,
+    Extension,
+    LifecycleHookBus,
+)
+from data_transfer_lib.registry import HOOK_BUS, TARGET_REGISTRY
+
+WriteMode = str  # one of: append, overwrite, create_if_not_exists, upsert
+
+
+class Writer:
+    """Write to a target via the strategy registered for ``(connection, mode)``."""
+
+    def __init__(
+        self,
+        connection: Connection,
+        target: str,
+        *,
+        mode: Union[Mode, str] = Mode.BATCH,
+        write_mode: WriteMode = "append",
+        options: Union[WriteOptions, Mapping[str, Any], None] = None,
+        validate_schema: bool = True,
+        extensions: Optional[Sequence[Extension]] = None,
+    ) -> None:
+        self.connection = connection
+        self.target = target
+        self.mode = Mode.coerce(mode)
+        self.write_mode = write_mode
+        self.options = WriteOptions.coerce(options)
+        self.validate_schema = validate_schema
+        self._strategy_cls = TARGET_REGISTRY.get(connection, self.mode)
+        self._local_bus = LifecycleHookBus()
+        for ext in extensions or ():
+            self._local_bus.register(ext)
+        self._strategy: Any = None
+
+    def write(self, df: Any) -> Any:
+        strategy = self._strategy_cls(
+            connection=self.connection,
+            target=self.target,
+            write_mode=self.write_mode,
+            options=self.options,
+        )
+        self._strategy = strategy
+        if self.validate_schema:
+            self._validate(df, strategy)
+        try:
+            HOOK_BUS.fire(BEFORE_WRITE, self, df)
+            self._local_bus.fire(BEFORE_WRITE, self, df)
+            result = strategy.write(df)
+        except HookError:
+            raise
+        except Exception as exc:
+            self._fire_on_error(exc)
+            raise
+        HOOK_BUS.fire(AFTER_WRITE, self, result)
+        self._local_bus.fire(AFTER_WRITE, self, result)
+        return result
+
+    def _validate(self, df: Any, strategy: Any) -> None:
+        from data_transfer_lib.schema.mappers.spark import SparkTypeMapper
+        from data_transfer_lib.schema.validator import SchemaValidator
+
+        df_schema = SparkTypeMapper().schema_to_canonical(df.schema)
+        if self.options.type_overrides:
+            df_schema = _apply_type_overrides(df_schema, self.options.type_overrides)
+
+        # ``create_if_not_exists`` skips target validation when the table
+        # doesn't exist yet — the strategy creates it from df_schema.
+        # ``overwrite`` also skips validation on a missing table and lets
+        # the strategy raise WriteModeConflictError.
+        try:
+            target_schema = strategy.introspect()
+        except SchemaIntrospectionError:
+            if self.write_mode in ("create_if_not_exists", "overwrite"):
+                return
+            raise
+
+        report = SchemaValidator.validate(df_schema, target_schema, mode="loose")
+        # For append/overwrite the DB engine enforces nullability at write time.
+        # Spark JDBC reports all columns as nullable regardless of source
+        # constraints, so suppressing these errors avoids false positives.
+        if self.write_mode in ("append", "overwrite"):
+            report.errors = [e for e in report.errors if e.code != "nullable"]
+        report.raise_if_errors()
+
+    def _fire_on_error(self, exc: BaseException) -> None:
+        ctx = ErrorContext(
+            connection=type(self.connection).__name__,
+            source=self.target,
+            mode=self.mode.value,
+        )
+        try:
+            HOOK_BUS.fire(ON_ERROR, self, exc, ctx)
+            self._local_bus.fire(ON_ERROR, self, exc, ctx)
+        except Exception:  # noqa: BLE001
+            pass
+
+
+def _apply_type_overrides(schema: Any, overrides: Mapping[str, Any]) -> Any:
+    from data_transfer_lib.schema.canonical import CanonicalField, CanonicalSchema
+
+    return CanonicalSchema(
+        fields=tuple(
+            CanonicalField(
+                name=f.name,
+                type=overrides.get(f.name, f.type),
+                nullable=f.nullable,
+                description=f.description,
+            )
+            for f in schema.fields
+        )
+    )
+
+
+__all__ = ["Writer"]
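The interesting behavior in `_validate()` is how `write_mode` changes what counts as an error. A sketch of the two paths it describes; the connection and table names are illustrative:

```python
# Against a table that does not exist yet, "create_if_not_exists" tolerates
# the failed introspection and lets the strategy create the table from the
# DataFrame schema.
Writer(dst, "orders_new", write_mode="create_if_not_exists").write(df)

# "append" has no such escape hatch: the SchemaIntrospectionError from
# strategy.introspect() is re-raised before any write is attempted.
Writer(dst, "orders_missing", write_mode="append").write(df)  # raises
```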
--- /dev/null
+++ data_transfer_lib-2.0.0/data_transfer_lib/connections/__init__.py
@@ -0,0 +1,7 @@
+"""Per-system Connection implementations."""
+
+from data_transfer_lib.connections.clickhouse import ClickHouse
+from data_transfer_lib.connections.kafka import Kafka
+from data_transfer_lib.connections.postgres import Postgres
+
+__all__ = ["Postgres", "ClickHouse", "Kafka"]
--- /dev/null
+++ data_transfer_lib-2.0.0/data_transfer_lib/connections/clickhouse.py
@@ -0,0 +1,78 @@
+"""Slim ClickHouse connection (v2).
+
+Holds the credentials, JDBC URL builder, and an optional Spark session.
+Schema introspection lives in ``schema/inspectors/clickhouse_jdbc.py``;
+write logic lives in ``strategies/batch/clickhouse_jdbc.py``.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, Mapping, Optional
+
+from data_transfer_lib.core.connection import Connection
+from data_transfer_lib.core.exceptions import ConnectionError, ErrorContext
+
+
+JDBC_DRIVER = "com.clickhouse.jdbc.ClickHouseDriver"
+
+
+class ClickHouse(Connection):
+    def __init__(
+        self,
+        host: str,
+        user: str,
+        password: str,
+        database: str = "default",
+        port: int = 8123,
+        spark: Optional[Any] = None,
+        extra: Optional[Mapping[str, str]] = None,
+    ) -> None:
+        self.host = host
+        self.port = port
+        self.user = user
+        self.password = password
+        self.database = database
+        self.spark = spark
+        self.extra: Dict[str, str] = dict(extra or {})
+
+    def jdbc_url(self) -> str:
+        return f"jdbc:clickhouse://{self.host}:{self.port}/{self.database}"
+
+    def jdbc_properties(self) -> Dict[str, str]:
+        props: Dict[str, str] = {
+            "user": self.user,
+            "password": self.password,
+            "driver": JDBC_DRIVER,
+        }
+        props.update(self.extra)
+        return props
+
+    def test_connection(self) -> bool:
+        if self.spark is None:
+            raise ConnectionError(
+                "ClickHouse.test_connection requires a SparkSession",
+                context=ErrorContext(connection="ClickHouse"),
+            )
+        try:
+            (
+                self.spark.read.format("jdbc")
+                .option("url", self.jdbc_url())
+                .option("query", "SELECT 1")
+                .option("user", self.user)
+                .option("password", self.password)
+                .option("driver", JDBC_DRIVER)
+                .load()
+                .collect()
+            )
+        except Exception as exc:  # noqa: BLE001
+            raise ConnectionError(
+                f"ClickHouse connection check failed: {exc}",
+                context=ErrorContext(connection="ClickHouse"),
+            ) from exc
+        return True
+
+    def __repr__(self) -> str:  # pragma: no cover - cosmetic
+        return f"ClickHouse(host={self.host!r}, database={self.database!r})"
+
+
+__all__ = ["ClickHouse", "JDBC_DRIVER"]
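A usage sketch for the slim connection, using only the API shown above: `test_connection()` issues `SELECT 1` over Spark JDBC and wraps any failure in the library's `ConnectionError`, and `extra` is merged into the JDBC properties. Host names and the extra JDBC option are illustrative:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

ch = ClickHouse(
    host="ch.internal",
    user="app",
    password="s3cr3t",
    database="analytics",
    port=8123,                          # ClickHouse HTTP interface default
    spark=spark,
    extra={"socket_timeout": "30000"},  # merged into jdbc_properties()
)

assert ch.jdbc_url() == "jdbc:clickhouse://ch.internal:8123/analytics"
ch.test_connection()                    # raises ConnectionError on failure
```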