data-transfer-lib 0.1.2__tar.gz → 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. data_transfer_lib-2.0.0/PKG-INFO +31 -0
  2. data_transfer_lib-2.0.0/data_transfer_lib/__init__.py +144 -0
  3. data_transfer_lib-2.0.0/data_transfer_lib/_bootstrap.py +63 -0
  4. data_transfer_lib-2.0.0/data_transfer_lib/api/__init__.py +7 -0
  5. data_transfer_lib-2.0.0/data_transfer_lib/api/engine_pipeline.py +57 -0
  6. data_transfer_lib-2.0.0/data_transfer_lib/api/reader.py +118 -0
  7. data_transfer_lib-2.0.0/data_transfer_lib/api/writer.py +132 -0
  8. data_transfer_lib-2.0.0/data_transfer_lib/connections/__init__.py +7 -0
  9. data_transfer_lib-2.0.0/data_transfer_lib/connections/clickhouse.py +78 -0
  10. data_transfer_lib-2.0.0/data_transfer_lib/connections/kafka.py +69 -0
  11. data_transfer_lib-2.0.0/data_transfer_lib/connections/postgres.py +78 -0
  12. data_transfer_lib-2.0.0/data_transfer_lib/core/__init__.py +1 -0
  13. data_transfer_lib-2.0.0/data_transfer_lib/core/connection.py +33 -0
  14. data_transfer_lib-2.0.0/data_transfer_lib/core/dq.py +66 -0
  15. data_transfer_lib-2.0.0/data_transfer_lib/core/exceptions.py +275 -0
  16. data_transfer_lib-2.0.0/data_transfer_lib/core/io.py +166 -0
  17. data_transfer_lib-2.0.0/data_transfer_lib/core/lifecycle.py +159 -0
  18. data_transfer_lib-2.0.0/data_transfer_lib/core/logging.py +25 -0
  19. data_transfer_lib-2.0.0/data_transfer_lib/core/metrics.py +39 -0
  20. data_transfer_lib-2.0.0/data_transfer_lib/extensions/__init__.py +5 -0
  21. data_transfer_lib-2.0.0/data_transfer_lib/extensions/logging.py +89 -0
  22. data_transfer_lib-2.0.0/data_transfer_lib/registry.py +263 -0
  23. data_transfer_lib-2.0.0/data_transfer_lib/schema/__init__.py +51 -0
  24. data_transfer_lib-2.0.0/data_transfer_lib/schema/canonical.py +160 -0
  25. data_transfer_lib-2.0.0/data_transfer_lib/schema/inspector.py +30 -0
  26. data_transfer_lib-2.0.0/data_transfer_lib/schema/inspectors/__init__.py +15 -0
  27. data_transfer_lib-2.0.0/data_transfer_lib/schema/inspectors/clickhouse_jdbc.py +89 -0
  28. data_transfer_lib-2.0.0/data_transfer_lib/schema/inspectors/postgres_jdbc.py +101 -0
  29. data_transfer_lib-2.0.0/data_transfer_lib/schema/mapper.py +62 -0
  30. data_transfer_lib-2.0.0/data_transfer_lib/schema/mappers/__init__.py +6 -0
  31. data_transfer_lib-2.0.0/data_transfer_lib/schema/mappers/clickhouse.py +205 -0
  32. data_transfer_lib-2.0.0/data_transfer_lib/schema/mappers/kafka.py +36 -0
  33. data_transfer_lib-2.0.0/data_transfer_lib/schema/mappers/postgres.py +227 -0
  34. data_transfer_lib-2.0.0/data_transfer_lib/schema/mappers/spark.py +246 -0
  35. data_transfer_lib-2.0.0/data_transfer_lib/schema/validator.py +339 -0
  36. data_transfer_lib-2.0.0/data_transfer_lib/strategies/__init__.py +16 -0
  37. data_transfer_lib-2.0.0/data_transfer_lib/strategies/batch/__init__.py +1 -0
  38. data_transfer_lib-2.0.0/data_transfer_lib/strategies/batch/clickhouse_jdbc.py +268 -0
  39. data_transfer_lib-2.0.0/data_transfer_lib/strategies/batch/postgres_jdbc.py +188 -0
  40. data_transfer_lib-2.0.0/data_transfer_lib/strategies/micro_batch/__init__.py +1 -0
  41. data_transfer_lib-2.0.0/data_transfer_lib/strategies/micro_batch/clickhouse_jdbc.py +160 -0
  42. data_transfer_lib-2.0.0/data_transfer_lib/strategies/micro_batch/kafka.py +146 -0
  43. data_transfer_lib-2.0.0/data_transfer_lib.egg-info/PKG-INFO +31 -0
  44. data_transfer_lib-2.0.0/data_transfer_lib.egg-info/SOURCES.txt +48 -0
  45. data_transfer_lib-2.0.0/data_transfer_lib.egg-info/requires.txt +16 -0
  46. data_transfer_lib-2.0.0/pyproject.toml +44 -0
  47. data_transfer_lib-0.1.2/PKG-INFO +0 -30
  48. data_transfer_lib-0.1.2/data_transfer_lib/__init__.py +0 -11
  49. data_transfer_lib-0.1.2/data_transfer_lib/connections/__init__.py +0 -4
  50. data_transfer_lib-0.1.2/data_transfer_lib/connections/base.py +0 -39
  51. data_transfer_lib-0.1.2/data_transfer_lib/connections/clickhouse.py +0 -68
  52. data_transfer_lib-0.1.2/data_transfer_lib/connections/postgres.py +0 -82
  53. data_transfer_lib-0.1.2/data_transfer_lib/reader/__init__.py +0 -3
  54. data_transfer_lib-0.1.2/data_transfer_lib/reader/base.py +0 -25
  55. data_transfer_lib-0.1.2/data_transfer_lib/reader/reader.py +0 -45
  56. data_transfer_lib-0.1.2/data_transfer_lib/schema/__init__.py +0 -4
  57. data_transfer_lib-0.1.2/data_transfer_lib/schema/mapper.py +0 -241
  58. data_transfer_lib-0.1.2/data_transfer_lib/schema/validator.py +0 -107
  59. data_transfer_lib-0.1.2/data_transfer_lib/utils/__init__.py +0 -13
  60. data_transfer_lib-0.1.2/data_transfer_lib/utils/exceptions.py +0 -14
  61. data_transfer_lib-0.1.2/data_transfer_lib/writer/__init__.py +0 -3
  62. data_transfer_lib-0.1.2/data_transfer_lib/writer/base.py +0 -27
  63. data_transfer_lib-0.1.2/data_transfer_lib/writer/writer.py +0 -72
  64. data_transfer_lib-0.1.2/data_transfer_lib.egg-info/PKG-INFO +0 -30
  65. data_transfer_lib-0.1.2/data_transfer_lib.egg-info/SOURCES.txt +0 -23
  66. data_transfer_lib-0.1.2/data_transfer_lib.egg-info/requires.txt +0 -1
  67. data_transfer_lib-0.1.2/setup.py +0 -29
  68. {data_transfer_lib-0.1.2 → data_transfer_lib-2.0.0}/README.md +0 -0
  69. {data_transfer_lib-0.1.2 → data_transfer_lib-2.0.0}/data_transfer_lib.egg-info/dependency_links.txt +0 -0
  70. {data_transfer_lib-0.1.2 → data_transfer_lib-2.0.0}/data_transfer_lib.egg-info/top_level.txt +0 -0
  71. {data_transfer_lib-0.1.2 → data_transfer_lib-2.0.0}/setup.cfg +0 -0
@@ -0,0 +1,31 @@ data_transfer_lib-2.0.0/PKG-INFO
+ Metadata-Version: 2.4
+ Name: data-transfer-lib
+ Version: 2.0.0
+ Summary: Library for data transfer between databases using PySpark
+ Author-email: llirikh <zhukov.kg@phystech.edu>
+ License: MIT
+ Project-URL: Homepage, https://github.com/llirikh/data_transfer_lib
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: Topic :: Database
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ Requires-Dist: pyspark>=3.3.0
+ Provides-Extra: prometheus
+ Requires-Dist: prometheus_client>=0.20; extra == "prometheus"
+ Provides-Extra: gx
+ Requires-Dist: great_expectations>=0.18; extra == "gx"
+ Provides-Extra: yaml
+ Requires-Dist: pyyaml>=6.0; extra == "yaml"
+ Provides-Extra: pydantic
+ Requires-Dist: pydantic>=2.0; extra == "pydantic"
+ Provides-Extra: dev
+ Requires-Dist: pytest>=8; extra == "dev"
+
+ # Project description
+
+ Project description will be here
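The Provides-Extra entries above map straight onto pip's extras syntax. For example, to pull in the optional Prometheus metrics and Great Expectations integrations (standard pip usage; only the extra names come from the metadata above):

    pip install "data-transfer-lib[prometheus,gx]"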
@@ -0,0 +1,144 @@ data_transfer_lib-2.0.0/data_transfer_lib/__init__.py
+ """data_transfer_lib v2 — public façade.
+
+ The new architecture (see ``architecture/`` at the repo root) is being landed
+ in phases. Phase 1 ships the skeleton: contracts, registries, lifecycle bus.
+ Concrete strategies (Postgres, ClickHouse, Kafka, …) ship in Phase 3+.
+ """
+
+ from data_transfer_lib._bootstrap import _load_builtins, _load_entry_points
+
+ _load_builtins()
+ _load_entry_points()
+
+ # Public façade -------------------------------------------------------------
+
+ from data_transfer_lib.api.reader import Reader
+ from data_transfer_lib.api.writer import Writer
+ from data_transfer_lib.api.engine_pipeline import EnginePipeline
+ from data_transfer_lib.connections import ClickHouse, Kafka, Postgres
+ from data_transfer_lib.core.connection import Connection
+ from data_transfer_lib.extensions.logging import LoggingExtension
+ from data_transfer_lib.core.io import (
+     Mode,
+     ReadOptions,
+     Source,
+     Target,
+     WriteOptions,
+ )
+ from data_transfer_lib.core.lifecycle import Extension, hook
+ from data_transfer_lib.core.exceptions import (
+     AuthenticationError,
+     BatchReadError,
+     BatchWriteError,
+     ConfigurationError,
+     ConnectionError,
+     ConnectionRefusedError,
+     DataTransferError,
+     DDLExecutionError,
+     DriverError,
+     ExtensionError,
+     ExtraColumnError,
+     HookError,
+     IncompatibleTypeError,
+     InvalidOptionError,
+     MissingColumnError,
+     NativePipelineError,
+     NullabilityError,
+     PipelineDeployError,
+     PipelineTeardownError,
+     PluginRegistrationError,
+     ReadError,
+     SchemaError,
+     SchemaIntrospectionError,
+     SchemaValidationError,
+     StreamingReadError,
+     StreamingWriteError,
+     TimeoutError,
+     TypeMappingError,
+     UnsupportedModeError,
+     WriteError,
+     WriteModeConflictError,
+     ErrorContext,
+     # Deprecation aliases (removed in 2.2)
+     ConnectionException,
+     DataTransferException,
+     SchemaValidationException,
+     TypeMappingException,
+ )
+ from data_transfer_lib.registry import (
+     register_engine_pipeline,
+     register_extension,
+     register_source,
+     register_target,
+     register_type_mapper,
+     unregister_extension,
+ )
+
+ __version__ = "2.0.0"
+
+ __all__ = [
+     "__version__",
+     # API
+     "Reader",
+     "Writer",
+     "EnginePipeline",
+     "Mode",
+     "ReadOptions",
+     "WriteOptions",
+     "Source",
+     "Target",
+     "Connection",
+     # Connections
+     "Postgres",
+     "ClickHouse",
+     "Kafka",
+     # Lifecycle
+     "Extension",
+     "hook",
+     "LoggingExtension",
+     # Registration
+     "register_source",
+     "register_target",
+     "register_engine_pipeline",
+     "register_type_mapper",
+     "register_extension",
+     "unregister_extension",
+     # Exceptions
+     "DataTransferError",
+     "ErrorContext",
+     "ConfigurationError",
+     "UnsupportedModeError",
+     "InvalidOptionError",
+     "PluginRegistrationError",
+     "ConnectionError",
+     "ConnectionRefusedError",
+     "AuthenticationError",
+     "DriverError",
+     "TimeoutError",
+     "SchemaError",
+     "SchemaIntrospectionError",
+     "SchemaValidationError",
+     "MissingColumnError",
+     "ExtraColumnError",
+     "IncompatibleTypeError",
+     "NullabilityError",
+     "TypeMappingError",
+     "ReadError",
+     "BatchReadError",
+     "StreamingReadError",
+     "WriteError",
+     "BatchWriteError",
+     "StreamingWriteError",
+     "WriteModeConflictError",
+     "NativePipelineError",
+     "DDLExecutionError",
+     "PipelineDeployError",
+     "PipelineTeardownError",
+     "ExtensionError",
+     "HookError",
+     # Deprecated
+     "DataTransferException",
+     "ConnectionException",
+     "SchemaValidationException",
+     "TypeMappingException",
+ ]
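Note the deprecation aliases at the bottom of both lists: the 1.x exception names remain importable but are slated for removal in 2.2. A minimal migration sketch (assuming the aliases behave as drop-in stand-ins for their Error counterparts, as the comment suggests; run_pipeline is a hypothetical placeholder):

    # before (1.x): from data_transfer_lib import DataTransferException
    from data_transfer_lib import DataTransferError

    try:
        run_pipeline()  # any Reader/Writer work
    except DataTransferError as exc:
        print(f"transfer failed: {exc}")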
@@ -0,0 +1,63 @@ data_transfer_lib-2.0.0/data_transfer_lib/_bootstrap.py
+ """One-shot bootstrap: imports in-tree strategy modules (registering them
+ as a side effect), wires the default ``LoggingExtension``, and walks the
+ ``data_transfer_lib.connectors`` entry-points group for third-party
+ connectors.
+
+ Phase 3+: ``strategies/__init__.py`` imports each batch/streaming/engine
+ module so their bottom-of-file ``register_*`` calls fire on first import
+ of the package.
+ """
+
+ from __future__ import annotations
+
+ from data_transfer_lib.core.exceptions import PluginRegistrationError
+ from data_transfer_lib.core.logging import get_logger
+
+ _log = get_logger(__name__)
+
+
+ def _load_builtins() -> None:
+     """Import in-tree strategy modules to trigger their registration, then
+     install the default ``LoggingExtension`` if it isn't already on the bus."""
+     try:
+         import data_transfer_lib.strategies  # noqa: F401
+     except ImportError as exc:  # pragma: no cover - dev-only
+         _log.debug("strategies package import failed: %s", exc)
+
+     from data_transfer_lib.extensions.logging import LoggingExtension
+     from data_transfer_lib.registry import HOOK_BUS, register_extension
+
+     default = LoggingExtension.default()
+     if default not in HOOK_BUS.extensions:
+         register_extension(default)
+
+
+ def _load_entry_points() -> None:
+     """Walk the ``data_transfer_lib.connectors`` entry-points group."""
+     try:
+         from importlib.metadata import entry_points
+     except ImportError:  # pragma: no cover - py < 3.10 unsupported
+         return
+
+     try:
+         eps = entry_points(group="data_transfer_lib.connectors")
+     except TypeError:  # pragma: no cover - very old Python
+         eps = entry_points().get("data_transfer_lib.connectors", [])  # type: ignore[union-attr]
+
+     for ep in eps:
+         try:
+             register = ep.load()
+         except Exception as exc:  # noqa: BLE001
+             _log.exception("failed to load entry point %s", ep)
+             raise PluginRegistrationError(
+                 f"failed to load entry point {ep.name!r}"
+             ) from exc
+         try:
+             register()
+         except Exception as exc:  # noqa: BLE001
+             raise PluginRegistrationError(
+                 f"entry point {ep.name!r} raised during register()"
+             ) from exc
+
+
+ __all__ = ["_load_builtins", "_load_entry_points"]
@@ -0,0 +1,7 @@ data_transfer_lib-2.0.0/data_transfer_lib/api/__init__.py
+ """Public façade (L6): Reader, Writer, EnginePipeline."""
+
+ from data_transfer_lib.api.reader import Reader
+ from data_transfer_lib.api.writer import Writer
+ from data_transfer_lib.api.engine_pipeline import EnginePipeline
+
+ __all__ = ["Reader", "Writer", "EnginePipeline"]
@@ -0,0 +1,57 @@ data_transfer_lib-2.0.0/data_transfer_lib/api/engine_pipeline.py
+ """User-facing wrapper around concrete ``EnginePipeline`` strategies.
+
+ The base class enumerates **no** specific kinds. Adding a new pipeline is a
+ new file under ``strategies/engine/`` that registers itself via
+ ``register_engine_pipeline(...)``.
+ """
+
+ from __future__ import annotations
+
+ from typing import Any, Optional
+
+ from data_transfer_lib.core.connection import Connection
+ from data_transfer_lib.core.io import EnginePipeline as _EnginePipelineBase
+ from data_transfer_lib.core.lifecycle import (
+     ON_ENGINE_DEPLOY,
+     ON_ENGINE_TEARDOWN,
+ )
+ from data_transfer_lib.registry import ENGINE_PIPELINE_REGISTRY, HOOK_BUS
+
+
+ class EnginePipeline(_EnginePipelineBase):
+     """User-facing engine pipeline base. Use ``EnginePipeline.create(...)``
+     to construct a concrete pipeline through the registry."""
+
+     @classmethod
+     def create(
+         cls,
+         *,
+         source: Connection,
+         target: Connection,
+         kind: Optional[str] = None,
+         deploy_mode: str = "create_if_missing",
+         **options: Any,
+     ) -> "EnginePipeline":
+         impl_cls = ENGINE_PIPELINE_REGISTRY.get(
+             source_cls=type(source), target_cls=type(target), kind=kind
+         )
+         return impl_cls(
+             source=source, target=target, deploy_mode=deploy_mode, **options
+         )
+
+     def deploy(self) -> None:  # pragma: no cover - default for subclasses
+         raise NotImplementedError
+
+     def teardown(self) -> None:  # pragma: no cover
+         raise NotImplementedError
+
+     # Convenience wrappers that fire lifecycle hooks ------------------------
+
+     def _fire_deploy(self) -> None:
+         HOOK_BUS.fire(ON_ENGINE_DEPLOY, self)
+
+     def _fire_teardown(self) -> None:
+         HOOK_BUS.fire(ON_ENGINE_TEARDOWN, self)
+
+
+ __all__ = ["EnginePipeline"]
@@ -0,0 +1,118 @@ data_transfer_lib-2.0.0/data_transfer_lib/api/reader.py
+ """Thin Reader façade — dispatches to a strategy registered in
+ ``SourceRegistry``."""
+
+ from __future__ import annotations
+
+ from typing import Any, Mapping, Optional, Sequence, Union
+
+ from data_transfer_lib.core.connection import Connection
+ from data_transfer_lib.core.exceptions import ErrorContext, HookError
+ from data_transfer_lib.core.io import Mode, ReadOptions
+ from data_transfer_lib.core.lifecycle import (
+     AFTER_READ,
+     BEFORE_READ,
+     ON_ERROR,
+     Extension,
+     LifecycleHookBus,
+ )
+ from data_transfer_lib.registry import HOOK_BUS, SOURCE_REGISTRY
+
+
+ class Reader:
+     """Read a source via the strategy registered for ``(connection, mode)``.
+
+     Construction validates the (connection, mode) combination via the
+     registry. Schema introspection and the actual read are deferred to
+     ``read()`` so constructors stay cheap.
+     """
+
+     def __init__(
+         self,
+         connection: Connection,
+         source: str,
+         *,
+         mode: Union[Mode, str] = Mode.BATCH,
+         options: Union[ReadOptions, Mapping[str, Any], None] = None,
+         validate_schema: bool = True,
+         extensions: Optional[Sequence[Extension]] = None,
+     ) -> None:
+         self.connection = connection
+         self.source = source
+         self.mode = Mode.coerce(mode)
+         self.options = ReadOptions.coerce(options)
+         self.validate_schema = validate_schema
+         self._strategy_cls = SOURCE_REGISTRY.get(connection, self.mode)
+         self._local_bus = LifecycleHookBus()
+         for ext in extensions or ():
+             self._local_bus.register(ext)
+         self._strategy: Any = None
+         self._source_schema: Any = None
+
+     def introspect(self) -> Any:
+         """Return the canonical source schema. Cached after first call."""
+         if self._source_schema is None:
+             strategy = self._ensure_strategy()
+             self._source_schema = strategy.introspect()
+             if self.options.type_overrides:
+                 self._source_schema = _apply_type_overrides(
+                     self._source_schema, self.options.type_overrides
+                 )
+         return self._source_schema
+
+     def read(self) -> Any:
+         strategy = self._ensure_strategy()
+         if self.validate_schema:
+             # Surfaces SchemaIntrospectionError eagerly per architecture/04 Ex.8.
+             self.introspect()
+         try:
+             HOOK_BUS.fire(BEFORE_READ, self)
+             self._local_bus.fire(BEFORE_READ, self)
+             df = strategy.read()
+         except HookError:
+             raise
+         except Exception as exc:
+             self._fire_on_error(exc)
+             raise
+         HOOK_BUS.fire(AFTER_READ, self, df)
+         self._local_bus.fire(AFTER_READ, self, df)
+         return df
+
+     def _ensure_strategy(self) -> Any:
+         if self._strategy is None:
+             self._strategy = self._strategy_cls(
+                 connection=self.connection,
+                 source=self.source,
+                 options=self.options,
+             )
+         return self._strategy
+
+     def _fire_on_error(self, exc: BaseException) -> None:
+         ctx = ErrorContext(
+             connection=type(self.connection).__name__,
+             source=self.source,
+             mode=self.mode.value,
+         )
+         try:
+             HOOK_BUS.fire(ON_ERROR, self, exc, ctx)
+             self._local_bus.fire(ON_ERROR, self, exc, ctx)
+         except Exception:  # noqa: BLE001 - on_error swallows itself
+             pass
+
+
+ def _apply_type_overrides(schema: Any, overrides: Mapping[str, Any]) -> Any:
+     from data_transfer_lib.schema.canonical import CanonicalField, CanonicalSchema
+
+     return CanonicalSchema(
+         fields=tuple(
+             CanonicalField(
+                 name=f.name,
+                 type=overrides.get(f.name, f.type),
+                 nullable=f.nullable,
+                 description=f.description,
+             )
+             for f in schema.fields
+         )
+     )
+
+
+ __all__ = ["Reader"]
@@ -0,0 +1,132 @@ data_transfer_lib-2.0.0/data_transfer_lib/api/writer.py
+ """Thin Writer façade — dispatches to a strategy registered in
+ ``TargetRegistry``."""
+
+ from __future__ import annotations
+
+ from typing import Any, Mapping, Optional, Sequence, Union
+
+ from data_transfer_lib.core.connection import Connection
+ from data_transfer_lib.core.exceptions import (
+     ErrorContext,
+     HookError,
+     SchemaIntrospectionError,
+ )
+ from data_transfer_lib.core.io import Mode, WriteOptions
+ from data_transfer_lib.core.lifecycle import (
+     AFTER_WRITE,
+     BEFORE_WRITE,
+     ON_ERROR,
+     Extension,
+     LifecycleHookBus,
+ )
+ from data_transfer_lib.registry import HOOK_BUS, TARGET_REGISTRY
+
+ WriteMode = str  # one of: append, overwrite, create_if_not_exists, upsert
+
+
+ class Writer:
+     """Write to a target via the strategy registered for ``(connection, mode)``."""
+
+     def __init__(
+         self,
+         connection: Connection,
+         target: str,
+         *,
+         mode: Union[Mode, str] = Mode.BATCH,
+         write_mode: WriteMode = "append",
+         options: Union[WriteOptions, Mapping[str, Any], None] = None,
+         validate_schema: bool = True,
+         extensions: Optional[Sequence[Extension]] = None,
+     ) -> None:
+         self.connection = connection
+         self.target = target
+         self.mode = Mode.coerce(mode)
+         self.write_mode = write_mode
+         self.options = WriteOptions.coerce(options)
+         self.validate_schema = validate_schema
+         self._strategy_cls = TARGET_REGISTRY.get(connection, self.mode)
+         self._local_bus = LifecycleHookBus()
+         for ext in extensions or ():
+             self._local_bus.register(ext)
+         self._strategy: Any = None
+
+     def write(self, df: Any) -> Any:
+         strategy = self._strategy_cls(
+             connection=self.connection,
+             target=self.target,
+             write_mode=self.write_mode,
+             options=self.options,
+         )
+         self._strategy = strategy
+         if self.validate_schema:
+             self._validate(df, strategy)
+         try:
+             HOOK_BUS.fire(BEFORE_WRITE, self, df)
+             self._local_bus.fire(BEFORE_WRITE, self, df)
+             result = strategy.write(df)
+         except HookError:
+             raise
+         except Exception as exc:
+             self._fire_on_error(exc)
+             raise
+         HOOK_BUS.fire(AFTER_WRITE, self, result)
+         self._local_bus.fire(AFTER_WRITE, self, result)
+         return result
+
+     def _validate(self, df: Any, strategy: Any) -> None:
+         from data_transfer_lib.schema.mappers.spark import SparkTypeMapper
+         from data_transfer_lib.schema.validator import SchemaValidator
+
+         df_schema = SparkTypeMapper().schema_to_canonical(df.schema)
+         if self.options.type_overrides:
+             df_schema = _apply_type_overrides(df_schema, self.options.type_overrides)
+
+         # ``create_if_not_exists`` skips target validation when the table
+         # doesn't exist yet — the strategy creates it from df_schema.
+         # ``overwrite`` also skips validation on a missing table and lets
+         # the strategy raise WriteModeConflictError.
+         try:
+             target_schema = strategy.introspect()
+         except SchemaIntrospectionError:
+             if self.write_mode in ("create_if_not_exists", "overwrite"):
+                 return
+             raise
+
+         report = SchemaValidator.validate(df_schema, target_schema, mode="loose")
+         # For append/overwrite the DB engine enforces nullability at write time.
+         # Spark JDBC reports all columns as nullable regardless of source
+         # constraints, so suppressing these errors avoids false positives.
+         if self.write_mode in ("append", "overwrite"):
+             report.errors = [e for e in report.errors if e.code != "nullable"]
+         report.raise_if_errors()
+
+     def _fire_on_error(self, exc: BaseException) -> None:
+         ctx = ErrorContext(
+             connection=type(self.connection).__name__,
+             source=self.target,
+             mode=self.mode.value,
+         )
+         try:
+             HOOK_BUS.fire(ON_ERROR, self, exc, ctx)
+             self._local_bus.fire(ON_ERROR, self, exc, ctx)
+         except Exception:  # noqa: BLE001
+             pass
+
+
+ def _apply_type_overrides(schema: Any, overrides: Mapping[str, Any]) -> Any:
+     from data_transfer_lib.schema.canonical import CanonicalField, CanonicalSchema
+
+     return CanonicalSchema(
+         fields=tuple(
+             CanonicalField(
+                 name=f.name,
+                 type=overrides.get(f.name, f.type),
+                 nullable=f.nullable,
+                 description=f.description,
+             )
+             for f in schema.fields
+         )
+     )
+
+
+ __all__ = ["Writer"]
@@ -0,0 +1,7 @@ data_transfer_lib-2.0.0/data_transfer_lib/connections/__init__.py
+ """Per-system Connection implementations."""
+
+ from data_transfer_lib.connections.clickhouse import ClickHouse
+ from data_transfer_lib.connections.kafka import Kafka
+ from data_transfer_lib.connections.postgres import Postgres
+
+ __all__ = ["Postgres", "ClickHouse", "Kafka"]
@@ -0,0 +1,78 @@ data_transfer_lib-2.0.0/data_transfer_lib/connections/clickhouse.py
+ """Slim ClickHouse connection (v2).
+
+ Holds the credentials, JDBC URL builder, and an optional Spark session.
+ Schema introspection lives in ``schema/inspectors/clickhouse_jdbc.py``;
+ write logic lives in ``strategies/batch/clickhouse_jdbc.py``.
+ """
+
+ from __future__ import annotations
+
+ from typing import Any, Dict, Mapping, Optional
+
+ from data_transfer_lib.core.connection import Connection
+ from data_transfer_lib.core.exceptions import ConnectionError, ErrorContext
+
+
+ JDBC_DRIVER = "com.clickhouse.jdbc.ClickHouseDriver"
+
+
+ class ClickHouse(Connection):
+     def __init__(
+         self,
+         host: str,
+         user: str,
+         password: str,
+         database: str = "default",
+         port: int = 8123,
+         spark: Optional[Any] = None,
+         extra: Optional[Mapping[str, str]] = None,
+     ) -> None:
+         self.host = host
+         self.port = port
+         self.user = user
+         self.password = password
+         self.database = database
+         self.spark = spark
+         self.extra: Dict[str, str] = dict(extra or {})
+
+     def jdbc_url(self) -> str:
+         return f"jdbc:clickhouse://{self.host}:{self.port}/{self.database}"
+
+     def jdbc_properties(self) -> Dict[str, str]:
+         props: Dict[str, str] = {
+             "user": self.user,
+             "password": self.password,
+             "driver": JDBC_DRIVER,
+         }
+         props.update(self.extra)
+         return props
+
+     def test_connection(self) -> bool:
+         if self.spark is None:
+             raise ConnectionError(
+                 "ClickHouse.test_connection requires a SparkSession",
+                 context=ErrorContext(connection="ClickHouse"),
+             )
+         try:
+             (
+                 self.spark.read.format("jdbc")
+                 .option("url", self.jdbc_url())
+                 .option("query", "SELECT 1")
+                 .option("user", self.user)
+                 .option("password", self.password)
+                 .option("driver", JDBC_DRIVER)
+                 .load()
+                 .collect()
+             )
+         except Exception as exc:  # noqa: BLE001
+             raise ConnectionError(
+                 f"ClickHouse connection check failed: {exc}",
+                 context=ErrorContext(connection="ClickHouse"),
+             ) from exc
+         return True
+
+     def __repr__(self) -> str:  # pragma: no cover - cosmetic
+         return f"ClickHouse(host={self.host!r}, database={self.database!r})"
+
+
+ __all__ = ["ClickHouse", "JDBC_DRIVER"]