killuhub 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. killuhub/__init__.py +74 -0
  2. killuhub/connectors/__init__.py +18 -0
  3. killuhub/connectors/kafka/__init__.py +3 -0
  4. killuhub/connectors/kafka/connector.py +88 -0
  5. killuhub/connectors/mysql/__init__.py +3 -0
  6. killuhub/connectors/mysql/connector.py +86 -0
  7. killuhub/connectors/postgres/__init__.py +3 -0
  8. killuhub/connectors/postgres/connector.py +59 -0
  9. killuhub/connectors/rest_api/__init__.py +3 -0
  10. killuhub/connectors/rest_api/connector.py +167 -0
  11. killuhub/core/__init__.py +36 -0
  12. killuhub/core/batch.py +135 -0
  13. killuhub/core/config.py +38 -0
  14. killuhub/core/connector_interface.py +45 -0
  15. killuhub/core/contract.py +379 -0
  16. killuhub/core/engine_interface.py +37 -0
  17. killuhub/core/environment.py +82 -0
  18. killuhub/core/exceptions.py +40 -0
  19. killuhub/core/registry.py +70 -0
  20. killuhub/core/storage_interface.py +30 -0
  21. killuhub/ingestion/__init__.py +4 -0
  22. killuhub/ingestion/pipeline.py +141 -0
  23. killuhub/ingestion/scheduler.py +155 -0
  24. killuhub/layers/__init__.py +9 -0
  25. killuhub/layers/bronze/__init__.py +3 -0
  26. killuhub/layers/bronze/pipeline.py +206 -0
  27. killuhub/layers/silver/__init__.py +34 -0
  28. killuhub/layers/silver/pipeline.py +373 -0
  29. killuhub/layers/silver/state.py +239 -0
  30. killuhub/layers/silver/transformations.py +259 -0
  31. killuhub/layers/streaming/__init__.py +3 -0
  32. killuhub/layers/streaming/pipeline.py +236 -0
  33. killuhub/processing/__init__.py +8 -0
  34. killuhub/processing/flink_engine.py +84 -0
  35. killuhub/processing/spark_engine.py +236 -0
  36. killuhub/storage/__init__.py +8 -0
  37. killuhub/storage/delta/__init__.py +3 -0
  38. killuhub/storage/delta/writer.py +57 -0
  39. killuhub/storage/hudi/__init__.py +3 -0
  40. killuhub/storage/hudi/writer.py +47 -0
  41. killuhub/storage/iceberg/__init__.py +4 -0
  42. killuhub/storage/iceberg/schema_manager.py +112 -0
  43. killuhub/storage/iceberg/writer.py +110 -0
  44. killuhub-0.1.0.dist-info/METADATA +520 -0
  45. killuhub-0.1.0.dist-info/RECORD +47 -0
  46. killuhub-0.1.0.dist-info/WHEEL +5 -0
  47. killuhub-0.1.0.dist-info/top_level.txt +1 -0
killuhub/__init__.py ADDED
@@ -0,0 +1,74 @@
1
+ """
2
+ KilluHub — Pluggable data ingestion framework.
3
+
4
+ Sources → Connectors → Engine (Spark/Flink) → Bronze → Silver → Iceberg
5
+
6
+ Quick start (raw pipeline):
7
+ from killuhub import Pipeline, PipelineConfig, ConnectorConfig
8
+
9
+ Pipeline(PipelineConfig(
10
+ connector_name="postgres",
11
+ connector_config=ConnectorConfig.from_dict({...}),
12
+ target_table="local.db.orders",
13
+ )).run()
14
+
15
+ Medallion (Bronze → Silver):
16
+ from killuhub import BronzePipeline, BronzeConfig, SilverPipeline, SilverConfig
17
+ from killuhub.core.batch import BatchMode, BatchConfig
18
+
19
+ BronzePipeline(BronzeConfig(
20
+ pipeline_config=PipelineConfig(...),
21
+ batch_config=BatchConfig(mode=BatchMode.FULL),
22
+ bronze_table="local.bronze.orders",
23
+ source_name="postgres.shop.orders",
24
+ )).run()
25
+
26
+ SilverPipeline(SilverConfig(
27
+ bronze_table="local.bronze.orders",
28
+ silver_table="local.silver.orders",
29
+ batch_config=BatchConfig(mode=BatchMode.INCREMENTAL),
30
+ key_columns=["order_id"],
31
+ date_columns=["created_at"],
32
+ )).run()
33
+ """
34
+ # Import subpackages so auto-registration runs
35
+ import killuhub.connectors # noqa: F401
36
+ import killuhub.processing # noqa: F401
37
+
38
+ from killuhub.core import (
39
+ BaseConnector,
40
+ BaseEngine,
41
+ BaseStorageWriter,
42
+ ConnectorConfig,
43
+ PipelineConfig,
44
+ BatchMode,
45
+ BatchConfig,
46
+ IncrementalState,
47
+ default_registry,
48
+ )
49
+ from killuhub.ingestion import Pipeline, PipelineScheduler
50
+ from killuhub.layers import BronzePipeline, BronzeConfig, SilverPipeline, SilverConfig
51
+
52
+ __version__ = "0.1.0"
53
+
54
+ __all__ = [
55
+ # Raw pipeline
56
+ "Pipeline",
57
+ "PipelineScheduler",
58
+ "PipelineConfig",
59
+ "ConnectorConfig",
60
+ # Medallion layers
61
+ "BronzePipeline",
62
+ "BronzeConfig",
63
+ "SilverPipeline",
64
+ "SilverConfig",
65
+ # Batch primitives
66
+ "BatchMode",
67
+ "BatchConfig",
68
+ "IncrementalState",
69
+ # Extension points
70
+ "BaseConnector",
71
+ "BaseEngine",
72
+ "BaseStorageWriter",
73
+ "default_registry",
74
+ ]
@@ -0,0 +1,18 @@
1
+ from killuhub.connectors.postgres.connector import PostgresConnector
2
+ from killuhub.connectors.mysql.connector import MySQLConnector
3
+ from killuhub.connectors.kafka.connector import KafkaConnector
4
+ from killuhub.connectors.rest_api.connector import RestApiConnector
5
+ from killuhub.core.registry import default_registry
6
+
7
# Auto-register all built-in connectors. The table keeps the registry name
# next to the class it maps to; registration order matches declaration order.
for _name, _connector_cls in (
    ("postgres", PostgresConnector),
    ("mysql", MySQLConnector),
    ("kafka", KafkaConnector),
    ("rest_api", RestApiConnector),
):
    default_registry.register_connector(_name, _connector_cls)
# Do not leave the loop variables behind in the package namespace.
del _name, _connector_cls
12
+
13
+ __all__ = [
14
+ "PostgresConnector",
15
+ "MySQLConnector",
16
+ "KafkaConnector",
17
+ "RestApiConnector",
18
+ ]
@@ -0,0 +1,3 @@
1
+ from killuhub.connectors.kafka.connector import KafkaConnector
2
+
3
+ __all__ = ["KafkaConnector"]
@@ -0,0 +1,88 @@
1
+ """
2
+ Kafka connector for KilluHub.
3
+
4
+ Required config keys:
5
+ bootstrap_servers — comma-separated broker list
6
+ topic — topic to consume
7
+ group_id — consumer group id
8
+
9
+ Optional:
10
+ auto_offset_reset (default "earliest")
11
+ max_records (int) — stop after N records (useful for batch jobs)
12
+ poll_timeout_ms (int, default 1000)
13
+ value_deserializer ("json" | "string", default "json")
14
+ """
15
+ import json
16
+ from typing import Any, Iterator
17
+
18
+ from killuhub.core.connector_interface import BaseConnector
19
+ from killuhub.core.config import ConnectorConfig
20
+
21
+
22
class KafkaConnector(BaseConnector):
    """Batch-style Kafka source built on ``confluent_kafka.Consumer``.

    Configuration is read from ``self.config``:

    * required: ``bootstrap_servers``, ``group_id``, ``topic``
    * optional: ``auto_offset_reset`` (default ``"earliest"``),
      ``max_records``, ``poll_timeout_ms`` (default 1000),
      ``value_deserializer`` (``"json"`` by default, anything else yields
      the decoded text under the ``"value"`` key)

    Auto-commit is disabled. The offset of a message is committed only after
    the caller resumes the generator past the record built from it, so a
    record that the caller never finishes handling is not committed.
    """

    def __init__(self, config: ConnectorConfig):
        super().__init__(config)
        self._consumer = None

    def connect(self) -> None:
        """Create the consumer and subscribe it to the configured topic.

        Raises:
            ImportError: if ``confluent-kafka`` is not installed.
        """
        try:
            from confluent_kafka import Consumer
        except ImportError as e:
            raise ImportError(
                "confluent-kafka is required for KafkaConnector. "
                "Install it with: pip install confluent-kafka"
            ) from e

        self._consumer = Consumer(
            {
                "bootstrap.servers": self.config.require("bootstrap_servers"),
                "group.id": self.config.require("group_id"),
                "auto.offset.reset": self.config.get("auto_offset_reset", "earliest"),
                "enable.auto.commit": False,
            }
        )
        self._consumer.subscribe([self.config.require("topic")])
        self._connected = True

    def extract(self) -> Iterator[dict[str, Any]]:
        """Yield one record per Kafka message until the stream looks drained.

        Iteration stops when a poll times out without a message, when a
        partition-EOF event is received, or once ``max_records`` records have
        been yielded (a falsy ``max_records`` means no cap).

        Messages whose value is ``None`` (tombstones) carry no payload to
        decode; they are committed and skipped, and do not count towards
        ``max_records``.

        Raises:
            RuntimeError: on any Kafka error other than partition EOF.
        """
        if not self._connected:
            self.connect()

        max_records = self.config.get("max_records", None)
        # Config is in milliseconds; poll() takes seconds.
        poll_timeout = self.config.get("poll_timeout_ms", 1_000) / 1_000
        deserializer = self.config.get("value_deserializer", "json")
        count = 0

        while True:
            msg = self._consumer.poll(timeout=poll_timeout)

            if msg is None:
                # No message within timeout — treat as end of stream for batch mode
                break

            if msg.error():
                from confluent_kafka import KafkaError

                # NOTE(review): partition EOF events are presumably only
                # delivered when `enable.partition.eof` is set, which
                # connect() does not do — confirm this branch is reachable.
                if msg.error().code() == KafkaError._PARTITION_EOF:
                    break
                raise RuntimeError(f"Kafka error: {msg.error()}")

            raw = msg.value()
            if raw is None:
                # Tombstone: there are no bytes to decode. Commit so the
                # message is not re-read, then move on.
                self._consumer.commit(message=msg)
                continue

            text = raw.decode("utf-8")
            if deserializer == "json":
                record = json.loads(text)
            else:
                record = {"value": text}

            yield record
            # Reached only after the caller asked for the next record.
            self._consumer.commit(message=msg)
            count += 1

            if max_records and count >= max_records:
                break

    def close(self) -> None:
        """Close the consumer, if any, and mark the connector disconnected."""
        if self._consumer:
            self._consumer.close()
            self._consumer = None
        self._connected = False
@@ -0,0 +1,3 @@
1
+ from killuhub.connectors.mysql.connector import MySQLConnector
2
+
3
+ __all__ = ["MySQLConnector"]
@@ -0,0 +1,86 @@
1
+ """
2
+ MySQL connector for KilluHub.
3
+
4
+ Uses mysql-connector-python with a server-side cursor (buffered=False) so large
5
+ tables are streamed row-by-row without loading the full result set into memory.
6
+
7
+ Required config keys:
8
+ host, database, user, password, query
9
+
10
+ Optional:
11
+ port (int, default 3306)
12
+ batch_size (int, default 1000) — rows fetched per round-trip
13
+ ssl_ca (str) — path to CA certificate for TLS connections
14
+ ssl_cert (str) — path to client certificate
15
+ ssl_key (str) — path to client private key
16
+ """
17
+ from typing import Any, Iterator
18
+
19
+ from killuhub.core.connector_interface import BaseConnector
20
+ from killuhub.core.config import ConnectorConfig
21
+
22
+
23
class MySQLConnector(BaseConnector):
    """Stream the rows of a configured SQL query out of MySQL.

    Connection settings and the query come from ``self.config``. Rows are
    read through an unbuffered dictionary cursor in chunks of ``batch_size``
    and yielded one at a time as plain dicts.
    """

    def __init__(self, config: ConnectorConfig):
        super().__init__(config)
        self._conn = None

    def connect(self) -> None:
        """Open the MySQL connection described by the config.

        TLS options are added only when configured: ``ssl_ca`` also turns on
        certificate verification, and ``ssl_cert`` makes ``ssl_key``
        mandatory.

        Raises:
            ImportError: if ``mysql-connector-python`` is not installed.
        """
        try:
            import mysql.connector
        except ImportError as e:
            raise ImportError(
                "mysql-connector-python is required for MySQLConnector. "
                "Install it with: pip install mysql-connector-python"
            ) from e

        cfg = self.config
        connect_args: dict[str, Any] = dict(
            host=cfg.require("host"),
            port=cfg.get("port", 3306),
            database=cfg.require("database"),
            user=cfg.require("user"),
            password=cfg.require("password"),
            # Lets the connection discard unread rows itself, so it stays
            # usable when a cursor is closed before everything was fetched.
            consume_results=True,
        )

        ca_path = cfg.get("ssl_ca")
        if ca_path:
            connect_args.update(ssl_ca=ca_path, ssl_verify_cert=True)

        client_cert = cfg.get("ssl_cert")
        if client_cert:
            connect_args["ssl_cert"] = client_cert
            connect_args["ssl_key"] = cfg.require("ssl_key")

        self._conn = mysql.connector.connect(**connect_args)
        self._connected = True

    def extract(self) -> Iterator[dict[str, Any]]:
        """Run the configured ``query`` and yield each row as a dict.

        Connects first if needed. The cursor is closed when iteration ends,
        whether it finished, failed, or was abandoned by the caller.
        """
        if not self._connected:
            self.connect()

        sql = self.config.require("query")
        chunk_size = self.config.get("batch_size", 1_000)

        # Unbuffered cursor: rows are pulled as they are consumed, and
        # fetchmany() caps how many are held in memory at once.
        cursor = self._conn.cursor(buffered=False, dictionary=True)
        try:
            cursor.execute(sql)
            while chunk := cursor.fetchmany(chunk_size):
                yield from map(dict, chunk)
        finally:
            cursor.close()

    def close(self) -> None:
        """Close the connection, if any, and mark the connector disconnected."""
        if self._conn:
            self._conn.close()
            self._conn = None
        self._connected = False
@@ -0,0 +1,3 @@
1
+ from killuhub.connectors.postgres.connector import PostgresConnector
2
+
3
+ __all__ = ["PostgresConnector"]
@@ -0,0 +1,59 @@
1
+ """
2
+ PostgreSQL connector for KilluHub.
3
+
4
+ Required config keys:
5
+ host, database, user, password, query
6
+
7
+ Optional:
8
+ port (int, default 5432); batch_size (int, default 1000) — rows fetched per server-side cursor fetch
9
+ """
10
+ from typing import Any, Iterator
11
+
12
+ from killuhub.core.connector_interface import BaseConnector
13
+ from killuhub.core.config import ConnectorConfig
14
+
15
+
16
class PostgresConnector(BaseConnector):
    """Stream the rows of a configured SQL query out of PostgreSQL.

    Configuration is read from ``self.config``:

    * required: ``host``, ``database``, ``user``, ``password``, ``query``
    * optional: ``port`` (default 5432), ``batch_size`` (default 1000)

    Rows are read through a named (server-side) psycopg2 cursor and yielded
    one at a time as dicts keyed by column name.
    """

    def __init__(self, config: ConnectorConfig):
        super().__init__(config)
        self._conn = None
        self._cursor = None

    def connect(self) -> None:
        """Open the psycopg2 connection described by the config.

        Raises:
            ImportError: if ``psycopg2`` is not installed.
        """
        try:
            import psycopg2
        except ImportError as e:
            raise ImportError(
                "psycopg2 is required for PostgresConnector. "
                "Install it with: pip install psycopg2-binary"
            ) from e

        self._conn = psycopg2.connect(
            host=self.config.require("host"),
            port=self.config.get("port", 5432),
            dbname=self.config.require("database"),
            user=self.config.require("user"),
            password=self.config.require("password"),
        )
        self._connected = True

    def extract(self) -> Iterator[dict[str, Any]]:
        """Run the configured ``query`` and yield each row as a dict.

        Connects first if needed. The cursor is closed when iteration ends,
        including when the caller abandons the generator early.
        """
        if not self._connected:
            self.connect()

        query = self.config.require("query")
        batch_size = self.config.get("batch_size", 1_000)

        # Named cursor enables server-side pagination — safe for large tables
        with self._conn.cursor(name="killuhub_cursor") as cur:
            cur.itersize = batch_size
            cur.execute(query)
            # A named cursor may not expose `description` until rows have
            # actually been fetched, so resolve the column names lazily once
            # the first row has arrived instead of right after execute().
            columns = None
            for row in cur:
                if columns is None:
                    columns = [desc[0] for desc in cur.description]
                yield dict(zip(columns, row))

    def close(self) -> None:
        """Close the connection, if any, and mark the connector disconnected."""
        if self._conn:
            self._conn.close()
            self._conn = None
        self._connected = False
@@ -0,0 +1,3 @@
1
+ from killuhub.connectors.rest_api.connector import RestApiConnector
2
+
3
+ __all__ = ["RestApiConnector"]
@@ -0,0 +1,167 @@
1
+ """
2
+ REST API connector for KilluHub.
3
+
4
+ Handles paginated JSON APIs with configurable pagination strategies.
5
+
6
+ Required config keys:
7
+ url — base endpoint URL
8
+
9
+ Optional:
10
+ method ("GET" | "POST", default "GET")
11
+ headers (dict)
12
+ params (dict) — query string params
13
+ body (dict) — request body for POST
14
+ auth_type ("bearer" | "basic" | None)
15
+ auth_token — used with auth_type "bearer"
16
+ auth_user — used with auth_type "basic"
17
+ auth_password — used with auth_type "basic"
18
+ data_key — JSON key that holds the records list (e.g. "data", "results")
19
+ pagination ("none" | "page" | "cursor" | "offset", default "none")
20
+ page_param — query param name for page number (default "page")
21
+ page_size_param — query param name for page size (default "page_size"; "limit" when pagination is "offset")
22
+ page_size (int, default 100)
23
+ cursor_key — JSON key that holds next cursor (default "next_cursor")
24
+ cursor_param — query param name for cursor (default "cursor")
25
+ max_pages (int) — safety cap on pagination depth
26
+ """
27
+ from typing import Any, Iterator
28
+
29
+ from killuhub.core.connector_interface import BaseConnector
30
+ from killuhub.core.config import ConnectorConfig
31
+
32
+
33
class RestApiConnector(BaseConnector):
    """Pull JSON records from an HTTP endpoint, optionally paginated.

    All settings are read from ``self.config``. Besides the keys used for
    the request itself (``url``, ``method``, ``headers``, ``params``,
    ``body``, ``auth_*``, ``data_key``, ``pagination`` and its tuning keys),
    two optional keys are honoured:

    * ``timeout`` — passed to ``requests`` as the request timeout when set;
      when absent no timeout argument is sent.
    * ``offset_param`` — query parameter name used by offset pagination
      (default ``"offset"``).
    """

    def __init__(self, config: ConnectorConfig):
        super().__init__(config)
        self._session = None

    def connect(self) -> None:
        """Create the HTTP session and apply headers and authentication.

        Raises:
            ImportError: if ``requests`` is not installed.
        """
        try:
            import requests
        except ImportError as e:
            raise ImportError(
                "requests is required for RestApiConnector. "
                "Install it with: pip install requests"
            ) from e

        self._session = requests.Session()

        # Work on a copy: the Authorization header must not be written back
        # into the caller's config dict. `or {}` also tolerates headers=None.
        headers = dict(self.config.get("headers") or {})
        auth_type = self.config.get("auth_type")

        if auth_type == "bearer":
            headers["Authorization"] = f"Bearer {self.config.require('auth_token')}"
        elif auth_type == "basic":
            from requests.auth import HTTPBasicAuth

            self._session.auth = HTTPBasicAuth(
                self.config.require("auth_user"),
                self.config.require("auth_password"),
            )

        self._session.headers.update(headers)
        self._connected = True

    def extract(self) -> Iterator[dict[str, Any]]:
        """Yield records using the strategy named by ``pagination``.

        Recognised values are ``"page"``, ``"cursor"`` and ``"offset"``; any
        other value results in a single request.
        """
        if not self._connected:
            self.connect()

        pagination = self.config.get("pagination", "none")

        if pagination == "page":
            yield from self._paginate_by_page()
        elif pagination == "cursor":
            yield from self._paginate_by_cursor()
        elif pagination == "offset":
            yield from self._paginate_by_offset()
        else:
            yield from self._fetch_single()

    def _request(self, params: dict | None = None) -> Any:
        """Send one request and return the decoded JSON body.

        ``params`` is merged over the configured ``params``. Any method other
        than GET is sent as a POST with the configured ``body`` as JSON.
        Non-2xx responses raise via ``raise_for_status()``.
        """
        method = self.config.get("method", "GET").upper()
        url = self.config.require("url")
        base_params = dict(self.config.get("params") or {})
        if params:
            base_params.update(params)

        request_kwargs: dict[str, Any] = {"params": base_params}
        timeout = self.config.get("timeout", None)
        if timeout is not None:
            # Only sent when configured, so calls are unchanged otherwise.
            request_kwargs["timeout"] = timeout

        if method == "GET":
            resp = self._session.get(url, **request_kwargs)
        else:
            resp = self._session.post(
                url, json=self.config.get("body"), **request_kwargs
            )
        resp.raise_for_status()
        return resp.json()

    def _extract_records(self, data: Any) -> list[dict[str, Any]]:
        """Pick the list of records out of a decoded response.

        With ``data_key`` configured the records are ``data[data_key]``
        (missing or null gives an empty list). Otherwise a list response is
        returned as is and any other response is wrapped in a one-item list.
        """
        data_key = self.config.get("data_key")
        if data_key:
            # `or []` keeps a JSON null from reaching `yield from`.
            return data.get(data_key) or []
        if isinstance(data, list):
            return data
        return [data]

    def _fetch_single(self) -> Iterator[dict[str, Any]]:
        """Yield the records of a single, unpaginated request."""
        yield from self._extract_records(self._request())

    def _paginate_by_page(self) -> Iterator[dict[str, Any]]:
        """Request page 1, 2, ... until a page is empty or ``max_pages`` is hit."""
        page_param = self.config.get("page_param", "page")
        size_param = self.config.get("page_size_param", "page_size")
        page_size = self.config.get("page_size", 100)
        max_pages = self.config.get("max_pages", None)
        page = 1

        while True:
            data = self._request({page_param: page, size_param: page_size})
            records = self._extract_records(data)
            if not records:
                break
            yield from records
            page += 1
            # `page` is 1-based and already advanced, hence `>`.
            if max_pages and page > max_pages:
                break

    def _paginate_by_cursor(self) -> Iterator[dict[str, Any]]:
        """Follow the response's next-cursor value until it runs out.

        Stops on an empty page, a missing or falsy cursor, or after
        ``max_pages`` requests. The first request carries no cursor parameter.
        """
        cursor_key = self.config.get("cursor_key", "next_cursor")
        cursor_param = self.config.get("cursor_param", "cursor")
        max_pages = self.config.get("max_pages", None)
        cursor = None
        page = 0

        while True:
            params = {cursor_param: cursor} if cursor else {}
            data = self._request(params)
            records = self._extract_records(data)
            if not records:
                break
            yield from records
            cursor = data.get(cursor_key)
            if not cursor:
                break
            page += 1
            if max_pages and page >= max_pages:
                break

    def _paginate_by_offset(self) -> Iterator[dict[str, Any]]:
        """Advance an offset by the number of records received each time.

        Stops on an empty page or after ``max_pages`` requests. Here the
        page-size parameter name defaults to ``"limit"``.
        """
        offset_param = self.config.get("offset_param", "offset")
        size_param = self.config.get("page_size_param", "limit")
        page_size = self.config.get("page_size", 100)
        max_pages = self.config.get("max_pages", None)
        offset = 0
        page = 0

        while True:
            data = self._request({offset_param: offset, size_param: page_size})
            records = self._extract_records(data)
            if not records:
                break
            yield from records
            # Advance by what actually came back, not by the requested size.
            offset += len(records)
            page += 1
            if max_pages and page >= max_pages:
                break

    def close(self) -> None:
        """Close the session, if any, and mark the connector disconnected."""
        if self._session:
            self._session.close()
            self._session = None
        self._connected = False
@@ -0,0 +1,36 @@
1
+ from killuhub.core.connector_interface import BaseConnector
2
+ from killuhub.core.engine_interface import BaseEngine
3
+ from killuhub.core.storage_interface import BaseStorageWriter
4
+ from killuhub.core.config import ConnectorConfig, PipelineConfig
5
+ from killuhub.core.registry import Registry, default_registry
6
+ from killuhub.core.exceptions import (
7
+ KilluHubError, BatchModeError, IncrementalStateError, ContractViolationError
8
+ )
9
+ from killuhub.core.batch import ExecutionMode, BatchMode, BatchConfig, StreamingConfig, IncrementalState
10
+ from killuhub.core.contract import ContractSpec, ColumnSpec, ContractValidator, ContractReport
11
+ from killuhub.core.environment import RuntimeEnvironment, detect as detect_environment
12
+
13
+ __all__ = [
14
+ "BaseConnector",
15
+ "BaseEngine",
16
+ "BaseStorageWriter",
17
+ "ConnectorConfig",
18
+ "PipelineConfig",
19
+ "Registry",
20
+ "default_registry",
21
+ "KilluHubError",
22
+ "BatchModeError",
23
+ "IncrementalStateError",
24
+ "ContractViolationError",
25
+ "ExecutionMode",
26
+ "BatchMode",
27
+ "BatchConfig",
28
+ "StreamingConfig",
29
+ "IncrementalState",
30
+ "ContractSpec",
31
+ "ColumnSpec",
32
+ "ContractValidator",
33
+ "ContractReport",
34
+ "RuntimeEnvironment",
35
+ "detect_environment",
36
+ ]
killuhub/core/batch.py ADDED
@@ -0,0 +1,135 @@
1
+ """
2
+ Batch and streaming processing primitives for KilluHub.
3
+
4
+ ExecutionMode controls whether a pipeline runs as a one-shot batch job or as a
5
+ continuous stream. Within batch mode, BatchMode controls whether all records are
6
+ processed (FULL) or only records added since the last run (INCREMENTAL).
7
+
8
+ These are core-layer types — no Spark, no I/O, no external dependencies.
9
+ """
10
+ from __future__ import annotations
11
+
12
+ from dataclasses import dataclass, field
13
+ from enum import Enum
14
+
15
+
16
class ExecutionMode(str, Enum):
    """How a pipeline is executed as a whole.

    Members:
        BATCH:     run once — read, transform, write, then exit.
        STREAMING: run continuously, handling records as they arrive
                   (Spark Structured Streaming or Flink).

    Members are also ``str`` instances, so they compare equal to their
    plain string values.
    """

    BATCH = "batch"
    STREAMING = "streaming"
26
+
27
+
28
class BatchMode(str, Enum):
    """Which records a BATCH run should cover.

    Members:
        FULL:        read/process every available record.
        INCREMENTAL: read only records newer than the last watermark cursor.

    ``str`` is mixed in on purpose: ``BatchMode.FULL == "full"`` holds, and
    the value can be written to JSON state files as a plain string without
    a custom encoder.
    """

    FULL = "full"
    INCREMENTAL = "incremental"
41
+
42
+
43
@dataclass
class BatchConfig:
    """Settings for one batch run (relevant only under ExecutionMode.BATCH).

    Attributes:
        mode:
            ``FULL`` processes every available record; ``INCREMENTAL``
            processes only records newer than the last watermark.

        watermark_column:
            Column that moves the incremental cursor forward. Bronze
            typically uses the source's own timestamp (e.g. ``updated_at``);
            Silver typically uses ``_ingested_at``, the Bronze metadata
            column, so that it filters on a column it controls whatever the
            source schema looks like.

        initial_watermark:
            ISO-8601 lower bound for the first INCREMENTAL run, when no
            state has been stored yet. The default (the epoch) means
            "process everything on first run".

        batch_id:
            Identifier of this run, stored in the ``_batch_id`` metadata
            column in Bronze. Left empty, Bronze generates a UUID at
            runtime; supplying a deterministic value makes reruns idempotent.
    """

    mode: BatchMode
    watermark_column: str = "updated_at"
    initial_watermark: str = "1970-01-01T00:00:00"
    batch_id: str = ""
75
+
76
+
77
@dataclass
class StreamingConfig:
    """Settings for a streaming run (relevant only under ExecutionMode.STREAMING).

    Attributes:
        trigger:
            Spark Structured Streaming trigger type:
              processingTime — microbatch every ``trigger_interval``.
              once           — one microbatch over all available data, then stop.
              availableNow   — like ``once`` but respects rate limits.
              continuous     — experimental low-latency continuous processing.

        trigger_interval:
            Microbatch interval, e.g. "30 seconds" or "1 minute". Only
            meaningful with the ``processingTime`` trigger.

        checkpoint_location:
            Where Spark keeps streaming checkpoint state. In production this
            must be durable storage (S3, HDFS, ADLS); it is needed for
            exactly-once guarantees and for restarting a job.

        output_mode:
            append   — write only the new rows of each microbatch (default, Iceberg).
            update   — write rows that changed (requires watermarking).
            complete — rewrite the full result every time (for aggregations).
    """

    trigger: str = "processingTime"
    trigger_interval: str = "30 seconds"
    checkpoint_location: str = "/tmp/killuhub-checkpoints"
    output_mode: str = "append"
108
+
109
+
110
@dataclass
class IncrementalState:
    """Stored watermark for an INCREMENTAL Silver run.

    There is one state record per (source_table, target_table) pair. Once a
    Silver run succeeds, ``last_watermark`` is set to the maximum value of
    the watermark column seen in that batch, so the following run picks up
    exactly where this one stopped.

    Attributes:
        source_table:   Fully qualified bronze Iceberg table name.
        target_table:   Fully qualified silver Iceberg table name.
        last_watermark: ISO-8601 string; the next incremental run filters
                        on ``watermark_column > last_watermark``.
        updated_at:     ISO-8601 string of when this state was written.
                        Informational only — never used to filter queries.
    """

    source_table: str
    target_table: str
    last_watermark: str
    updated_at: str = ""

    def state_key(self) -> str:
        """Return the stable ``source::target`` key used for JSON serialisation."""
        return "{}::{}".format(self.source_table, self.target_table)