killuhub 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- killuhub/__init__.py +74 -0
- killuhub/connectors/__init__.py +18 -0
- killuhub/connectors/kafka/__init__.py +3 -0
- killuhub/connectors/kafka/connector.py +88 -0
- killuhub/connectors/mysql/__init__.py +3 -0
- killuhub/connectors/mysql/connector.py +86 -0
- killuhub/connectors/postgres/__init__.py +3 -0
- killuhub/connectors/postgres/connector.py +59 -0
- killuhub/connectors/rest_api/__init__.py +3 -0
- killuhub/connectors/rest_api/connector.py +167 -0
- killuhub/core/__init__.py +36 -0
- killuhub/core/batch.py +135 -0
- killuhub/core/config.py +38 -0
- killuhub/core/connector_interface.py +45 -0
- killuhub/core/contract.py +379 -0
- killuhub/core/engine_interface.py +37 -0
- killuhub/core/environment.py +82 -0
- killuhub/core/exceptions.py +40 -0
- killuhub/core/registry.py +70 -0
- killuhub/core/storage_interface.py +30 -0
- killuhub/ingestion/__init__.py +4 -0
- killuhub/ingestion/pipeline.py +141 -0
- killuhub/ingestion/scheduler.py +155 -0
- killuhub/layers/__init__.py +9 -0
- killuhub/layers/bronze/__init__.py +3 -0
- killuhub/layers/bronze/pipeline.py +206 -0
- killuhub/layers/silver/__init__.py +34 -0
- killuhub/layers/silver/pipeline.py +373 -0
- killuhub/layers/silver/state.py +239 -0
- killuhub/layers/silver/transformations.py +259 -0
- killuhub/layers/streaming/__init__.py +3 -0
- killuhub/layers/streaming/pipeline.py +236 -0
- killuhub/processing/__init__.py +8 -0
- killuhub/processing/flink_engine.py +84 -0
- killuhub/processing/spark_engine.py +236 -0
- killuhub/storage/__init__.py +8 -0
- killuhub/storage/delta/__init__.py +3 -0
- killuhub/storage/delta/writer.py +57 -0
- killuhub/storage/hudi/__init__.py +3 -0
- killuhub/storage/hudi/writer.py +47 -0
- killuhub/storage/iceberg/__init__.py +4 -0
- killuhub/storage/iceberg/schema_manager.py +112 -0
- killuhub/storage/iceberg/writer.py +110 -0
- killuhub-0.1.0.dist-info/METADATA +520 -0
- killuhub-0.1.0.dist-info/RECORD +47 -0
- killuhub-0.1.0.dist-info/WHEEL +5 -0
- killuhub-0.1.0.dist-info/top_level.txt +1 -0
killuhub/__init__.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""
|
|
2
|
+
KilluHub — Pluggable data ingestion framework.
|
|
3
|
+
|
|
4
|
+
Sources → Connectors → Engine (Spark/Flink) → Bronze → Silver → Iceberg
|
|
5
|
+
|
|
6
|
+
Quick start (raw pipeline):
|
|
7
|
+
from killuhub import Pipeline, PipelineConfig, ConnectorConfig
|
|
8
|
+
|
|
9
|
+
Pipeline(PipelineConfig(
|
|
10
|
+
connector_name="postgres",
|
|
11
|
+
connector_config=ConnectorConfig.from_dict({...}),
|
|
12
|
+
target_table="local.db.orders",
|
|
13
|
+
)).run()
|
|
14
|
+
|
|
15
|
+
Medallion (Bronze → Silver):
|
|
16
|
+
from killuhub import BronzePipeline, BronzeConfig, SilverPipeline, SilverConfig
|
|
17
|
+
from killuhub.core.batch import BatchMode, BatchConfig
|
|
18
|
+
|
|
19
|
+
BronzePipeline(BronzeConfig(
|
|
20
|
+
pipeline_config=PipelineConfig(...),
|
|
21
|
+
batch_config=BatchConfig(mode=BatchMode.FULL),
|
|
22
|
+
bronze_table="local.bronze.orders",
|
|
23
|
+
source_name="postgres.shop.orders",
|
|
24
|
+
)).run()
|
|
25
|
+
|
|
26
|
+
SilverPipeline(SilverConfig(
|
|
27
|
+
bronze_table="local.bronze.orders",
|
|
28
|
+
silver_table="local.silver.orders",
|
|
29
|
+
batch_config=BatchConfig(mode=BatchMode.INCREMENTAL),
|
|
30
|
+
key_columns=["order_id"],
|
|
31
|
+
date_columns=["created_at"],
|
|
32
|
+
)).run()
|
|
33
|
+
"""
|
|
34
|
+
# Import subpackages so auto-registration runs
|
|
35
|
+
import killuhub.connectors # noqa: F401
|
|
36
|
+
import killuhub.processing # noqa: F401
|
|
37
|
+
|
|
38
|
+
from killuhub.core import (
|
|
39
|
+
BaseConnector,
|
|
40
|
+
BaseEngine,
|
|
41
|
+
BaseStorageWriter,
|
|
42
|
+
ConnectorConfig,
|
|
43
|
+
PipelineConfig,
|
|
44
|
+
BatchMode,
|
|
45
|
+
BatchConfig,
|
|
46
|
+
IncrementalState,
|
|
47
|
+
default_registry,
|
|
48
|
+
)
|
|
49
|
+
from killuhub.ingestion import Pipeline, PipelineScheduler
|
|
50
|
+
from killuhub.layers import BronzePipeline, BronzeConfig, SilverPipeline, SilverConfig
|
|
51
|
+
|
|
52
|
+
__version__ = "0.1.0"
|
|
53
|
+
|
|
54
|
+
__all__ = [
|
|
55
|
+
# Raw pipeline
|
|
56
|
+
"Pipeline",
|
|
57
|
+
"PipelineScheduler",
|
|
58
|
+
"PipelineConfig",
|
|
59
|
+
"ConnectorConfig",
|
|
60
|
+
# Medallion layers
|
|
61
|
+
"BronzePipeline",
|
|
62
|
+
"BronzeConfig",
|
|
63
|
+
"SilverPipeline",
|
|
64
|
+
"SilverConfig",
|
|
65
|
+
# Batch primitives
|
|
66
|
+
"BatchMode",
|
|
67
|
+
"BatchConfig",
|
|
68
|
+
"IncrementalState",
|
|
69
|
+
# Extension points
|
|
70
|
+
"BaseConnector",
|
|
71
|
+
"BaseEngine",
|
|
72
|
+
"BaseStorageWriter",
|
|
73
|
+
"default_registry",
|
|
74
|
+
]
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from killuhub.connectors.postgres.connector import PostgresConnector
|
|
2
|
+
from killuhub.connectors.mysql.connector import MySQLConnector
|
|
3
|
+
from killuhub.connectors.kafka.connector import KafkaConnector
|
|
4
|
+
from killuhub.connectors.rest_api.connector import RestApiConnector
|
|
5
|
+
from killuhub.core.registry import default_registry
|
|
6
|
+
|
|
7
|
+
# Auto-register all built-in connectors
|
|
8
|
+
default_registry.register_connector("postgres", PostgresConnector)
|
|
9
|
+
default_registry.register_connector("mysql", MySQLConnector)
|
|
10
|
+
default_registry.register_connector("kafka", KafkaConnector)
|
|
11
|
+
default_registry.register_connector("rest_api", RestApiConnector)
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"PostgresConnector",
|
|
15
|
+
"MySQLConnector",
|
|
16
|
+
"KafkaConnector",
|
|
17
|
+
"RestApiConnector",
|
|
18
|
+
]
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Kafka connector for KilluHub.
|
|
3
|
+
|
|
4
|
+
Required config keys:
|
|
5
|
+
bootstrap_servers — comma-separated broker list
|
|
6
|
+
topic — topic to consume
|
|
7
|
+
group_id — consumer group id
|
|
8
|
+
|
|
9
|
+
Optional:
|
|
10
|
+
auto_offset_reset (default "earliest")
|
|
11
|
+
max_records (int) — stop after N records (useful for batch jobs)
|
|
12
|
+
poll_timeout_ms (int, default 1000)
|
|
13
|
+
value_deserializer ("json" | "string", default "json")
|
|
14
|
+
"""
|
|
15
|
+
import json
|
|
16
|
+
from typing import Any, Iterator
|
|
17
|
+
|
|
18
|
+
from killuhub.core.connector_interface import BaseConnector
|
|
19
|
+
from killuhub.core.config import ConnectorConfig
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class KafkaConnector(BaseConnector):
    """Batch-style Kafka source built on ``confluent_kafka.Consumer``.

    Auto-commit is disabled; each message's offset is committed explicitly
    once the caller resumes the generator after receiving the record.
    """

    def __init__(self, config: ConnectorConfig):
        super().__init__(config)
        self._consumer = None

    def connect(self) -> None:
        """Create the consumer and subscribe it to the configured topic.

        Reads ``bootstrap_servers``, ``group_id`` and ``topic`` through
        ``config.require`` and ``auto_offset_reset`` (default ``"earliest"``)
        through ``config.get``.

        Raises:
            ImportError: if ``confluent-kafka`` is not installed.
        """
        try:
            from confluent_kafka import Consumer
        except ImportError as e:
            raise ImportError(
                "confluent-kafka is required for KafkaConnector. "
                "Install it with: pip install confluent-kafka"
            ) from e

        self._consumer = Consumer(
            {
                "bootstrap.servers": self.config.require("bootstrap_servers"),
                "group.id": self.config.require("group_id"),
                "auto.offset.reset": self.config.get("auto_offset_reset", "earliest"),
                "enable.auto.commit": False,
            }
        )
        self._consumer.subscribe([self.config.require("topic")])
        self._connected = True

    def extract(self) -> Iterator[dict[str, Any]]:
        """Yield one record per consumed message until the stream runs dry.

        Consumption stops when a poll times out, when a partition-EOF event
        is received, or after ``max_records`` records have been yielded.
        Messages with a null payload (tombstones) carry nothing to decode;
        their offset is committed and they are skipped without counting
        towards ``max_records``.

        Yields:
            The decoded JSON value when ``value_deserializer`` is ``"json"``,
            otherwise ``{"value": <utf-8 text>}``.

        Raises:
            RuntimeError: on any Kafka error other than partition EOF.
        """
        if not self._connected:
            self.connect()

        max_records = self.config.get("max_records", None)
        # poll() takes seconds; the config key is expressed in milliseconds.
        poll_timeout = self.config.get("poll_timeout_ms", 1_000) / 1_000
        deserializer = self.config.get("value_deserializer", "json")
        count = 0

        while True:
            msg = self._consumer.poll(timeout=poll_timeout)

            if msg is None:
                # No message within timeout — treat as end of stream for batch mode
                break

            error = msg.error()
            if error:
                from confluent_kafka import KafkaError

                if error.code() == KafkaError._PARTITION_EOF:
                    break
                raise RuntimeError(f"Kafka error: {error}")

            raw = msg.value()
            if raw is None:
                # Tombstone: there is no payload to decode. Commit so the
                # offset is not re-read on the next run, then move on.
                self._consumer.commit(message=msg)
                continue

            if deserializer == "json":
                record = json.loads(raw.decode("utf-8"))
            else:
                record = {"value": raw.decode("utf-8")}

            yield record
            # Commit only after the caller has taken the record and asked
            # for the next one.
            self._consumer.commit(message=msg)
            count += 1

            if max_records and count >= max_records:
                break

    def close(self) -> None:
        """Close the consumer, if any, and mark the connector disconnected."""
        if self._consumer:
            self._consumer.close()
            self._consumer = None
        self._connected = False
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""
|
|
2
|
+
MySQL connector for KilluHub.
|
|
3
|
+
|
|
4
|
+
Uses mysql-connector-python with an unbuffered cursor (buffered=False) so large
|
|
5
|
+
tables are streamed row-by-row without loading the full result set into memory.
|
|
6
|
+
|
|
7
|
+
Required config keys:
|
|
8
|
+
host, database, user, password, query
|
|
9
|
+
|
|
10
|
+
Optional:
|
|
11
|
+
port (int, default 3306)
|
|
12
|
+
batch_size (int, default 1000) — rows fetched per round-trip
|
|
13
|
+
ssl_ca (str) — path to CA certificate for TLS connections
|
|
14
|
+
ssl_cert (str) — path to client certificate
|
|
15
|
+
ssl_key (str) — path to client private key
|
|
16
|
+
"""
|
|
17
|
+
from typing import Any, Iterator
|
|
18
|
+
|
|
19
|
+
from killuhub.core.connector_interface import BaseConnector
|
|
20
|
+
from killuhub.core.config import ConnectorConfig
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class MySQLConnector(BaseConnector):
    """MySQL source that streams query results through an unbuffered cursor."""

    def __init__(self, config: ConnectorConfig):
        super().__init__(config)
        self._conn = None

    def connect(self) -> None:
        """Open the MySQL connection described by the connector config.

        TLS options are forwarded only when present: ``ssl_ca`` also turns
        on certificate verification, and ``ssl_cert`` makes ``ssl_key``
        mandatory.

        Raises:
            ImportError: if ``mysql-connector-python`` is not installed.
        """
        try:
            import mysql.connector
        except ImportError as e:
            raise ImportError(
                "mysql-connector-python is required for MySQLConnector. "
                "Install it with: pip install mysql-connector-python"
            ) from e

        cfg = self.config
        connect_args: dict[str, Any] = {
            "host": cfg.require("host"),
            "port": cfg.get("port", 3306),
            "database": cfg.require("database"),
            "user": cfg.require("user"),
            "password": cfg.require("password"),
            # Lets the connection be reused even if a cursor is closed
            # before all of its rows were fetched.
            "consume_results": True,
        }

        ca_path = cfg.get("ssl_ca")
        if ca_path:
            connect_args["ssl_ca"] = ca_path
            connect_args["ssl_verify_cert"] = True

        cert_path = cfg.get("ssl_cert")
        if cert_path:
            connect_args["ssl_cert"] = cert_path
            connect_args["ssl_key"] = cfg.require("ssl_key")

        self._conn = mysql.connector.connect(**connect_args)
        self._connected = True

    def extract(self) -> Iterator[dict[str, Any]]:
        """Run the configured query and yield each row as a dict.

        Rows are pulled ``batch_size`` at a time (default 1000) via
        ``fetchmany`` on an unbuffered dictionary cursor; the cursor is
        closed when iteration finishes or is abandoned.
        """
        if not self._connected:
            self.connect()

        sql = self.config.require("query")
        chunk_size = self.config.get("batch_size", 1_000)

        # buffered=False keeps the result set off the client until fetched.
        cur = self._conn.cursor(buffered=False, dictionary=True)
        try:
            cur.execute(sql)
            while chunk := cur.fetchmany(chunk_size):
                yield from (dict(record) for record in chunk)
        finally:
            cur.close()

    def close(self) -> None:
        """Close the connection, if any, and mark the connector disconnected."""
        if self._conn:
            self._conn.close()
            self._conn = None
        self._connected = False
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PostgreSQL connector for KilluHub.
|
|
3
|
+
|
|
4
|
+
Required config keys:
|
|
5
|
+
host, database, user, password, query (port is optional, default 5432)
|
|
6
|
+
|
|
7
|
+
Optional:
|
|
8
|
+
batch_size (int, default 1000) — rows fetched per server-side cursor fetch
|
|
9
|
+
"""
|
|
10
|
+
from typing import Any, Iterator
|
|
11
|
+
|
|
12
|
+
from killuhub.core.connector_interface import BaseConnector
|
|
13
|
+
from killuhub.core.config import ConnectorConfig
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class PostgresConnector(BaseConnector):
    """PostgreSQL source that streams query results through a named cursor."""

    def __init__(self, config: ConnectorConfig):
        super().__init__(config)
        self._conn = None
        self._cursor = None

    def connect(self) -> None:
        """Open the psycopg2 connection described by the connector config.

        ``host``, ``database``, ``user`` and ``password`` are read with
        ``config.require``; ``port`` falls back to 5432.

        Raises:
            ImportError: if ``psycopg2`` is not installed.
        """
        try:
            import psycopg2
        except ImportError as e:
            raise ImportError(
                "psycopg2 is required for PostgresConnector. "
                "Install it with: pip install psycopg2-binary"
            ) from e

        self._conn = psycopg2.connect(
            host=self.config.require("host"),
            port=self.config.get("port", 5432),
            dbname=self.config.require("database"),
            user=self.config.require("user"),
            password=self.config.require("password"),
        )
        self._connected = True

    def extract(self) -> Iterator[dict[str, Any]]:
        """Run the configured query and yield each row as a column->value dict.

        A named cursor is used so rows are fetched from the server
        ``batch_size`` at a time (default 1000) instead of all at once.
        """
        if not self._connected:
            self.connect()

        query = self.config.require("query")
        batch_size = self.config.get("batch_size", 1_000)

        # Named cursor enables server-side pagination — safe for large tables
        with self._conn.cursor(name="killuhub_cursor") as cur:
            cur.itersize = batch_size
            cur.execute(query)
            # With a named cursor, execute() only declares the cursor on the
            # server; psycopg2 fills in `description` once rows have been
            # fetched. Resolve the column names after the first row arrives
            # rather than right after execute(), where it may still be None.
            columns = None
            for row in cur:
                if columns is None:
                    columns = [desc[0] for desc in cur.description]
                yield dict(zip(columns, row))

    def close(self) -> None:
        """Close the connection, if any, and mark the connector disconnected."""
        if self._conn:
            self._conn.close()
            self._conn = None
        self._connected = False
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""
|
|
2
|
+
REST API connector for KilluHub.
|
|
3
|
+
|
|
4
|
+
Handles paginated JSON APIs with configurable pagination strategies.
|
|
5
|
+
|
|
6
|
+
Required config keys:
|
|
7
|
+
url — base endpoint URL
|
|
8
|
+
|
|
9
|
+
Optional:
|
|
10
|
+
method ("GET" | "POST", default "GET")
|
|
11
|
+
headers (dict)
|
|
12
|
+
params (dict) — query string params
|
|
13
|
+
body (dict) — request body for POST
|
|
14
|
+
auth_type ("bearer" | "basic" | None)
|
|
15
|
+
auth_token — used with auth_type "bearer"
|
|
16
|
+
auth_user — used with auth_type "basic"
|
|
17
|
+
auth_password — used with auth_type "basic"
|
|
18
|
+
data_key — JSON key that holds the records list (e.g. "data", "results")
|
|
19
|
+
pagination ("none" | "page" | "cursor" | "offset", default "none")
|
|
20
|
+
page_param — query param name for page number (default "page")
|
|
21
|
+
page_size_param — query param name for page size (default "page_size")
|
|
22
|
+
page_size (int, default 100)
|
|
23
|
+
cursor_key — JSON key that holds next cursor (default "next_cursor")
|
|
24
|
+
cursor_param — query param name for cursor (default "cursor")
|
|
25
|
+
max_pages (int) — safety cap on pagination depth
|
|
26
|
+
"""
|
|
27
|
+
from typing import Any, Iterator
|
|
28
|
+
|
|
29
|
+
from killuhub.core.connector_interface import BaseConnector
|
|
30
|
+
from killuhub.core.config import ConnectorConfig
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class RestApiConnector(BaseConnector):
    """JSON-over-HTTP source with optional page, cursor or offset pagination."""

    def __init__(self, config: ConnectorConfig):
        super().__init__(config)
        self._session = None

    def connect(self) -> None:
        """Create the ``requests`` session and apply headers and auth.

        ``auth_type == "bearer"`` adds an ``Authorization: Bearer …`` header
        from ``auth_token``; ``auth_type == "basic"`` installs HTTP basic auth
        from ``auth_user`` / ``auth_password``.

        Raises:
            ImportError: if ``requests`` is not installed.
        """
        try:
            import requests
        except ImportError as e:
            raise ImportError(
                "requests is required for RestApiConnector. "
                "Install it with: pip install requests"
            ) from e

        self._session = requests.Session()

        # Work on a copy: the Authorization header added below must not be
        # written into whatever dict the config hands back. `or {}` also
        # tolerates an explicit `headers: null`.
        headers = dict(self.config.get("headers") or {})
        auth_type = self.config.get("auth_type")

        if auth_type == "bearer":
            headers["Authorization"] = f"Bearer {self.config.require('auth_token')}"
        elif auth_type == "basic":
            from requests.auth import HTTPBasicAuth

            self._session.auth = HTTPBasicAuth(
                self.config.require("auth_user"),
                self.config.require("auth_password"),
            )

        self._session.headers.update(headers)
        self._connected = True

    def extract(self) -> Iterator[dict[str, Any]]:
        """Yield records using the strategy named by the ``pagination`` key.

        Any value other than ``"page"``, ``"cursor"`` or ``"offset"``
        results in a single, unpaginated request.
        """
        if not self._connected:
            self.connect()

        pagination = self.config.get("pagination", "none")

        if pagination == "page":
            yield from self._paginate_by_page()
        elif pagination == "cursor":
            yield from self._paginate_by_cursor()
        elif pagination == "offset":
            yield from self._paginate_by_offset()
        else:
            yield from self._fetch_single()

    def _request(self, params: dict | None = None) -> Any:
        """Send one request and return the decoded JSON body.

        Args:
            params: extra query-string parameters; they override same-named
                entries from the configured ``params``.

        Raises whatever ``raise_for_status()`` raises on an HTTP error status.
        """
        method = self.config.get("method", "GET").upper()
        url = self.config.require("url")
        base_params = dict(self.config.get("params") or {})
        if params:
            base_params.update(params)

        if method == "GET":
            resp = self._session.get(url, params=base_params)
        else:
            # Every non-GET method is sent as a POST with the configured body.
            resp = self._session.post(
                url, params=base_params, json=self.config.get("body")
            )
        resp.raise_for_status()
        return resp.json()

    def _extract_records(self, data: Any) -> list[dict[str, Any]]:
        """Pull the list of records out of a decoded response body.

        With ``data_key`` configured the records are read from that key; a
        missing key or a JSON ``null`` value yields an empty list. Without
        it, a list body is returned as-is and anything else is wrapped in a
        one-element list.
        """
        data_key = self.config.get("data_key")
        if data_key:
            # `or []` covers `{"data": null}`, which would otherwise hand
            # None to `yield from` in the callers.
            return data.get(data_key) or []
        if isinstance(data, list):
            return data
        return [data]

    def _fetch_single(self) -> Iterator[dict[str, Any]]:
        """Yield the records of a single, unpaginated request."""
        yield from self._extract_records(self._request())

    def _paginate_by_page(self) -> Iterator[dict[str, Any]]:
        """Yield records page by page, starting at page 1.

        Stops on the first empty page or once ``max_pages`` pages were read.
        """
        page_param = self.config.get("page_param", "page")
        size_param = self.config.get("page_size_param", "page_size")
        page_size = self.config.get("page_size", 100)
        max_pages = self.config.get("max_pages", None)
        page = 1

        while True:
            data = self._request({page_param: page, size_param: page_size})
            records = self._extract_records(data)
            if not records:
                break
            yield from records
            page += 1
            # `page` is now the next page to fetch, hence `>` rather than `>=`.
            if max_pages and page > max_pages:
                break

    def _paginate_by_cursor(self) -> Iterator[dict[str, Any]]:
        """Yield records by following the cursor returned with each response.

        Stops on an empty page, when the response carries no next cursor, or
        once ``max_pages`` pages were read.
        """
        cursor_key = self.config.get("cursor_key", "next_cursor")
        cursor_param = self.config.get("cursor_param", "cursor")
        max_pages = self.config.get("max_pages", None)
        cursor = None
        page = 0

        while True:
            params = {cursor_param: cursor} if cursor else {}
            data = self._request(params)
            records = self._extract_records(data)
            if not records:
                break
            yield from records
            # A non-object body (e.g. a bare list) cannot carry a cursor;
            # treat it as the final page instead of failing on `.get`.
            cursor = data.get(cursor_key) if isinstance(data, dict) else None
            if not cursor:
                break
            page += 1
            if max_pages and page >= max_pages:
                break

    def _paginate_by_offset(self) -> Iterator[dict[str, Any]]:
        """Yield records using an ``offset`` parameter plus a page-size parameter.

        The offset advances by the number of records actually returned.
        Stops on an empty page or once ``max_pages`` pages were read.
        """
        size_param = self.config.get("page_size_param", "limit")
        page_size = self.config.get("page_size", 100)
        max_pages = self.config.get("max_pages", None)
        offset = 0
        page = 0

        while True:
            data = self._request({"offset": offset, size_param: page_size})
            records = self._extract_records(data)
            if not records:
                break
            yield from records
            offset += len(records)
            page += 1
            if max_pages and page >= max_pages:
                break

    def close(self) -> None:
        """Close the session, if any, and mark the connector disconnected."""
        if self._session:
            self._session.close()
            self._session = None
        self._connected = False
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from killuhub.core.connector_interface import BaseConnector
|
|
2
|
+
from killuhub.core.engine_interface import BaseEngine
|
|
3
|
+
from killuhub.core.storage_interface import BaseStorageWriter
|
|
4
|
+
from killuhub.core.config import ConnectorConfig, PipelineConfig
|
|
5
|
+
from killuhub.core.registry import Registry, default_registry
|
|
6
|
+
from killuhub.core.exceptions import (
|
|
7
|
+
KilluHubError, BatchModeError, IncrementalStateError, ContractViolationError
|
|
8
|
+
)
|
|
9
|
+
from killuhub.core.batch import ExecutionMode, BatchMode, BatchConfig, StreamingConfig, IncrementalState
|
|
10
|
+
from killuhub.core.contract import ContractSpec, ColumnSpec, ContractValidator, ContractReport
|
|
11
|
+
from killuhub.core.environment import RuntimeEnvironment, detect as detect_environment
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"BaseConnector",
|
|
15
|
+
"BaseEngine",
|
|
16
|
+
"BaseStorageWriter",
|
|
17
|
+
"ConnectorConfig",
|
|
18
|
+
"PipelineConfig",
|
|
19
|
+
"Registry",
|
|
20
|
+
"default_registry",
|
|
21
|
+
"KilluHubError",
|
|
22
|
+
"BatchModeError",
|
|
23
|
+
"IncrementalStateError",
|
|
24
|
+
"ContractViolationError",
|
|
25
|
+
"ExecutionMode",
|
|
26
|
+
"BatchMode",
|
|
27
|
+
"BatchConfig",
|
|
28
|
+
"StreamingConfig",
|
|
29
|
+
"IncrementalState",
|
|
30
|
+
"ContractSpec",
|
|
31
|
+
"ColumnSpec",
|
|
32
|
+
"ContractValidator",
|
|
33
|
+
"ContractReport",
|
|
34
|
+
"RuntimeEnvironment",
|
|
35
|
+
"detect_environment",
|
|
36
|
+
]
|
killuhub/core/batch.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Batch and streaming processing primitives for KilluHub.
|
|
3
|
+
|
|
4
|
+
ExecutionMode controls whether a pipeline runs as a one-shot batch job or as a
|
|
5
|
+
continuous stream. Within batch mode, BatchMode controls whether all records are
|
|
6
|
+
processed (FULL) or only records added since the last run (INCREMENTAL).
|
|
7
|
+
|
|
8
|
+
These are core-layer types — no Spark, no I/O, no external dependencies.
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from enum import Enum
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ExecutionMode(str, Enum):
    """How a pipeline is executed at the top level.

    Members:
        BATCH: run once — read, transform, write, then exit.
        STREAMING: run continuously, handling records as they arrive
            (Spark Structured Streaming or Flink).
    """

    BATCH = "batch"
    STREAMING = "streaming"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class BatchMode(str, Enum):
    """Which records a BATCH run processes.

    The ``str`` mixin makes members compare equal to their plain string
    value (``BatchMode.FULL == "full"``) and lets them be written to JSON
    state files as ordinary strings, with no custom encoder.

    Members:
        FULL: read/process every available record.
        INCREMENTAL: read only records newer than the last watermark cursor.
    """

    FULL = "full"
    INCREMENTAL = "incremental"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class BatchConfig:
    """Settings for a single batch run (relevant only in BATCH execution mode).

    Attributes:
        mode:
            FULL processes every available record; INCREMENTAL processes
            only records newer than the last watermark.

        watermark_column:
            Column that drives the incremental cursor. In Bronze this is
            usually the source's own timestamp column (e.g. `updated_at`).
            In Silver it is usually `_ingested_at`, the Bronze metadata
            column, so Silver filters on a column it controls whatever the
            source schema looks like.

        initial_watermark:
            ISO-8601 lower bound for the very first INCREMENTAL run, when
            no earlier state exists. The epoch default means "process
            everything on first run".

        batch_id:
            Identifier of this run, written to the `_batch_id` metadata
            column in Bronze. When empty, Bronze generates a UUID at
            runtime; supplying a deterministic ID makes reruns idempotent.
    """

    mode: BatchMode
    watermark_column: str = field(default="updated_at")
    initial_watermark: str = field(default="1970-01-01T00:00:00")
    batch_id: str = field(default="")
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@dataclass
class StreamingConfig:
    """Settings for a streaming run (relevant only in STREAMING execution mode).

    Attributes:
        trigger:
            Spark Structured Streaming trigger type.
            processingTime — microbatch fired every `trigger_interval`.
            once — handle all available data in one microbatch, then stop.
            availableNow — like once, but honours rate limits.
            continuous — experimental low-latency continuous processing.

        trigger_interval:
            Microbatch interval string such as "30 seconds" or "1 minute".
            Only meaningful when trigger is processingTime.

        checkpoint_location:
            Where Spark keeps streaming checkpoint state. Must be durable
            storage (S3, HDFS, ADLS) in production; needed for exactly-once
            guarantees and job restarts.

        output_mode:
            append — only new rows are written per microbatch (default, Iceberg).
            update — changed rows are written (requires watermarking).
            complete — the full result is written every time (for aggregations).
    """

    trigger: str = field(default="processingTime")
    trigger_interval: str = field(default="30 seconds")
    checkpoint_location: str = field(default="/tmp/killuhub-checkpoints")
    output_mode: str = field(default="append")
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
@dataclass
class IncrementalState:
    """Watermark state persisted between INCREMENTAL Silver runs.

    There is one state record per (source_table, target_table) pair. After a
    successful Silver run `last_watermark` is set to the max(watermark_column)
    seen in that batch, so the following run resumes exactly where this one
    stopped.

    Attributes:
        source_table:   Fully qualified bronze Iceberg table name.
        target_table:   Fully qualified silver Iceberg table name.
        last_watermark: ISO-8601 string. The next incremental run filters
                        `watermark_column > last_watermark`.
        updated_at:     ISO-8601 string of when this state was written.
                        Informational only — never used for query filtering.
    """

    source_table: str
    target_table: str
    last_watermark: str
    updated_at: str = ""

    def state_key(self) -> str:
        """Return the stable dict key used when serialising state to JSON."""
        return "{}::{}".format(self.source_table, self.target_table)
|