killuhub-0.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- killuhub-0.1.0/PKG-INFO +520 -0
- killuhub-0.1.0/README.md +484 -0
- killuhub-0.1.0/killuhub/__init__.py +74 -0
- killuhub-0.1.0/killuhub/connectors/__init__.py +18 -0
- killuhub-0.1.0/killuhub/connectors/kafka/__init__.py +3 -0
- killuhub-0.1.0/killuhub/connectors/kafka/connector.py +88 -0
- killuhub-0.1.0/killuhub/connectors/mysql/__init__.py +3 -0
- killuhub-0.1.0/killuhub/connectors/mysql/connector.py +86 -0
- killuhub-0.1.0/killuhub/connectors/postgres/__init__.py +3 -0
- killuhub-0.1.0/killuhub/connectors/postgres/connector.py +59 -0
- killuhub-0.1.0/killuhub/connectors/rest_api/__init__.py +3 -0
- killuhub-0.1.0/killuhub/connectors/rest_api/connector.py +167 -0
- killuhub-0.1.0/killuhub/core/__init__.py +36 -0
- killuhub-0.1.0/killuhub/core/batch.py +135 -0
- killuhub-0.1.0/killuhub/core/config.py +38 -0
- killuhub-0.1.0/killuhub/core/connector_interface.py +45 -0
- killuhub-0.1.0/killuhub/core/contract.py +379 -0
- killuhub-0.1.0/killuhub/core/engine_interface.py +37 -0
- killuhub-0.1.0/killuhub/core/environment.py +82 -0
- killuhub-0.1.0/killuhub/core/exceptions.py +40 -0
- killuhub-0.1.0/killuhub/core/registry.py +70 -0
- killuhub-0.1.0/killuhub/core/storage_interface.py +30 -0
- killuhub-0.1.0/killuhub/ingestion/__init__.py +4 -0
- killuhub-0.1.0/killuhub/ingestion/pipeline.py +141 -0
- killuhub-0.1.0/killuhub/ingestion/scheduler.py +155 -0
- killuhub-0.1.0/killuhub/layers/__init__.py +9 -0
- killuhub-0.1.0/killuhub/layers/bronze/__init__.py +3 -0
- killuhub-0.1.0/killuhub/layers/bronze/pipeline.py +206 -0
- killuhub-0.1.0/killuhub/layers/silver/__init__.py +34 -0
- killuhub-0.1.0/killuhub/layers/silver/pipeline.py +373 -0
- killuhub-0.1.0/killuhub/layers/silver/state.py +239 -0
- killuhub-0.1.0/killuhub/layers/silver/transformations.py +259 -0
- killuhub-0.1.0/killuhub/layers/streaming/__init__.py +3 -0
- killuhub-0.1.0/killuhub/layers/streaming/pipeline.py +236 -0
- killuhub-0.1.0/killuhub/processing/__init__.py +8 -0
- killuhub-0.1.0/killuhub/processing/flink_engine.py +84 -0
- killuhub-0.1.0/killuhub/processing/spark_engine.py +236 -0
- killuhub-0.1.0/killuhub/storage/__init__.py +8 -0
- killuhub-0.1.0/killuhub/storage/delta/__init__.py +3 -0
- killuhub-0.1.0/killuhub/storage/delta/writer.py +57 -0
- killuhub-0.1.0/killuhub/storage/hudi/__init__.py +3 -0
- killuhub-0.1.0/killuhub/storage/hudi/writer.py +47 -0
- killuhub-0.1.0/killuhub/storage/iceberg/__init__.py +4 -0
- killuhub-0.1.0/killuhub/storage/iceberg/schema_manager.py +112 -0
- killuhub-0.1.0/killuhub/storage/iceberg/writer.py +110 -0
- killuhub-0.1.0/killuhub.egg-info/PKG-INFO +520 -0
- killuhub-0.1.0/killuhub.egg-info/SOURCES.txt +52 -0
- killuhub-0.1.0/killuhub.egg-info/dependency_links.txt +1 -0
- killuhub-0.1.0/killuhub.egg-info/requires.txt +38 -0
- killuhub-0.1.0/killuhub.egg-info/top_level.txt +1 -0
- killuhub-0.1.0/pyproject.toml +57 -0
- killuhub-0.1.0/setup.cfg +4 -0
- killuhub-0.1.0/tests/test_pipeline.py +111 -0
- killuhub-0.1.0/tests/test_registry.py +52 -0
killuhub-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,520 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: killuhub
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Pluggable data ingestion framework — connectors, Spark/Flink processing, Iceberg storage
|
|
5
|
+
Author: Kalleby Ramos
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.11
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: pydantic>=2.0
|
|
10
|
+
Provides-Extra: postgres
|
|
11
|
+
Requires-Dist: psycopg2-binary>=2.9; extra == "postgres"
Provides-Extra: mysql
Requires-Dist: mysql-connector-python>=8.0; extra == "mysql"
|
|
12
|
+
Provides-Extra: kafka
|
|
13
|
+
Requires-Dist: confluent-kafka>=2.3; extra == "kafka"
|
|
14
|
+
Provides-Extra: s3
|
|
15
|
+
Requires-Dist: boto3>=1.34; extra == "s3"
|
|
16
|
+
Requires-Dist: pyarrow>=15.0; extra == "s3"
|
|
17
|
+
Provides-Extra: rest-api
|
|
18
|
+
Requires-Dist: requests>=2.31; extra == "rest-api"
|
|
19
|
+
Provides-Extra: spark
|
|
20
|
+
Requires-Dist: pyspark>=3.5; extra == "spark"
|
|
21
|
+
Provides-Extra: flink
|
|
22
|
+
Requires-Dist: apache-flink>=1.18; extra == "flink"
|
|
23
|
+
Provides-Extra: iceberg
|
|
24
|
+
Requires-Dist: pyiceberg>=0.6; extra == "iceberg"
|
|
25
|
+
Provides-Extra: delta
|
|
26
|
+
Requires-Dist: delta-spark>=3.1; extra == "delta"
|
|
27
|
+
Provides-Extra: scheduler
|
|
28
|
+
Requires-Dist: apscheduler>=3.10; extra == "scheduler"
|
|
29
|
+
Provides-Extra: all
|
|
30
|
+
Requires-Dist: killuhub[delta,flink,iceberg,kafka,postgres,rest_api,s3,scheduler,spark]; extra == "all"
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
33
|
+
Requires-Dist: pytest-mock>=3.12; extra == "dev"
|
|
34
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
35
|
+
Requires-Dist: mypy>=1.9; extra == "dev"
|
|
36
|
+
|
|
37
|
+
# KilluHub
|
|
38
|
+
|
|
39
|
+
A pluggable lakehouse ingestion framework — connect any source, process with Spark or Flink, land in Apache Iceberg following the medallion architecture (Bronze → Silver → Gold).
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## Architecture
|
|
44
|
+
|
|
45
|
+
```
|
|
46
|
+
┌─────────────────────────────────────────────────────┐
|
|
47
|
+
│ Data Sources │
|
|
48
|
+
│ PostgreSQL · MySQL · Kafka · REST API │
|
|
49
|
+
└─────────────────────┬───────────────────────────────┘
|
|
50
|
+
│
|
|
51
|
+
▼
|
|
52
|
+
┌─────────────────────────────────────────────────────┐
|
|
53
|
+
│ Connectors │
|
|
54
|
+
│ BaseConnector — connect() · extract() · close() │
|
|
55
|
+
│ Server-side cursors · Pagination · Manual commits │
|
|
56
|
+
└─────────────────────┬───────────────────────────────┘
|
|
57
|
+
│ yields dict[str, Any] records
|
|
58
|
+
▼
|
|
59
|
+
┌─────────────────────────────────────────────────────┐
|
|
60
|
+
│ Processing Engines │
|
|
61
|
+
│ Spark 3.5 (batch + streaming) │
|
|
62
|
+
│ Flink (Table API, streaming-first) │
|
|
63
|
+
└─────────────────────┬───────────────────────────────┘
|
|
64
|
+
│ DataFrame (Spark or Flink)
|
|
65
|
+
▼
|
|
66
|
+
┌─────────────────────────────────────────────────────┐
|
|
67
|
+
│ Medallion Pipelines │
|
|
68
|
+
│ │
|
|
69
|
+
│ Bronze ────────────────────────────────────────── │
|
|
70
|
+
│ Raw data + metadata stamps + data contracts │
|
|
71
|
+
│ _ingested_at · _source_name · _batch_id │
|
|
72
|
+
│ │
|
|
73
|
+
│ Silver ────────────────────────────────────────── │
|
|
74
|
+
│ Dedup · type cast · date dims · upsert │
|
|
75
|
+
│ │
|
|
76
|
+
│ Gold (roadmap) ────────────────────────────────── │
|
|
77
|
+
│ Aggregations · business metrics · serving layer │
|
|
78
|
+
└─────────────────────┬───────────────────────────────┘
|
|
79
|
+
│
|
|
80
|
+
▼
|
|
81
|
+
┌─────────────────────────────────────────────────────┐
|
|
82
|
+
│ Storage Layer │
|
|
83
|
+
│ Apache Iceberg ✅ · Delta Lake · Apache Hudi │
|
|
84
|
+
│ append · overwrite · merge (MERGE INTO) │
|
|
85
|
+
│ S3 · HDFS · local filesystem │
|
|
86
|
+
└─────────────────────────────────────────────────────┘
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
## Key capabilities
|
|
92
|
+
|
|
93
|
+
| Capability | Description |
|
|
94
|
+
|---|---|
|
|
95
|
+
| **Connector abstraction** | Any source (SQL, Kafka, REST) implements the same three-method interface. Swap sources without touching pipeline logic. |
|
|
96
|
+
| **Watermark-based ingestion** | Automatic incremental extraction — only records newer than the last saved watermark are fetched. No manual state tracking. |
|
|
97
|
+
| **Data contracts** | Schema enforcement at ingestion time: type validation, null checks, min/max bounds. Pipelines fail fast on bad data. |
|
|
98
|
+
| **Medallion pipelines** | Bronze stamps raw metadata; Silver deduplicates, casts types, and upserts. One `type: chain` config runs both in sequence. |
|
|
99
|
+
| **Multi-engine processing** | The same connector and config works with Spark (batch + structured streaming) and Flink (streaming-first). Switch with one string. |
|
|
100
|
+
| **Multi-storage lakehouse** | Iceberg (primary), Delta Lake, and Hudi are drop-in alternatives. Swap with `storage_writer_name`. |
|
|
101
|
+
| **Config-driven pipelines** | Everything — connector, engine, writer, watermark, contract — lives in one YAML file. No Python required for standard use cases. |
|
|
102
|
+
| **Multi-platform deployment** | One Helm chart targets EKS (Spark Operator), Databricks (Asset Bundles), and AWS EMR. Platform-specific details are injected by the chart, not the config. |
|
|
103
|
+
|
|
104
|
+
---
|
|
105
|
+
|
|
106
|
+
## Design principles
|
|
107
|
+
|
|
108
|
+
- **Engine agnostic** — Connectors yield plain Python dicts. They have no knowledge of Spark or Flink. Engines receive dicts and build their own DataFrames.
|
|
109
|
+
- **Storage agnostic** — Writers implement a single `write(df, table, mode)` interface. Iceberg, Delta, and Hudi are plug-in implementations.
|
|
110
|
+
- **Config-driven** — A complete pipeline — source, engine, transform, target, contract, schedule — is expressed as a YAML file. Pipeline logic lives in the framework, not in user code.
|
|
111
|
+
- **Medallion architecture** — Bronze is the canonical raw layer (immutable, metadata-stamped). Silver is always derived from Bronze, never from the source directly. This makes reprocessing safe and auditable.
|
|
112
|
+
- **Stateless pipelines** — Each `Pipeline.run()` creates fresh instances of connector, engine, and writer. No shared state between runs. Scheduled jobs can run in parallel safely.
|
|
113
|
+
|
|
114
|
+
---
|
|
115
|
+
|
|
116
|
+
## Data contracts
|
|
117
|
+
|
|
118
|
+
Every Bronze and Silver pipeline can declare a **data contract** — a schema specification that is validated before data is written. Violations either fail the pipeline or log a warning, depending on `on_violation`.
|
|
119
|
+
|
|
120
|
+
```yaml
|
|
121
|
+
contract:
|
|
122
|
+
on_violation: fail # fail | warn
|
|
123
|
+
min_row_count: 1 # fail if fewer rows than this were ingested
|
|
124
|
+
|
|
125
|
+
columns:
|
|
126
|
+
- name: order_id
|
|
127
|
+
type: long
|
|
128
|
+
nullable: false # null check enforced
|
|
129
|
+
|
|
130
|
+
- name: amount
|
|
131
|
+
type: double
|
|
132
|
+
nullable: false
|
|
133
|
+
min_value: 0 # value range enforced
|
|
134
|
+
|
|
135
|
+
- name: created_at
|
|
136
|
+
type: timestamp
|
|
137
|
+
nullable: false
|
|
138
|
+
|
|
139
|
+
- name: status
|
|
140
|
+
type: string
|
|
141
|
+
nullable: false
|
|
142
|
+
allowed_values: [pending, confirmed, shipped, cancelled]
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### What is validated
|
|
146
|
+
|
|
147
|
+
| Rule | YAML key | Description |
|
|
148
|
+
|---|---|---|
|
|
149
|
+
| Type check | `type` | Column must match the declared Spark/Iceberg type |
|
|
150
|
+
| Null check | `nullable: false` | No nulls allowed in this column |
|
|
151
|
+
| Min value | `min_value` | Rejects rows below this numeric threshold |
|
|
152
|
+
| Max value | `max_value` | Rejects rows above this numeric threshold |
|
|
153
|
+
| Allowed values | `allowed_values` | Enum constraint — only these values are valid |
|
|
154
|
+
| Row count | `min_row_count` | Minimum rows the batch must contain |
|
|
155
|
+
|
|
156
|
+
Contracts make data quality a first-class concern — not an afterthought in a downstream dbt model.
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## Project structure
|
|
161
|
+
|
|
162
|
+
```
|
|
163
|
+
killuhub/
|
|
164
|
+
├── core/
|
|
165
|
+
│ ├── connector_interface.py # BaseConnector ABC
|
|
166
|
+
│ ├── engine_interface.py # BaseEngine ABC
|
|
167
|
+
│ ├── storage_interface.py # BaseStorageWriter ABC
|
|
168
|
+
│ ├── config.py # ConnectorConfig, PipelineConfig
|
|
169
|
+
│ ├── registry.py # Registry + default_registry singleton
|
|
170
|
+
│ ├── batch.py # BatchConfig, BatchMode, StreamingConfig
|
|
171
|
+
│ ├── contract.py # ContractSpec, ContractValidator
|
|
172
|
+
│ └── exceptions.py # KilluHubError hierarchy
|
|
173
|
+
│
|
|
174
|
+
├── connectors/
|
|
175
|
+
│ ├── postgres/connector.py # Server-side cursor, incremental watermark
|
|
176
|
+
│ ├── mysql/connector.py # Streaming cursor, fetchmany, optional TLS
|
|
177
|
+
│ ├── kafka/connector.py # confluent-kafka, batch + stream modes
|
|
178
|
+
│ └── rest_api/connector.py # Page / cursor / offset pagination
|
|
179
|
+
│
|
|
180
|
+
├── processing/
|
|
181
|
+
│ ├── spark_engine.py # PySpark 3.5 + Iceberg catalog wiring
|
|
182
|
+
│ └── flink_engine.py # PyFlink Table API + Iceberg catalog
|
|
183
|
+
│
|
|
184
|
+
├── storage/
|
|
185
|
+
│ ├── iceberg/
|
|
186
|
+
│ │ ├── writer.py # append / overwrite / merge via MERGE INTO
|
|
187
|
+
│ │ └── schema_manager.py # schema evolution, time travel, compaction
|
|
188
|
+
│ ├── delta/writer.py # Delta Lake (drop-in alternative)
|
|
189
|
+
│ └── hudi/writer.py # Apache Hudi (drop-in alternative)
|
|
190
|
+
│
|
|
191
|
+
├── ingestion/
|
|
192
|
+
│ ├── pipeline.py # Low-level connector → engine → writer loop
|
|
193
|
+
│ └── scheduler.py # APScheduler cron + interval jobs
|
|
194
|
+
│
|
|
195
|
+
└── layers/
|
|
196
|
+
├── bronze/pipeline.py # BronzePipeline — metadata stamping + contract
|
|
197
|
+
├── silver/pipeline.py # SilverPipeline — dedup, cast, partition, upsert
|
|
198
|
+
├── streaming/pipeline.py # StreamingBronzePipeline — Spark Structured Streaming
|
|
199
|
+
└── gold/ # (roadmap)
|
|
200
|
+
|
|
201
|
+
config/
|
|
202
|
+
├── bronze_postgres.yaml # Batch bronze from Postgres
|
|
203
|
+
├── bronze_kafka.yaml # Streaming bronze from Kafka
|
|
204
|
+
├── silver_orders.yaml # Silver from bronze
|
|
205
|
+
├── chain_orders.yaml # Bronze + Silver in one file (Postgres)
|
|
206
|
+
└── chain_api_orders.yaml # Bronze + Silver in one file (REST API)
|
|
207
|
+
|
|
208
|
+
helm/killuhub/ # Helm chart — deploys to EKS / Databricks / EMR
|
|
209
|
+
├── Chart.yaml
|
|
210
|
+
├── values.yaml # Single user-facing interface
|
|
211
|
+
└── templates/
|
|
212
|
+
├── configmap.yaml # Renders pipeline YAML — used on all platforms
|
|
213
|
+
├── spark-application.yaml # Spark Operator CRD (EKS only)
|
|
214
|
+
├── rbac.yaml # ServiceAccount + IRSA (EKS only)
|
|
215
|
+
├── databricks-job.yaml # Databricks Asset Bundle (Databricks only)
|
|
216
|
+
└── emr-step.yaml # EMR step + cluster JSON (EMR only)
|
|
217
|
+
|
|
218
|
+
docs/
|
|
219
|
+
├── core/core.md
|
|
220
|
+
├── connectors/connectors.md
|
|
221
|
+
├── processing/processing.md
|
|
222
|
+
├── storage/storage.md
|
|
223
|
+
├── ingestion/ingestion.md
|
|
224
|
+
├── layers/layers.md
|
|
225
|
+
├── helm/helm.md
|
|
226
|
+
└── usage/usage.md # End-to-end how-to for every scenario
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
---
|
|
230
|
+
|
|
231
|
+
## Quick start
|
|
232
|
+
|
|
233
|
+
### Install
|
|
234
|
+
|
|
235
|
+
```bash
|
|
236
|
+
# Minimum install
|
|
237
|
+
pip install -e .
|
|
238
|
+
|
|
239
|
+
# With extras for your use case
|
|
240
|
+
pip install -e ".[postgres,spark,iceberg]"
|
|
241
|
+
pip install -e ".[kafka,spark,iceberg]"
|
|
242
|
+
pip install -e ".[mysql,spark,iceberg]"
|
|
243
|
+
pip install -e ".[all]"
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
### Bronze + Silver in one config (recommended)
|
|
247
|
+
|
|
248
|
+
The standard pattern is a **chain** config: one file runs bronze first, then silver automatically.
|
|
249
|
+
`silver.bronze_table` is auto-injected from the bronze stage — you never repeat the table name.
|
|
250
|
+
|
|
251
|
+
```yaml
|
|
252
|
+
# config/chain_orders.yaml
|
|
253
|
+
type: chain
|
|
254
|
+
|
|
255
|
+
engine:
|
|
256
|
+
name: spark
|
|
257
|
+
warehouse: ${WAREHOUSE:-/tmp/killuhub-warehouse}
|
|
258
|
+
catalog_name: ${CATALOG:-local}
|
|
259
|
+
catalog_type: hadoop
|
|
260
|
+
|
|
261
|
+
stages:
|
|
262
|
+
- name: bronze-orders
|
|
263
|
+
type: bronze
|
|
264
|
+
mode: batch
|
|
265
|
+
batch:
|
|
266
|
+
strategy: incremental
|
|
267
|
+
watermark_column: updated_at
|
|
268
|
+
initial_watermark: "2024-01-01T00:00:00"
|
|
269
|
+
connector:
|
|
270
|
+
name: postgres
|
|
271
|
+
config:
|
|
272
|
+
host: ${PG_HOST:-localhost}
|
|
273
|
+
database: shop
|
|
274
|
+
user: postgres
|
|
275
|
+
password: ${PG_PASSWORD}
|
|
276
|
+
query: "SELECT * FROM orders"
|
|
277
|
+
bronze:
|
|
278
|
+
table: local.bronze.orders
|
|
279
|
+
source_name: postgres.shop.orders
|
|
280
|
+
partition_by: [_ingestion_date]
|
|
281
|
+
|
|
282
|
+
- name: silver-orders
|
|
283
|
+
type: silver
|
|
284
|
+
mode: batch
|
|
285
|
+
batch:
|
|
286
|
+
strategy: incremental
|
|
287
|
+
watermark_column: _ingested_at
|
|
288
|
+
initial_watermark: "1970-01-01T00:00:00"
|
|
289
|
+
silver:
|
|
290
|
+
# bronze_table is auto-injected from the bronze stage above
|
|
291
|
+
silver_table: local.silver.orders
|
|
292
|
+
key_columns: [order_id]
|
|
293
|
+
date_columns: [created_at, updated_at]
|
|
294
|
+
type_map: { amount: double, quantity: int }
|
|
295
|
+
null_check_columns: [order_id, customer_id]
|
|
296
|
+
partition_by: [created_date]
|
|
297
|
+
state_store: json
|
|
298
|
+
contract:
|
|
299
|
+
on_violation: fail
|
|
300
|
+
columns:
|
|
301
|
+
- { name: order_id, type: long, nullable: false }
|
|
302
|
+
- { name: amount, type: double, nullable: false, min_value: 0 }
|
|
303
|
+
```
|
|
304
|
+
|
|
305
|
+
```bash
|
|
306
|
+
python main.py --config config/chain_orders.yaml
|
|
307
|
+
|
|
308
|
+
# Dry-run — validate config without executing
|
|
309
|
+
python main.py --config config/chain_orders.yaml --dry-run
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
### Standalone bronze (batch)
|
|
313
|
+
|
|
314
|
+
```yaml
|
|
315
|
+
# config/bronze_postgres.yaml
|
|
316
|
+
type: bronze
|
|
317
|
+
mode: batch
|
|
318
|
+
batch:
|
|
319
|
+
strategy: incremental
|
|
320
|
+
watermark_column: updated_at
|
|
321
|
+
initial_watermark: "2024-01-01T00:00:00"
|
|
322
|
+
connector:
|
|
323
|
+
name: postgres
|
|
324
|
+
config:
|
|
325
|
+
host: localhost
|
|
326
|
+
database: shop
|
|
327
|
+
user: postgres
|
|
328
|
+
password: ${PG_PASSWORD}
|
|
329
|
+
query: "SELECT * FROM orders"
|
|
330
|
+
engine:
|
|
331
|
+
name: spark
|
|
332
|
+
warehouse: /tmp/killuhub-warehouse
|
|
333
|
+
catalog_name: local
|
|
334
|
+
catalog_type: hadoop
|
|
335
|
+
bronze:
|
|
336
|
+
table: local.bronze.orders
|
|
337
|
+
source_name: postgres.shop.orders
|
|
338
|
+
partition_by: [_ingestion_date]
|
|
339
|
+
```
|
|
340
|
+
|
|
341
|
+
```bash
|
|
342
|
+
python main.py --config config/bronze_postgres.yaml
|
|
343
|
+
```
|
|
344
|
+
|
|
345
|
+
### Streaming bronze (Kafka)
|
|
346
|
+
|
|
347
|
+
```yaml
|
|
348
|
+
type: bronze
|
|
349
|
+
mode: streaming
|
|
350
|
+
streaming:
|
|
351
|
+
trigger: processingTime
|
|
352
|
+
trigger_interval: "30 seconds"
|
|
353
|
+
checkpoint_location: ${CHECKPOINT_PATH:-/tmp/checkpoints}
|
|
354
|
+
output_mode: append
|
|
355
|
+
connector:
|
|
356
|
+
name: kafka
|
|
357
|
+
stream_format: kafka
|
|
358
|
+
stream_options:
|
|
359
|
+
kafka.bootstrap.servers: ${KAFKA_BROKERS:-localhost:9092}
|
|
360
|
+
subscribe: ${KAFKA_TOPIC:-orders}
|
|
361
|
+
startingOffsets: latest
|
|
362
|
+
engine:
|
|
363
|
+
name: spark
|
|
364
|
+
warehouse: /tmp/killuhub-warehouse
|
|
365
|
+
catalog_name: local
|
|
366
|
+
catalog_type: hadoop
|
|
367
|
+
bronze:
|
|
368
|
+
table: local.bronze.orders
|
|
369
|
+
source_name: kafka.orders
|
|
370
|
+
partition_by: [_ingestion_date]
|
|
371
|
+
```
|
|
372
|
+
|
|
373
|
+
### Custom connector
|
|
374
|
+
|
|
375
|
+
```python
|
|
376
|
+
from killuhub.core import BaseConnector, ConnectorConfig, default_registry
|
|
377
|
+
|
|
378
|
+
class MongoConnector(BaseConnector):
|
|
379
|
+
def connect(self): ...
|
|
380
|
+
def extract(self): yield from my_collection.find()
|
|
381
|
+
def close(self): ...
|
|
382
|
+
|
|
383
|
+
default_registry.register_connector("mongo", MongoConnector)
|
|
384
|
+
|
|
385
|
+
# Now usable in any config: connector.name: mongo
|
|
386
|
+
```
|
|
387
|
+
|
|
388
|
+
---
|
|
389
|
+
|
|
390
|
+
## Pipeline types
|
|
391
|
+
|
|
392
|
+
| Type | Description |
|
|
393
|
+
|------|-------------|
|
|
394
|
+
| `bronze` | Ingest raw data from a source, stamp metadata columns, write to Iceberg. Supports `batch` and `streaming` mode. |
|
|
395
|
+
| `silver` | Read from Bronze Iceberg table, deduplicate, cast types, add date dimensions, write to Silver Iceberg. Always `batch`. |
|
|
396
|
+
| `chain` | Run multiple stages in order (typically bronze → silver) from a single config file. |
|
|
397
|
+
|
|
398
|
+
### Batch strategies
|
|
399
|
+
|
|
400
|
+
Both `bronze` and `silver` support:
|
|
401
|
+
|
|
402
|
+
| `batch.strategy` | Description |
|
|
403
|
+
|-----------------|-------------|
|
|
404
|
+
| `incremental` | Read only records newer than the last saved watermark. Efficient for daily/hourly runs. |
|
|
405
|
+
| `full` | Read everything from the source on every run. Use for small tables or full reprocessing. |
|
|
406
|
+
|
|
407
|
+
---
|
|
408
|
+
|
|
409
|
+
## Available connectors
|
|
410
|
+
|
|
411
|
+
| Connector | Source type | Supports incremental | Mode |
|
|
412
|
+
|------------|-------------|---------------------|------|
|
|
413
|
+
| `postgres` | PostgreSQL | Yes (watermark column) | batch |
|
|
414
|
+
| `mysql` | MySQL | Yes (watermark column) | batch |
|
|
415
|
+
| `kafka` | Kafka topic | Yes (offset tracking) | batch + streaming |
|
|
416
|
+
| `rest_api` | HTTP/REST | Yes (watermark column) | batch |
|
|
417
|
+
|
|
418
|
+
> S3 is a **destination** in KilluHub (Iceberg tables stored on S3), not a source connector.
|
|
419
|
+
> Data in S3 files is read directly by Spark via the engine's `warehouse` path.
|
|
420
|
+
|
|
421
|
+
---
|
|
422
|
+
|
|
423
|
+
## Storage format comparison
|
|
424
|
+
|
|
425
|
+
| Feature | Iceberg ✅ (primary) | Delta Lake | Hudi |
|
|
426
|
+
|--------------------------|----------------------|------------|------------|
|
|
427
|
+
| Multi-engine | Spark, Flink, Trino, Presto, Snowflake | Spark, Databricks-first | Spark, Flink |
|
|
428
|
+
| ACID transactions | Yes (v2) | Yes | Yes |
|
|
429
|
+
| Schema evolution | Yes | Yes | Yes |
|
|
430
|
+
| Time travel | Yes (snapshots) | Yes (versions) | Yes (timeline) |
|
|
431
|
+
| Hidden partitioning | Yes | No | No |
|
|
432
|
+
| Open format | Yes (no vendor lock) | Partially | Yes |
|
|
433
|
+
| Databricks compatible | Unity Catalog v2 | Native | Yes |
|
|
434
|
+
| Upsert strategy | MERGE INTO SQL | MERGE INTO | Copy-on-Write / Merge-on-Read |
|
|
435
|
+
|
|
436
|
+
**Iceberg is the primary choice** because it was designed for multi-engine access. Write with Spark today, query with Trino tomorrow, move to Databricks later — Iceberg handles it without migration.
|
|
437
|
+
|
|
438
|
+
---
|
|
439
|
+
|
|
440
|
+
## Bronze metadata columns
|
|
441
|
+
|
|
442
|
+
Every row written by `BronzePipeline` gets these columns stamped automatically:
|
|
443
|
+
|
|
444
|
+
| Column | Type | Description |
|
|
445
|
+
|--------|------|-------------|
|
|
446
|
+
| `_ingested_at` | TIMESTAMP | When this batch was written |
|
|
447
|
+
| `_source_name` | STRING | Human label from `bronze.source_name` |
|
|
448
|
+
| `_batch_id` | STRING | UUID for this run (idempotency key) |
|
|
449
|
+
| `_batch_mode` | STRING | `"full"` or `"incremental"` |
|
|
450
|
+
| `_ingestion_date` | DATE | Date partition column |
|
|
451
|
+
|
|
452
|
+
---
|
|
453
|
+
|
|
454
|
+
## Deployment
|
|
455
|
+
|
|
456
|
+
KilluHub runs anywhere via the Helm chart. Set `platform` and fill the matching section:
|
|
457
|
+
|
|
458
|
+
```bash
|
|
459
|
+
# EKS (Spark Operator)
|
|
460
|
+
helm install killuhub-orders ./helm/killuhub \
|
|
461
|
+
-f helm/killuhub/values.yaml \
|
|
462
|
+
--set platform=eks \
|
|
463
|
+
--set pipeline.bronze.table=prod.bronze.orders
|
|
464
|
+
|
|
465
|
+
# Databricks
|
|
466
|
+
helm install killuhub-orders ./helm/killuhub \
|
|
467
|
+
-f helm/killuhub/values.yaml \
|
|
468
|
+
--set platform=databricks \
|
|
469
|
+
--set databricks.warehouse=s3://my-lake/warehouse
|
|
470
|
+
|
|
471
|
+
# AWS EMR
|
|
472
|
+
helm install killuhub-orders ./helm/killuhub \
|
|
473
|
+
-f helm/killuhub/values.yaml \
|
|
474
|
+
--set platform=emr \
|
|
475
|
+
--set emr.configBucket=s3://my-bucket/killuhub
|
|
476
|
+
```
|
|
477
|
+
|
|
478
|
+
The chart injects the correct `engine:` block (catalog type, warehouse path, Unity Catalog name, etc.) into the rendered `pipeline.yaml` ConfigMap. Your pipeline config never contains platform-specific details.
|
|
479
|
+
|
|
480
|
+
---
|
|
481
|
+
|
|
482
|
+
## Roadmap
|
|
483
|
+
|
|
484
|
+
- [ ] Gold pipeline — aggregations, business metrics, serving layer
|
|
485
|
+
- [ ] CDC connectors — Debezium (Postgres/MySQL), DynamoDB Streams
|
|
486
|
+
- [ ] Schema registry integration — Avro/Protobuf deserialization from Confluent Schema Registry
|
|
487
|
+
- [ ] Iceberg compaction service — scheduled file compaction and snapshot expiry
|
|
488
|
+
- [ ] Data quality checks — row-level anomaly detection, freshness SLAs
|
|
489
|
+
- [ ] OpenTelemetry tracing — spans for each pipeline stage, exportable to Datadog/Grafana
|
|
490
|
+
- [ ] Great Expectations integration — contract DSL backed by GE expectation suites
|
|
491
|
+
- [ ] REST API for pipeline management — trigger, status, history via FastAPI
|
|
492
|
+
|
|
493
|
+
---
|
|
494
|
+
|
|
495
|
+
## Dependencies
|
|
496
|
+
|
|
497
|
+
| Extra | Package | Purpose |
|
|
498
|
+
|-------------|----------------------|--------------------------------|
|
|
499
|
+
| `postgres` | psycopg2-binary | PostgreSQL driver |
|
|
500
|
+
| `mysql` | mysql-connector-python | MySQL driver |
|
|
501
|
+
| `kafka` | confluent-kafka | Kafka consumer |
|
|
502
|
+
| `rest_api` | requests | HTTP client |
|
|
503
|
+
| `spark` | pyspark | Spark processing engine |
|
|
504
|
+
| `flink` | apache-flink | Flink processing engine |
|
|
505
|
+
| `iceberg` | pyiceberg | Iceberg Python client |
|
|
506
|
+
| `delta` | delta-spark | Delta Lake writer |
|
|
507
|
+
| `scheduler` | apscheduler | Cron + interval job scheduling |
| `s3` | boto3, pyarrow | S3 access for Iceberg warehouses |
|
|
508
|
+
|
|
509
|
+
---
|
|
510
|
+
|
|
511
|
+
## Study guides
|
|
512
|
+
|
|
513
|
+
- [Core layer](docs/core/core.md) — Abstract interfaces, Registry pattern, Config system
|
|
514
|
+
- [Connectors](docs/connectors/connectors.md) — Postgres, MySQL, Kafka, REST API internals
|
|
515
|
+
- [Processing engines](docs/processing/processing.md) — Spark and Flink, when to use each
|
|
516
|
+
- [Storage layer](docs/storage/storage.md) — Iceberg deep dive, Delta and Hudi comparison
|
|
517
|
+
- [Ingestion layer](docs/ingestion/ingestion.md) — Pipeline orchestration and Scheduler
|
|
518
|
+
- [Layers](docs/layers/layers.md) — Bronze and Silver pipeline internals
|
|
519
|
+
- [Helm chart](docs/helm/helm.md) — Multi-platform deployment
|
|
520
|
+
- [Usage guide](docs/usage/usage.md) — End-to-end how-to for every scenario
|