killuhub 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. killuhub-0.1.0/PKG-INFO +520 -0
  2. killuhub-0.1.0/README.md +484 -0
  3. killuhub-0.1.0/killuhub/__init__.py +74 -0
  4. killuhub-0.1.0/killuhub/connectors/__init__.py +18 -0
  5. killuhub-0.1.0/killuhub/connectors/kafka/__init__.py +3 -0
  6. killuhub-0.1.0/killuhub/connectors/kafka/connector.py +88 -0
  7. killuhub-0.1.0/killuhub/connectors/mysql/__init__.py +3 -0
  8. killuhub-0.1.0/killuhub/connectors/mysql/connector.py +86 -0
  9. killuhub-0.1.0/killuhub/connectors/postgres/__init__.py +3 -0
  10. killuhub-0.1.0/killuhub/connectors/postgres/connector.py +59 -0
  11. killuhub-0.1.0/killuhub/connectors/rest_api/__init__.py +3 -0
  12. killuhub-0.1.0/killuhub/connectors/rest_api/connector.py +167 -0
  13. killuhub-0.1.0/killuhub/core/__init__.py +36 -0
  14. killuhub-0.1.0/killuhub/core/batch.py +135 -0
  15. killuhub-0.1.0/killuhub/core/config.py +38 -0
  16. killuhub-0.1.0/killuhub/core/connector_interface.py +45 -0
  17. killuhub-0.1.0/killuhub/core/contract.py +379 -0
  18. killuhub-0.1.0/killuhub/core/engine_interface.py +37 -0
  19. killuhub-0.1.0/killuhub/core/environment.py +82 -0
  20. killuhub-0.1.0/killuhub/core/exceptions.py +40 -0
  21. killuhub-0.1.0/killuhub/core/registry.py +70 -0
  22. killuhub-0.1.0/killuhub/core/storage_interface.py +30 -0
  23. killuhub-0.1.0/killuhub/ingestion/__init__.py +4 -0
  24. killuhub-0.1.0/killuhub/ingestion/pipeline.py +141 -0
  25. killuhub-0.1.0/killuhub/ingestion/scheduler.py +155 -0
  26. killuhub-0.1.0/killuhub/layers/__init__.py +9 -0
  27. killuhub-0.1.0/killuhub/layers/bronze/__init__.py +3 -0
  28. killuhub-0.1.0/killuhub/layers/bronze/pipeline.py +206 -0
  29. killuhub-0.1.0/killuhub/layers/silver/__init__.py +34 -0
  30. killuhub-0.1.0/killuhub/layers/silver/pipeline.py +373 -0
  31. killuhub-0.1.0/killuhub/layers/silver/state.py +239 -0
  32. killuhub-0.1.0/killuhub/layers/silver/transformations.py +259 -0
  33. killuhub-0.1.0/killuhub/layers/streaming/__init__.py +3 -0
  34. killuhub-0.1.0/killuhub/layers/streaming/pipeline.py +236 -0
  35. killuhub-0.1.0/killuhub/processing/__init__.py +8 -0
  36. killuhub-0.1.0/killuhub/processing/flink_engine.py +84 -0
  37. killuhub-0.1.0/killuhub/processing/spark_engine.py +236 -0
  38. killuhub-0.1.0/killuhub/storage/__init__.py +8 -0
  39. killuhub-0.1.0/killuhub/storage/delta/__init__.py +3 -0
  40. killuhub-0.1.0/killuhub/storage/delta/writer.py +57 -0
  41. killuhub-0.1.0/killuhub/storage/hudi/__init__.py +3 -0
  42. killuhub-0.1.0/killuhub/storage/hudi/writer.py +47 -0
  43. killuhub-0.1.0/killuhub/storage/iceberg/__init__.py +4 -0
  44. killuhub-0.1.0/killuhub/storage/iceberg/schema_manager.py +112 -0
  45. killuhub-0.1.0/killuhub/storage/iceberg/writer.py +110 -0
  46. killuhub-0.1.0/killuhub.egg-info/PKG-INFO +520 -0
  47. killuhub-0.1.0/killuhub.egg-info/SOURCES.txt +52 -0
  48. killuhub-0.1.0/killuhub.egg-info/dependency_links.txt +1 -0
  49. killuhub-0.1.0/killuhub.egg-info/requires.txt +38 -0
  50. killuhub-0.1.0/killuhub.egg-info/top_level.txt +1 -0
  51. killuhub-0.1.0/pyproject.toml +57 -0
  52. killuhub-0.1.0/setup.cfg +4 -0
  53. killuhub-0.1.0/tests/test_pipeline.py +111 -0
  54. killuhub-0.1.0/tests/test_registry.py +52 -0
@@ -0,0 +1,520 @@
1
+ Metadata-Version: 2.4
2
+ Name: killuhub
3
+ Version: 0.1.0
4
+ Summary: Pluggable data ingestion framework — connectors, Spark/Flink processing, Iceberg storage
5
+ Author: Kalleby Ramos
6
+ License: MIT
7
+ Requires-Python: >=3.11
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: pydantic>=2.0
10
+ Provides-Extra: postgres
11
+ Requires-Dist: psycopg2-binary>=2.9; extra == "postgres"
+ Provides-Extra: mysql
+ Requires-Dist: mysql-connector-python; extra == "mysql"
12
+ Provides-Extra: kafka
13
+ Requires-Dist: confluent-kafka>=2.3; extra == "kafka"
14
+ Provides-Extra: s3
15
+ Requires-Dist: boto3>=1.34; extra == "s3"
16
+ Requires-Dist: pyarrow>=15.0; extra == "s3"
17
+ Provides-Extra: rest-api
18
+ Requires-Dist: requests>=2.31; extra == "rest-api"
19
+ Provides-Extra: spark
20
+ Requires-Dist: pyspark>=3.5; extra == "spark"
21
+ Provides-Extra: flink
22
+ Requires-Dist: apache-flink>=1.18; extra == "flink"
23
+ Provides-Extra: iceberg
24
+ Requires-Dist: pyiceberg>=0.6; extra == "iceberg"
25
+ Provides-Extra: delta
26
+ Requires-Dist: delta-spark>=3.1; extra == "delta"
27
+ Provides-Extra: scheduler
28
+ Requires-Dist: apscheduler>=3.10; extra == "scheduler"
29
+ Provides-Extra: all
30
+ Requires-Dist: killuhub[iceberg,kafka,postgres,rest_api,s3,scheduler,spark]; extra == "all"
31
+ Provides-Extra: dev
32
+ Requires-Dist: pytest>=8.0; extra == "dev"
33
+ Requires-Dist: pytest-mock>=3.12; extra == "dev"
34
+ Requires-Dist: ruff>=0.4; extra == "dev"
35
+ Requires-Dist: mypy>=1.9; extra == "dev"
36
+
37
+ # KilluHub
38
+
39
+ A pluggable lakehouse ingestion framework — connect any source, process with Spark or Flink, land in Apache Iceberg following the medallion architecture (Bronze → Silver → Gold).
40
+
41
+ ---
42
+
43
+ ## Architecture
44
+
45
+ ```
46
+ ┌─────────────────────────────────────────────────────┐
47
+ │ Data Sources │
48
+ │ PostgreSQL · MySQL · Kafka · REST API │
49
+ └─────────────────────┬───────────────────────────────┘
50
+
51
+
52
+ ┌─────────────────────────────────────────────────────┐
53
+ │ Connectors │
54
+ │ BaseConnector — connect() · extract() · close() │
55
+ │ Server-side cursors · Pagination · Manual commits │
56
+ └─────────────────────┬───────────────────────────────┘
57
+ │ yields dict[str, Any] records
58
+
59
+ ┌─────────────────────────────────────────────────────┐
60
+ │ Processing Engines │
61
+ │ Spark 3.5 (batch + streaming) │
62
+ │ Flink (Table API, streaming-first) │
63
+ └─────────────────────┬───────────────────────────────┘
64
+ │ DataFrame (Spark or Flink)
65
+
66
+ ┌─────────────────────────────────────────────────────┐
67
+ │ Medallion Pipelines │
68
+ │ │
69
+ │ Bronze ────────────────────────────────────────── │
70
+ │ Raw data + metadata stamps + data contracts │
71
+ │ _ingested_at · _source_name · _batch_id │
72
+ │ │
73
+ │ Silver ────────────────────────────────────────── │
74
+ │ Dedup · type cast · date dims · upsert │
75
+ │ │
76
+ │ Gold (roadmap) ────────────────────────────────── │
77
+ │ Aggregations · business metrics · serving layer │
78
+ └─────────────────────┬───────────────────────────────┘
79
+
80
+
81
+ ┌─────────────────────────────────────────────────────┐
82
+ │ Storage Layer │
83
+ │ Apache Iceberg ✅ · Delta Lake · Apache Hudi │
84
+ │ append · overwrite · merge (MERGE INTO) │
85
+ │ S3 · HDFS · local filesystem │
86
+ └─────────────────────────────────────────────────────┘
87
+ ```
88
+
89
+ ---
90
+
91
+ ## Key capabilities
92
+
93
+ | Capability | Description |
94
+ |---|---|
95
+ | **Connector abstraction** | Any source (SQL, Kafka, REST) implements the same three-method interface. Swap sources without touching pipeline logic. |
96
+ | **Watermark-based ingestion** | Automatic incremental extraction — only records newer than the last saved watermark are fetched. No manual state tracking. |
97
+ | **Data contracts** | Schema enforcement at ingestion time: type validation, null checks, min/max bounds. Pipelines fail fast on bad data. |
98
+ | **Medallion pipelines** | Bronze stamps raw metadata; Silver deduplicates, casts types, and upserts. One `type: chain` config runs both in sequence. |
99
+ | **Multi-engine processing** | The same connector and config works with Spark (batch + structured streaming) and Flink (streaming-first). Switch with one string. |
100
+ | **Multi-storage lakehouse** | Iceberg (primary), Delta Lake, and Hudi are drop-in alternatives. Swap with `storage_writer_name`. |
101
+ | **Config-driven pipelines** | Everything — connector, engine, writer, watermark, contract — lives in one YAML file. No Python required for standard use cases. |
102
+ | **Multi-platform deployment** | One Helm chart targets EKS (Spark Operator), Databricks (Asset Bundles), and AWS EMR. Platform-specific details are injected by the chart, not the config. |
103
+
104
+ ---
105
+
106
+ ## Design principles
107
+
108
+ - **Engine agnostic** — Connectors yield plain Python dicts. They have no knowledge of Spark or Flink. Engines receive dicts and build their own DataFrames.
109
+ - **Storage agnostic** — Writers implement a single `write(df, table, mode)` interface. Iceberg, Delta, and Hudi are plug-in implementations.
110
+ - **Config-driven** — A complete pipeline — source, engine, transform, target, contract, schedule — is expressed as a YAML file. Pipeline logic lives in the framework, not in user code.
111
+ - **Medallion architecture** — Bronze is the canonical raw layer (immutable, metadata-stamped). Silver is always derived from Bronze, never from the source directly. This makes reprocessing safe and auditable.
112
+ - **Stateless pipelines** — Each `Pipeline.run()` creates fresh instances of connector, engine, and writer. No shared state between runs. Scheduled jobs can run in parallel safely.
113
+
114
+ ---
115
+
116
+ ## Data contracts
117
+
118
+ Every Bronze and Silver pipeline can declare a **data contract** — a schema specification that incoming data is validated against before it is written. Violations either fail the pipeline or log a warning, depending on `on_violation`.
119
+
120
+ ```yaml
121
+ contract:
122
+ on_violation: fail # fail | warn
123
+ min_row_count: 1 # fail if fewer rows than this were ingested
124
+
125
+ columns:
126
+ - name: order_id
127
+ type: long
128
+ nullable: false # null check enforced
129
+
130
+ - name: amount
131
+ type: double
132
+ nullable: false
133
+ min_value: 0 # value range enforced
134
+
135
+ - name: created_at
136
+ type: timestamp
137
+ nullable: false
138
+
139
+ - name: status
140
+ type: string
141
+ nullable: false
142
+ allowed_values: [pending, confirmed, shipped, cancelled]
143
+ ```
144
+
145
+ ### What is validated
146
+
147
+ | Rule | YAML key | Description |
148
+ |---|---|---|
149
+ | Type check | `type` | Column must match the declared Spark/Iceberg type |
150
+ | Null check | `nullable: false` | No nulls allowed in this column |
151
+ | Min value | `min_value` | Rejects rows below this numeric threshold |
152
+ | Max value | `max_value` | Rejects rows above this numeric threshold |
153
+ | Allowed values | `allowed_values` | Enum constraint — only these values are valid |
154
+ | Row count | `min_row_count` | Minimum rows the batch must contain |
155
+
156
+ Contracts make data quality a first-class concern — not an afterthought in a downstream dbt model.
157
+
158
+ ---
159
+
160
+ ## Project structure
161
+
162
+ ```
163
+ killuhub/
164
+ ├── core/
165
+ │ ├── connector_interface.py # BaseConnector ABC
166
+ │ ├── engine_interface.py # BaseEngine ABC
167
+ │ ├── storage_interface.py # BaseStorageWriter ABC
168
+ │ ├── config.py # ConnectorConfig, PipelineConfig
169
+ │ ├── registry.py # Registry + default_registry singleton
170
+ │ ├── batch.py # BatchConfig, BatchMode, StreamingConfig
171
+ │ ├── contract.py # ContractSpec, ContractValidator
172
+ │ └── exceptions.py # KilluHubError hierarchy
173
+
174
+ ├── connectors/
175
+ │ ├── postgres/connector.py # Server-side cursor, incremental watermark
176
+ │ ├── mysql/connector.py # Streaming cursor, fetchmany, optional TLS
177
+ │ ├── kafka/connector.py # confluent-kafka, batch + stream modes
178
+ │ └── rest_api/connector.py # Page / cursor / offset pagination
179
+
180
+ ├── processing/
181
+ │ ├── spark_engine.py # PySpark 3.5 + Iceberg catalog wiring
182
+ │ └── flink_engine.py # PyFlink Table API + Iceberg catalog
183
+
184
+ ├── storage/
185
+ │ ├── iceberg/
186
+ │ │ ├── writer.py # append / overwrite / merge via MERGE INTO
187
+ │ │ └── schema_manager.py # schema evolution, time travel, compaction
188
+ │ ├── delta/writer.py # Delta Lake (drop-in alternative)
189
+ │ └── hudi/writer.py # Apache Hudi (drop-in alternative)
190
+
191
+ ├── ingestion/
192
+ │ ├── pipeline.py # Low-level connector → engine → writer loop
193
+ │ └── scheduler.py # APScheduler cron + interval jobs
194
+
195
+ └── layers/
196
+ ├── bronze/pipeline.py # BronzePipeline — metadata stamping + contract
197
+ ├── silver/pipeline.py # SilverPipeline — dedup, cast, partition, upsert
198
+ ├── streaming/pipeline.py # StreamingBronzePipeline — Spark Structured Streaming
199
+ └── gold/ # (roadmap)
200
+
201
+ config/
202
+ ├── bronze_postgres.yaml # Batch bronze from Postgres
203
+ ├── bronze_kafka.yaml # Streaming bronze from Kafka
204
+ ├── silver_orders.yaml # Silver from bronze
205
+ ├── chain_orders.yaml # Bronze + Silver in one file (Postgres)
206
+ └── chain_api_orders.yaml # Bronze + Silver in one file (REST API)
207
+
208
+ helm/killuhub/ # Helm chart — deploys to EKS / Databricks / EMR
209
+ ├── Chart.yaml
210
+ ├── values.yaml # Single user-facing interface
211
+ └── templates/
212
+ ├── configmap.yaml # Renders pipeline YAML — used on all platforms
213
+ ├── spark-application.yaml # Spark Operator CRD (EKS only)
214
+ ├── rbac.yaml # ServiceAccount + IRSA (EKS only)
215
+ ├── databricks-job.yaml # Databricks Asset Bundle (Databricks only)
216
+ └── emr-step.yaml # EMR step + cluster JSON (EMR only)
217
+
218
+ docs/
219
+ ├── core/core.md
220
+ ├── connectors/connectors.md
221
+ ├── processing/processing.md
222
+ ├── storage/storage.md
223
+ ├── ingestion/ingestion.md
224
+ ├── layers/layers.md
225
+ ├── helm/helm.md
226
+ └── usage/usage.md # End-to-end how-to for every scenario
227
+ ```
228
+
229
+ ---
230
+
231
+ ## Quick start
232
+
233
+ ### Install
234
+
235
+ ```bash
236
+ # Minimum install
237
+ pip install -e .
238
+
239
+ # With extras for your use case
240
+ pip install -e ".[postgres,spark,iceberg]"
241
+ pip install -e ".[kafka,spark,iceberg]"
242
+ pip install -e ".[mysql,spark,iceberg]"
243
+ pip install -e ".[all]"
244
+ ```
245
+
246
+ ### Bronze + Silver in one config (recommended)
247
+
248
+ The standard pattern is a **chain** config: one file runs bronze first, then silver automatically.
249
+ `silver.bronze_table` is auto-injected from the bronze stage — you never repeat the table name.
250
+
251
+ ```yaml
252
+ # config/chain_orders.yaml
253
+ type: chain
254
+
255
+ engine:
256
+ name: spark
257
+ warehouse: ${WAREHOUSE:-/tmp/killuhub-warehouse}
258
+ catalog_name: ${CATALOG:-local}
259
+ catalog_type: hadoop
260
+
261
+ stages:
262
+ - name: bronze-orders
263
+ type: bronze
264
+ mode: batch
265
+ batch:
266
+ strategy: incremental
267
+ watermark_column: updated_at
268
+ initial_watermark: "2024-01-01T00:00:00"
269
+ connector:
270
+ name: postgres
271
+ config:
272
+ host: ${PG_HOST:-localhost}
273
+ database: shop
274
+ user: postgres
275
+ password: ${PG_PASSWORD}
276
+ query: "SELECT * FROM orders"
277
+ bronze:
278
+ table: local.bronze.orders
279
+ source_name: postgres.shop.orders
280
+ partition_by: [_ingestion_date]
281
+
282
+ - name: silver-orders
283
+ type: silver
284
+ mode: batch
285
+ batch:
286
+ strategy: incremental
287
+ watermark_column: _ingested_at
288
+ initial_watermark: "1970-01-01T00:00:00"
289
+ silver:
290
+ # bronze_table is auto-injected from the bronze stage above
291
+ silver_table: local.silver.orders
292
+ key_columns: [order_id]
293
+ date_columns: [created_at, updated_at]
294
+ type_map: { amount: double, quantity: int }
295
+ null_check_columns: [order_id, customer_id]
296
+ partition_by: [created_date]
297
+ state_store: json
298
+ contract:
299
+ on_violation: fail
300
+ columns:
301
+ - { name: order_id, type: long, nullable: false }
302
+ - { name: amount, type: double, nullable: false, min_value: 0 }
303
+ ```
304
+
305
+ ```bash
306
+ python main.py --config config/chain_orders.yaml
307
+
308
+ # Dry-run — validate config without executing
309
+ python main.py --config config/chain_orders.yaml --dry-run
310
+ ```
311
+
312
+ ### Standalone bronze (batch)
313
+
314
+ ```yaml
315
+ # config/bronze_postgres.yaml
316
+ type: bronze
317
+ mode: batch
318
+ batch:
319
+ strategy: incremental
320
+ watermark_column: updated_at
321
+ initial_watermark: "2024-01-01T00:00:00"
322
+ connector:
323
+ name: postgres
324
+ config:
325
+ host: localhost
326
+ database: shop
327
+ user: postgres
328
+ password: ${PG_PASSWORD}
329
+ query: "SELECT * FROM orders"
330
+ engine:
331
+ name: spark
332
+ warehouse: /tmp/killuhub-warehouse
333
+ catalog_name: local
334
+ catalog_type: hadoop
335
+ bronze:
336
+ table: local.bronze.orders
337
+ source_name: postgres.shop.orders
338
+ partition_by: [_ingestion_date]
339
+ ```
340
+
341
+ ```bash
342
+ python main.py --config config/bronze_postgres.yaml
343
+ ```
344
+
345
+ ### Streaming bronze (Kafka)
346
+
347
+ ```yaml
348
+ type: bronze
349
+ mode: streaming
350
+ streaming:
351
+ trigger: processingTime
352
+ trigger_interval: "30 seconds"
353
+ checkpoint_location: ${CHECKPOINT_PATH:-/tmp/checkpoints}
354
+ output_mode: append
355
+ connector:
356
+ name: kafka
357
+ stream_format: kafka
358
+ stream_options:
359
+ kafka.bootstrap.servers: ${KAFKA_BROKERS:-localhost:9092}
360
+ subscribe: ${KAFKA_TOPIC:-orders}
361
+ startingOffsets: latest
362
+ engine:
363
+ name: spark
364
+ warehouse: /tmp/killuhub-warehouse
365
+ catalog_name: local
366
+ catalog_type: hadoop
367
+ bronze:
368
+ table: local.bronze.orders
369
+ source_name: kafka.orders
370
+ partition_by: [_ingestion_date]
371
+ ```
372
+
373
+ ### Custom connector
374
+
375
+ ```python
376
+ from killuhub.core import BaseConnector, ConnectorConfig, default_registry
377
+
378
+ class MongoConnector(BaseConnector):
379
+ def connect(self): ...
380
+ def extract(self): yield from my_collection.find()
381
+ def close(self): ...
382
+
383
+ default_registry.register_connector("mongo", MongoConnector)
384
+
385
+ # Now usable in any config: connector.name: mongo
386
+ ```
387
+
388
+ ---
389
+
390
+ ## Pipeline types
391
+
392
+ | Type | Description |
393
+ |------|-------------|
394
+ | `bronze` | Ingest raw data from a source, stamp metadata columns, write to Iceberg. Supports `batch` and `streaming` mode. |
395
+ | `silver` | Read from Bronze Iceberg table, deduplicate, cast types, add date dimensions, write to Silver Iceberg. Always `batch`. |
396
+ | `chain` | Run multiple stages in order (typically bronze → silver) from a single config file. |
397
+
398
+ ### Batch strategies
399
+
400
+ Both `bronze` and `silver` support:
401
+
402
+ | `batch.strategy` | Description |
403
+ |-----------------|-------------|
404
+ | `incremental` | Read only records newer than the last saved watermark. Efficient for daily/hourly runs. |
405
+ | `full` | Read everything from the source on every run. Use for small tables or full reprocessing. |
406
+
407
+ ---
408
+
409
+ ## Available connectors
410
+
411
+ | Connector | Source type | Supports incremental | Mode |
412
+ |------------|-------------|---------------------|------|
413
+ | `postgres` | PostgreSQL | Yes (watermark column) | batch |
414
+ | `mysql` | MySQL | Yes (watermark column) | batch |
415
+ | `kafka` | Kafka topic | Yes (offset tracking) | batch + streaming |
416
+ | `rest_api` | HTTP/REST | Yes (watermark column) | batch |
417
+
418
+ > S3 is a **destination** in KilluHub (Iceberg tables stored on S3), not a source connector.
419
+ > Data in S3 files is read directly by Spark via the engine's `warehouse` path.
420
+
421
+ ---
422
+
423
+ ## Storage format comparison
424
+
425
+ | Feature | Iceberg ✅ (primary) | Delta Lake | Hudi |
426
+ |--------------------------|----------------------|------------|------------|
427
+ | Multi-engine | Spark, Flink, Trino, Presto, Snowflake | Spark, Databricks-first | Spark, Flink |
428
+ | ACID transactions | Yes (v2) | Yes | Yes |
429
+ | Schema evolution | Yes | Yes | Yes |
430
+ | Time travel | Yes (snapshots) | Yes (versions) | Yes (timeline) |
431
+ | Hidden partitioning | Yes | No | No |
432
+ | Open format | Yes (no vendor lock) | Partially | Yes |
433
+ | Databricks compatible | Unity Catalog v2 | Native | Yes |
434
+ | Upsert strategy | MERGE INTO SQL | MERGE INTO | Copy-on-Write / Merge-on-Read |
435
+
436
+ **Iceberg is the primary choice** because it was designed for multi-engine access. Write with Spark today, query with Trino tomorrow, move to Databricks later — Iceberg handles it without migration.
437
+
438
+ ---
439
+
440
+ ## Bronze metadata columns
441
+
442
+ Every row written by `BronzePipeline` gets these columns stamped automatically:
443
+
444
+ | Column | Type | Description |
445
+ |--------|------|-------------|
446
+ | `_ingested_at` | TIMESTAMP | When this batch was written |
447
+ | `_source_name` | STRING | Human label from `bronze.source_name` |
448
+ | `_batch_id` | STRING | UUID for this run (idempotency key) |
449
+ | `_batch_mode` | STRING | `"full"` or `"incremental"` |
450
+ | `_ingestion_date` | DATE | Date partition column |
451
+
452
+ ---
453
+
454
+ ## Deployment
455
+
456
+ KilluHub runs anywhere via the Helm chart. Set `platform` and fill the matching section:
457
+
458
+ ```bash
459
+ # EKS (Spark Operator)
460
+ helm install killuhub-orders ./helm/killuhub \
461
+ -f helm/killuhub/values.yaml \
462
+ --set platform=eks \
463
+ --set pipeline.bronze.table=prod.bronze.orders
464
+
465
+ # Databricks
466
+ helm install killuhub-orders ./helm/killuhub \
467
+ -f helm/killuhub/values.yaml \
468
+ --set platform=databricks \
469
+ --set databricks.warehouse=s3://my-lake/warehouse
470
+
471
+ # AWS EMR
472
+ helm install killuhub-orders ./helm/killuhub \
473
+ -f helm/killuhub/values.yaml \
474
+ --set platform=emr \
475
+ --set emr.configBucket=s3://my-bucket/killuhub
476
+ ```
477
+
478
+ The chart injects the correct `engine:` block (catalog type, warehouse path, Unity Catalog name, etc.) into the rendered `pipeline.yaml` ConfigMap. Your pipeline config never contains platform-specific details.
479
+
480
+ ---
481
+
482
+ ## Roadmap
483
+
484
+ - [ ] Gold pipeline — aggregations, business metrics, serving layer
485
+ - [ ] CDC connectors — Debezium (Postgres/MySQL), DynamoDB Streams
486
+ - [ ] Schema registry integration — Avro/Protobuf deserialization from Confluent Schema Registry
487
+ - [ ] Iceberg compaction service — scheduled file compaction and snapshot expiry
488
+ - [ ] Data quality checks — row-level anomaly detection, freshness SLAs
489
+ - [ ] OpenTelemetry tracing — spans for each pipeline stage, exportable to Datadog/Grafana
490
+ - [ ] Great Expectations integration — contract DSL backed by GE expectation suites
491
+ - [ ] REST API for pipeline management — trigger, status, history via FastAPI
492
+
493
+ ---
494
+
495
+ ## Dependencies
496
+
497
+ | Extra | Package | Purpose |
498
+ |-------------|----------------------|--------------------------------|
499
+ | `postgres` | psycopg2-binary | PostgreSQL driver |
500
+ | `mysql` | mysql-connector-python | MySQL driver |
501
+ | `kafka` | confluent-kafka | Kafka consumer |
502
+ | `rest_api` | requests | HTTP client |
503
+ | `spark` | pyspark | Spark processing engine |
504
+ | `flink` | apache-flink | Flink processing engine |
505
+ | `iceberg` | pyiceberg | Iceberg Python client |
506
+ | `delta` | delta-spark | Delta Lake writer |
507
+ | `scheduler` | apscheduler | Cron + interval job scheduling |
508
+
509
+ ---
510
+
511
+ ## Study guides
512
+
513
+ - [Core layer](docs/core/core.md) — Abstract interfaces, Registry pattern, Config system
514
+ - [Connectors](docs/connectors/connectors.md) — Postgres, MySQL, Kafka, REST API internals
515
+ - [Processing engines](docs/processing/processing.md) — Spark and Flink, when to use each
516
+ - [Storage layer](docs/storage/storage.md) — Iceberg deep dive, Delta and Hudi comparison
517
+ - [Ingestion layer](docs/ingestion/ingestion.md) — Pipeline orchestration and Scheduler
518
+ - [Layers](docs/layers/layers.md) — Bronze and Silver pipeline internals
519
+ - [Helm chart](docs/helm/helm.md) — Multi-platform deployment
520
+ - [Usage guide](docs/usage/usage.md) — End-to-end how-to for every scenario