tech-hub-skills 1.5.1 → 1.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/LICENSE +21 -21
- package/.claude/README.md +291 -291
- package/.claude/bin/cli.js +266 -266
- package/.claude/bin/copilot.js +182 -182
- package/.claude/bin/postinstall.js +42 -42
- package/.claude/commands/README.md +336 -336
- package/.claude/commands/ai-engineer.md +104 -104
- package/.claude/commands/aws.md +143 -143
- package/.claude/commands/azure.md +149 -149
- package/.claude/commands/backend-developer.md +108 -108
- package/.claude/commands/code-review.md +399 -399
- package/.claude/commands/compliance-automation.md +747 -747
- package/.claude/commands/compliance-officer.md +108 -108
- package/.claude/commands/data-engineer.md +113 -113
- package/.claude/commands/data-governance.md +102 -102
- package/.claude/commands/data-scientist.md +123 -123
- package/.claude/commands/database-admin.md +109 -109
- package/.claude/commands/devops.md +160 -160
- package/.claude/commands/docker.md +160 -160
- package/.claude/commands/enterprise-dashboard.md +613 -613
- package/.claude/commands/finops.md +184 -184
- package/.claude/commands/frontend-developer.md +108 -108
- package/.claude/commands/gcp.md +143 -143
- package/.claude/commands/ml-engineer.md +115 -115
- package/.claude/commands/mlops.md +187 -187
- package/.claude/commands/network-engineer.md +109 -109
- package/.claude/commands/optimization-advisor.md +329 -329
- package/.claude/commands/orchestrator.md +623 -623
- package/.claude/commands/platform-engineer.md +102 -102
- package/.claude/commands/process-automation.md +226 -226
- package/.claude/commands/process-changelog.md +184 -184
- package/.claude/commands/process-documentation.md +484 -484
- package/.claude/commands/process-kanban.md +324 -324
- package/.claude/commands/process-versioning.md +214 -214
- package/.claude/commands/product-designer.md +104 -104
- package/.claude/commands/project-starter.md +443 -443
- package/.claude/commands/qa-engineer.md +109 -109
- package/.claude/commands/security-architect.md +135 -135
- package/.claude/commands/sre.md +109 -109
- package/.claude/commands/system-design.md +126 -126
- package/.claude/commands/technical-writer.md +101 -101
- package/.claude/package.json +46 -46
- package/.claude/roles/ai-engineer/skills/01-prompt-engineering/README.md +252 -252
- package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_ab_tester.py +356 -356
- package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_template_manager.py +274 -274
- package/.claude/roles/ai-engineer/skills/01-prompt-engineering/token_cost_estimator.py +324 -324
- package/.claude/roles/ai-engineer/skills/02-rag-pipeline/README.md +448 -448
- package/.claude/roles/ai-engineer/skills/02-rag-pipeline/document_chunker.py +336 -336
- package/.claude/roles/ai-engineer/skills/02-rag-pipeline/rag_pipeline.sql +213 -213
- package/.claude/roles/ai-engineer/skills/03-agent-orchestration/README.md +599 -599
- package/.claude/roles/ai-engineer/skills/04-llm-guardrails/README.md +735 -735
- package/.claude/roles/ai-engineer/skills/05-vector-embeddings/README.md +711 -711
- package/.claude/roles/ai-engineer/skills/06-llm-evaluation/README.md +777 -777
- package/.claude/roles/azure/skills/01-infrastructure-fundamentals/README.md +264 -264
- package/.claude/roles/azure/skills/02-data-factory/README.md +264 -264
- package/.claude/roles/azure/skills/03-synapse-analytics/README.md +264 -264
- package/.claude/roles/azure/skills/04-databricks/README.md +264 -264
- package/.claude/roles/azure/skills/05-functions/README.md +264 -264
- package/.claude/roles/azure/skills/06-kubernetes-service/README.md +264 -264
- package/.claude/roles/azure/skills/07-openai-service/README.md +264 -264
- package/.claude/roles/azure/skills/08-machine-learning/README.md +264 -264
- package/.claude/roles/azure/skills/09-storage-adls/README.md +264 -264
- package/.claude/roles/azure/skills/10-networking/README.md +264 -264
- package/.claude/roles/azure/skills/11-sql-cosmos/README.md +264 -264
- package/.claude/roles/azure/skills/12-event-hubs/README.md +264 -264
- package/.claude/roles/code-review/skills/01-automated-code-review/README.md +394 -394
- package/.claude/roles/code-review/skills/02-pr-review-workflow/README.md +427 -427
- package/.claude/roles/code-review/skills/03-code-quality-gates/README.md +518 -518
- package/.claude/roles/code-review/skills/04-reviewer-assignment/README.md +504 -504
- package/.claude/roles/code-review/skills/05-review-analytics/README.md +540 -540
- package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/README.md +550 -550
- package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/bronze_ingestion.py +337 -337
- package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/medallion_queries.sql +300 -300
- package/.claude/roles/data-engineer/skills/02-etl-pipeline/README.md +580 -580
- package/.claude/roles/data-engineer/skills/03-data-quality/README.md +579 -579
- package/.claude/roles/data-engineer/skills/04-streaming-pipelines/README.md +608 -608
- package/.claude/roles/data-engineer/skills/05-performance-optimization/README.md +547 -547
- package/.claude/roles/data-governance/skills/01-data-catalog/README.md +112 -112
- package/.claude/roles/data-governance/skills/02-data-lineage/README.md +129 -129
- package/.claude/roles/data-governance/skills/03-data-quality-framework/README.md +182 -182
- package/.claude/roles/data-governance/skills/04-access-control/README.md +39 -39
- package/.claude/roles/data-governance/skills/05-master-data-management/README.md +40 -40
- package/.claude/roles/data-governance/skills/06-compliance-privacy/README.md +46 -46
- package/.claude/roles/data-scientist/skills/01-eda-automation/README.md +230 -230
- package/.claude/roles/data-scientist/skills/01-eda-automation/eda_generator.py +446 -446
- package/.claude/roles/data-scientist/skills/02-statistical-modeling/README.md +264 -264
- package/.claude/roles/data-scientist/skills/03-feature-engineering/README.md +264 -264
- package/.claude/roles/data-scientist/skills/04-predictive-modeling/README.md +264 -264
- package/.claude/roles/data-scientist/skills/05-customer-analytics/README.md +264 -264
- package/.claude/roles/data-scientist/skills/06-campaign-analysis/README.md +264 -264
- package/.claude/roles/data-scientist/skills/07-experimentation/README.md +264 -264
- package/.claude/roles/data-scientist/skills/08-data-visualization/README.md +264 -264
- package/.claude/roles/devops/skills/01-cicd-pipeline/README.md +264 -264
- package/.claude/roles/devops/skills/02-container-orchestration/README.md +264 -264
- package/.claude/roles/devops/skills/03-infrastructure-as-code/README.md +264 -264
- package/.claude/roles/devops/skills/04-gitops/README.md +264 -264
- package/.claude/roles/devops/skills/05-environment-management/README.md +264 -264
- package/.claude/roles/devops/skills/06-automated-testing/README.md +264 -264
- package/.claude/roles/devops/skills/07-release-management/README.md +264 -264
- package/.claude/roles/devops/skills/08-monitoring-alerting/README.md +264 -264
- package/.claude/roles/devops/skills/09-devsecops/README.md +265 -265
- package/.claude/roles/finops/skills/01-cost-visibility/README.md +264 -264
- package/.claude/roles/finops/skills/02-resource-tagging/README.md +264 -264
- package/.claude/roles/finops/skills/03-budget-management/README.md +264 -264
- package/.claude/roles/finops/skills/04-reserved-instances/README.md +264 -264
- package/.claude/roles/finops/skills/05-spot-optimization/README.md +264 -264
- package/.claude/roles/finops/skills/06-storage-tiering/README.md +264 -264
- package/.claude/roles/finops/skills/07-compute-rightsizing/README.md +264 -264
- package/.claude/roles/finops/skills/08-chargeback/README.md +264 -264
- package/.claude/roles/ml-engineer/skills/01-mlops-pipeline/README.md +566 -566
- package/.claude/roles/ml-engineer/skills/02-feature-engineering/README.md +655 -655
- package/.claude/roles/ml-engineer/skills/03-model-training/README.md +704 -704
- package/.claude/roles/ml-engineer/skills/04-model-serving/README.md +845 -845
- package/.claude/roles/ml-engineer/skills/05-model-monitoring/README.md +874 -874
- package/.claude/roles/mlops/skills/01-ml-pipeline-orchestration/README.md +264 -264
- package/.claude/roles/mlops/skills/02-experiment-tracking/README.md +264 -264
- package/.claude/roles/mlops/skills/03-model-registry/README.md +264 -264
- package/.claude/roles/mlops/skills/04-feature-store/README.md +264 -264
- package/.claude/roles/mlops/skills/05-model-deployment/README.md +264 -264
- package/.claude/roles/mlops/skills/06-model-observability/README.md +264 -264
- package/.claude/roles/mlops/skills/07-data-versioning/README.md +264 -264
- package/.claude/roles/mlops/skills/08-ab-testing/README.md +264 -264
- package/.claude/roles/mlops/skills/09-automated-retraining/README.md +264 -264
- package/.claude/roles/platform-engineer/skills/01-internal-developer-platform/README.md +153 -153
- package/.claude/roles/platform-engineer/skills/02-self-service-infrastructure/README.md +57 -57
- package/.claude/roles/platform-engineer/skills/03-slo-sli-management/README.md +59 -59
- package/.claude/roles/platform-engineer/skills/04-developer-experience/README.md +57 -57
- package/.claude/roles/platform-engineer/skills/05-incident-management/README.md +73 -73
- package/.claude/roles/platform-engineer/skills/06-capacity-management/README.md +59 -59
- package/.claude/roles/product-designer/skills/01-requirements-discovery/README.md +407 -407
- package/.claude/roles/product-designer/skills/02-user-research/README.md +382 -382
- package/.claude/roles/product-designer/skills/03-brainstorming-ideation/README.md +437 -437
- package/.claude/roles/product-designer/skills/04-ux-design/README.md +496 -496
- package/.claude/roles/product-designer/skills/05-product-market-fit/README.md +376 -376
- package/.claude/roles/product-designer/skills/06-stakeholder-management/README.md +412 -412
- package/.claude/roles/security-architect/skills/01-pii-detection/README.md +319 -319
- package/.claude/roles/security-architect/skills/02-threat-modeling/README.md +264 -264
- package/.claude/roles/security-architect/skills/03-infrastructure-security/README.md +264 -264
- package/.claude/roles/security-architect/skills/04-iam/README.md +264 -264
- package/.claude/roles/security-architect/skills/05-application-security/README.md +264 -264
- package/.claude/roles/security-architect/skills/06-secrets-management/README.md +264 -264
- package/.claude/roles/security-architect/skills/07-security-monitoring/README.md +264 -264
- package/.claude/roles/system-design/skills/01-architecture-patterns/README.md +337 -337
- package/.claude/roles/system-design/skills/02-requirements-engineering/README.md +264 -264
- package/.claude/roles/system-design/skills/03-scalability/README.md +264 -264
- package/.claude/roles/system-design/skills/04-high-availability/README.md +264 -264
- package/.claude/roles/system-design/skills/05-cost-optimization-design/README.md +264 -264
- package/.claude/roles/system-design/skills/06-api-design/README.md +264 -264
- package/.claude/roles/system-design/skills/07-observability-architecture/README.md +264 -264
- package/.claude/roles/system-design/skills/08-process-automation/PROCESS_TEMPLATE.md +336 -336
- package/.claude/roles/system-design/skills/08-process-automation/README.md +521 -521
- package/.claude/roles/system-design/skills/08-process-automation/ai_prompt_generator.py +744 -744
- package/.claude/roles/system-design/skills/08-process-automation/automation_recommender.py +688 -688
- package/.claude/roles/system-design/skills/08-process-automation/plan_generator.py +679 -679
- package/.claude/roles/system-design/skills/08-process-automation/process_analyzer.py +528 -528
- package/.claude/roles/system-design/skills/08-process-automation/process_parser.py +684 -684
- package/.claude/roles/system-design/skills/08-process-automation/role_matcher.py +615 -615
- package/.claude/skills/README.md +336 -336
- package/.claude/skills/ai-engineer.md +104 -104
- package/.claude/skills/aws.md +143 -143
- package/.claude/skills/azure.md +149 -149
- package/.claude/skills/backend-developer.md +108 -108
- package/.claude/skills/code-review.md +399 -399
- package/.claude/skills/compliance-automation.md +747 -747
- package/.claude/skills/compliance-officer.md +108 -108
- package/.claude/skills/data-engineer.md +113 -113
- package/.claude/skills/data-governance.md +102 -102
- package/.claude/skills/data-scientist.md +123 -123
- package/.claude/skills/database-admin.md +109 -109
- package/.claude/skills/devops.md +160 -160
- package/.claude/skills/docker.md +160 -160
- package/.claude/skills/enterprise-dashboard.md +613 -613
- package/.claude/skills/finops.md +184 -184
- package/.claude/skills/frontend-developer.md +108 -108
- package/.claude/skills/gcp.md +143 -143
- package/.claude/skills/ml-engineer.md +115 -115
- package/.claude/skills/mlops.md +187 -187
- package/.claude/skills/network-engineer.md +109 -109
- package/.claude/skills/optimization-advisor.md +329 -329
- package/.claude/skills/orchestrator.md +623 -623
- package/.claude/skills/platform-engineer.md +102 -102
- package/.claude/skills/process-automation.md +226 -226
- package/.claude/skills/process-changelog.md +184 -184
- package/.claude/skills/process-documentation.md +484 -484
- package/.claude/skills/process-kanban.md +324 -324
- package/.claude/skills/process-versioning.md +214 -214
- package/.claude/skills/product-designer.md +104 -104
- package/.claude/skills/project-starter.md +443 -443
- package/.claude/skills/qa-engineer.md +109 -109
- package/.claude/skills/security-architect.md +135 -135
- package/.claude/skills/sre.md +109 -109
- package/.claude/skills/system-design.md +126 -126
- package/.claude/skills/technical-writer.md +101 -101
- package/.gitattributes +2 -2
- package/GITHUB_COPILOT.md +106 -106
- package/README.md +192 -184
- package/package.json +16 -8
package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/bronze_ingestion.py

@@ -1,337 +1,337 @@

The hunk removes all 337 lines and re-adds the same 337 lines verbatim, so the file content is unchanged between 1.5.1 and 1.5.2; it is shown once below.

```python
"""
Bronze Layer: Raw Data Ingestion
Ingest data from multiple sources with validation and error handling.
"""

import json
import os
from datetime import datetime
from pathlib import Path
from typing import Dict, Any, List, Optional, Union

import pandas as pd
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
from pyspark.sql import functions as F
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class BronzeLoader:
    """
    Bronze layer ingestion with schema validation and audit logging.

    Bronze layer principles:
    - Append-only (preserve full history)
    - Raw data with minimal transformation
    - Add metadata (ingestion timestamp, source, file name)
    - Schema validation
    - Error quarantine
    """

    def __init__(
        self,
        spark: Optional[SparkSession] = None,
        bronze_path: str = "/lakehouse/bronze",
        quarantine_path: str = "/lakehouse/quarantine"
    ):
        """
        Initialize Bronze loader.

        Args:
            spark: SparkSession (creates one if not provided)
            bronze_path: Path to bronze layer storage
            quarantine_path: Path for invalid records
        """
        self.spark = spark or self._create_spark_session()
        self.bronze_path = bronze_path
        self.quarantine_path = quarantine_path

        # Create directories if they don't exist
        Path(bronze_path).mkdir(parents=True, exist_ok=True)
        Path(quarantine_path).mkdir(parents=True, exist_ok=True)

    def _create_spark_session(self) -> SparkSession:
        """Create Spark session with Delta Lake support."""
        return SparkSession.builder \
            .appName("BronzeIngestion") \
            .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
            .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
            .config("spark.databricks.delta.retentionDurationCheck.enabled", "false") \
            .getOrCreate()

    def ingest_from_source(
        self,
        source_path: str,
        table_name: str,
        source_format: str = "json",
        schema: Optional[StructType] = None,
        options: Optional[Dict[str, str]] = None
    ) -> Dict[str, Any]:
        """
        Ingest data from source into Bronze layer.

        Args:
            source_path: Path to source data
            table_name: Name for bronze table
            source_format: Format (json, csv, parquet, etc.)
            schema: Optional schema to enforce
            options: Additional read options

        Returns:
            Ingestion metrics
        """
        logger.info(f"Starting ingestion: {table_name} from {source_path}")

        try:
            # Read source data
            df = self._read_source(source_path, source_format, schema, options)

            # Add bronze layer metadata
            df_bronze = self._add_bronze_metadata(df, source_path, table_name)

            # Validate schema if provided
            if schema:
                df_bronze = self._validate_schema(df_bronze, schema)

            # Write to bronze layer
            bronze_table_path = f"{self.bronze_path}/{table_name}"

            df_bronze.write \
                .format("delta") \
                .mode("append") \
                .option("mergeSchema", "true") \
                .save(bronze_table_path)

            # Collect metrics
            record_count = df_bronze.count()

            metrics = {
                "status": "success",
                "table_name": table_name,
                "records_ingested": record_count,
                "source_path": source_path,
                "ingestion_timestamp": datetime.now().isoformat(),
                "bronze_path": bronze_table_path
            }

            logger.info(f"Successfully ingested {record_count} records to {table_name}")

            return metrics

        except Exception as e:
            logger.error(f"Ingestion failed: {str(e)}")

            return {
                "status": "failed",
                "table_name": table_name,
                "error": str(e),
                "ingestion_timestamp": datetime.now().isoformat()
            }

    def _read_source(
        self,
        source_path: str,
        source_format: str,
        schema: Optional[StructType] = None,
        options: Optional[Dict[str, str]] = None
    ) -> DataFrame:
        """Read data from source."""
        options = options or {}

        reader = self.spark.read.format(source_format)

        if schema:
            reader = reader.schema(schema)

        for key, value in options.items():
            reader = reader.option(key, value)

        return reader.load(source_path)

    def _add_bronze_metadata(
        self,
        df: DataFrame,
        source_path: str,
        table_name: str
    ) -> DataFrame:
        """Add bronze layer audit columns."""
        return df \
            .withColumn("_bronze_ingestion_timestamp", F.current_timestamp()) \
            .withColumn("_bronze_source_path", F.lit(source_path)) \
            .withColumn("_bronze_table_name", F.lit(table_name)) \
            .withColumn("_bronze_ingestion_date", F.current_date())

    def _validate_schema(
        self,
        df: DataFrame,
        expected_schema: StructType
    ) -> DataFrame:
        """
        Validate DataFrame against expected schema.

        Quarantine records that don't match schema.
        """
        # In production, implement sophisticated schema validation
        # For now, we return the df as-is
        return df

    def ingest_csv(
        self,
        csv_path: str,
        table_name: str,
        delimiter: str = ",",
        header: bool = True,
        schema: Optional[StructType] = None
    ) -> Dict[str, Any]:
        """Convenience method for CSV ingestion."""
        options = {
            "delimiter": delimiter,
            "header": str(header).lower(),
            "inferSchema": "true" if schema is None else "false"
        }

        return self.ingest_from_source(
            source_path=csv_path,
            table_name=table_name,
            source_format="csv",
            schema=schema,
            options=options
        )

    def ingest_json(
        self,
        json_path: str,
        table_name: str,
        multiline: bool = False,
        schema: Optional[StructType] = None
    ) -> Dict[str, Any]:
        """Convenience method for JSON ingestion."""
        options = {
            "multiLine": str(multiline).lower()
        }

        return self.ingest_from_source(
            source_path=json_path,
            table_name=table_name,
            source_format="json",
            schema=schema,
            options=options
        )

    def ingest_parquet(
        self,
        parquet_path: str,
        table_name: str
    ) -> Dict[str, Any]:
        """Convenience method for Parquet ingestion."""
        return self.ingest_from_source(
            source_path=parquet_path,
            table_name=table_name,
            source_format="parquet"
        )

    def create_bronze_table(
        self,
        table_name: str,
        schema: StructType,
        partition_by: Optional[List[str]] = None
    ) -> None:
        """Create an empty bronze table with schema."""
        bronze_table_path = f"{self.bronze_path}/{table_name}"

        # Create empty DataFrame with schema
        empty_df = self.spark.createDataFrame([], schema)

        # Add bronze metadata columns
        bronze_df = self._add_bronze_metadata(empty_df, "initialized", table_name)

        # Write table
        writer = bronze_df.write.format("delta").mode("overwrite")

        if partition_by:
            writer = writer.partitionBy(*partition_by)

        writer.save(bronze_table_path)

        logger.info(f"Created bronze table: {table_name}")


# Example CRM schema
CRM_LEADS_SCHEMA = StructType([
    StructField("lead_id", StringType(), False),
    StructField("email", StringType(), True),
    StructField("company", StringType(), True),
    StructField("industry", StringType(), True),
    StructField("company_size", StringType(), True),
    StructField("job_title", StringType(), True),
    StructField("lead_source", StringType(), True),
    StructField("created_date", TimestampType(), True),
    StructField("lead_score", IntegerType(), True),
    StructField("status", StringType(), True)
])


# Example usage
if __name__ == "__main__":
    print("=" * 80)
    print("Bronze Layer Ingestion Demo")
    print("=" * 80)

    # Create sample data
    sample_data = [
        {
            "lead_id": "L001",
            "email": "john@techcorp.com",
            "company": "TechCorp",
            "industry": "Software",
            "company_size": "100-500",
            "job_title": "Data Scientist",
            "lead_source": "Website",
            "created_date": "2025-01-15T10:30:00",
            "lead_score": 85,
            "status": "New"
        },
        {
            "lead_id": "L002",
            "email": "sarah@datainc.com",
            "company": "Data Inc",
            "industry": "Analytics",
            "company_size": "50-100",
            "job_title": "ML Engineer",
            "lead_source": "LinkedIn",
            "created_date": "2025-01-16T14:20:00",
            "lead_score": 92,
            "status": "Qualified"
        }
    ]

    # Save as JSON
    sample_path = "/tmp/sample_crm_leads.json"
    with open(sample_path, 'w') as f:
        json.dump(sample_data, f)

    # Initialize Bronze loader
    bronze = BronzeLoader(
        bronze_path="./lakehouse/bronze",
        quarantine_path="./lakehouse/quarantine"
    )

    # Ingest data
    metrics = bronze.ingest_json(
        json_path=sample_path,
        table_name="crm_leads",
        multiline=True,
        schema=CRM_LEADS_SCHEMA
    )

    print("\nIngestion Metrics:")
    print(json.dumps(metrics, indent=2))

    # Query bronze table
    print("\nBronze Table Sample:")
    bronze_df = bronze.spark.read.format("delta").load("./lakehouse/bronze/crm_leads")
    bronze_df.show(truncate=False)

    print(f"\nBronze table row count: {bronze_df.count()}")
```
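The shipped `_validate_schema` is a pass-through, even though the class lists error quarantine as a bronze principle and accepts a `quarantine_path`. Below is a minimal sketch of what a quarantine-aware check could look like under the same PySpark/Delta setup; the helper name, the required-field rule, and the `_quarantine_reason` column are illustrative assumptions, not part of the package:

```python
from pyspark.sql import DataFrame, functions as F
from pyspark.sql.types import StructType


def validate_with_quarantine(df: DataFrame, expected_schema: StructType,
                             quarantine_path: str, table_name: str) -> DataFrame:
    """Illustrative sketch: split rows on a simple schema check and quarantine failures.

    Rows missing a value in any non-nullable field of the expected schema are
    appended to a Delta table under the quarantine path; valid rows are returned.
    """
    required = [f.name for f in expected_schema.fields if not f.nullable]

    # A row is valid when every required column is non-null.
    valid_cond = F.lit(True)
    for col in required:
        valid_cond = valid_cond & F.col(col).isNotNull()

    valid_df = df.filter(valid_cond)
    invalid_df = df.filter(~valid_cond)

    if invalid_df.limit(1).count() > 0:
        invalid_df \
            .withColumn("_quarantine_reason", F.lit("missing required field")) \
            .write.format("delta").mode("append") \
            .save(f"{quarantine_path}/{table_name}")

    return valid_df
```

Splitting on a filter keeps both branches as plain DataFrames, so valid rows would continue through the existing `ingest_from_source` write path unchanged.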
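Because bronze writes append, re-running an ingestion adds another copy of the same source rows; the audit columns added by `_add_bronze_metadata` are what let downstream jobs select a specific batch. A short sketch of reading back only the latest batch from the demo table, assuming delta-spark is installed and the table was written as in the `__main__` example above:

```python
from pyspark.sql import SparkSession, functions as F

# Same Delta-enabled session settings as BronzeLoader._create_spark_session.
spark = (SparkSession.builder
         .appName("BronzeReadback")
         .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
         .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
         .getOrCreate())

bronze_df = spark.read.format("delta").load("./lakehouse/bronze/crm_leads")

# The audit timestamp identifies each append-only ingestion run.
latest_ts = bronze_df.agg(F.max("_bronze_ingestion_timestamp")).collect()[0][0]
latest_batch = bronze_df.filter(F.col("_bronze_ingestion_timestamp") == latest_ts)

latest_batch.select(
    "lead_id", "email", "_bronze_source_path", "_bronze_ingestion_timestamp"
).show(truncate=False)
```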