tech-hub-skills 1.2.0 → 1.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. package/{LICENSE → .claude/LICENSE} +21 -21
  2. package/.claude/README.md +291 -0
  3. package/.claude/bin/cli.js +266 -0
  4. package/{bin → .claude/bin}/copilot.js +182 -182
  5. package/{bin → .claude/bin}/postinstall.js +42 -42
  6. package/{tech_hub_skills/skills → .claude/commands}/README.md +336 -336
  7. package/{tech_hub_skills/skills → .claude/commands}/ai-engineer.md +104 -104
  8. package/{tech_hub_skills/skills → .claude/commands}/aws.md +143 -143
  9. package/{tech_hub_skills/skills → .claude/commands}/azure.md +149 -149
  10. package/{tech_hub_skills/skills → .claude/commands}/backend-developer.md +108 -108
  11. package/{tech_hub_skills/skills → .claude/commands}/code-review.md +399 -399
  12. package/{tech_hub_skills/skills → .claude/commands}/compliance-automation.md +747 -747
  13. package/{tech_hub_skills/skills → .claude/commands}/compliance-officer.md +108 -108
  14. package/{tech_hub_skills/skills → .claude/commands}/data-engineer.md +113 -113
  15. package/{tech_hub_skills/skills → .claude/commands}/data-governance.md +102 -102
  16. package/{tech_hub_skills/skills → .claude/commands}/data-scientist.md +123 -123
  17. package/{tech_hub_skills/skills → .claude/commands}/database-admin.md +109 -109
  18. package/{tech_hub_skills/skills → .claude/commands}/devops.md +160 -160
  19. package/{tech_hub_skills/skills → .claude/commands}/docker.md +160 -160
  20. package/{tech_hub_skills/skills → .claude/commands}/enterprise-dashboard.md +613 -613
  21. package/{tech_hub_skills/skills → .claude/commands}/finops.md +184 -184
  22. package/{tech_hub_skills/skills → .claude/commands}/frontend-developer.md +108 -108
  23. package/{tech_hub_skills/skills → .claude/commands}/gcp.md +143 -143
  24. package/{tech_hub_skills/skills → .claude/commands}/ml-engineer.md +115 -115
  25. package/{tech_hub_skills/skills → .claude/commands}/mlops.md +187 -187
  26. package/{tech_hub_skills/skills → .claude/commands}/network-engineer.md +109 -109
  27. package/{tech_hub_skills/skills → .claude/commands}/optimization-advisor.md +329 -329
  28. package/{tech_hub_skills/skills → .claude/commands}/orchestrator.md +623 -623
  29. package/{tech_hub_skills/skills → .claude/commands}/platform-engineer.md +102 -102
  30. package/{tech_hub_skills/skills → .claude/commands}/process-automation.md +226 -226
  31. package/{tech_hub_skills/skills → .claude/commands}/process-changelog.md +184 -184
  32. package/{tech_hub_skills/skills → .claude/commands}/process-documentation.md +484 -484
  33. package/{tech_hub_skills/skills → .claude/commands}/process-kanban.md +324 -324
  34. package/{tech_hub_skills/skills → .claude/commands}/process-versioning.md +214 -214
  35. package/{tech_hub_skills/skills → .claude/commands}/product-designer.md +104 -104
  36. package/{tech_hub_skills/skills → .claude/commands}/project-starter.md +443 -443
  37. package/{tech_hub_skills/skills → .claude/commands}/qa-engineer.md +109 -109
  38. package/{tech_hub_skills/skills → .claude/commands}/security-architect.md +135 -135
  39. package/{tech_hub_skills/skills → .claude/commands}/sre.md +109 -109
  40. package/{tech_hub_skills/skills → .claude/commands}/system-design.md +126 -126
  41. package/{tech_hub_skills/skills → .claude/commands}/technical-writer.md +101 -101
  42. package/.claude/package.json +46 -0
  43. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/01-prompt-engineering/README.md +252 -252
  44. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_ab_tester.py +356 -0
  45. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_template_manager.py +274 -0
  46. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/token_cost_estimator.py +324 -0
  47. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/02-rag-pipeline/README.md +448 -448
  48. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/document_chunker.py +336 -0
  49. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/rag_pipeline.sql +213 -0
  50. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/03-agent-orchestration/README.md +599 -599
  51. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/04-llm-guardrails/README.md +735 -735
  52. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/05-vector-embeddings/README.md +711 -711
  53. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/06-llm-evaluation/README.md +777 -777
  54. package/{tech_hub_skills → .claude}/roles/azure/skills/01-infrastructure-fundamentals/README.md +264 -264
  55. package/{tech_hub_skills → .claude}/roles/azure/skills/02-data-factory/README.md +264 -264
  56. package/{tech_hub_skills → .claude}/roles/azure/skills/03-synapse-analytics/README.md +264 -264
  57. package/{tech_hub_skills → .claude}/roles/azure/skills/04-databricks/README.md +264 -264
  58. package/{tech_hub_skills → .claude}/roles/azure/skills/05-functions/README.md +264 -264
  59. package/{tech_hub_skills → .claude}/roles/azure/skills/06-kubernetes-service/README.md +264 -264
  60. package/{tech_hub_skills → .claude}/roles/azure/skills/07-openai-service/README.md +264 -264
  61. package/{tech_hub_skills → .claude}/roles/azure/skills/08-machine-learning/README.md +264 -264
  62. package/{tech_hub_skills → .claude}/roles/azure/skills/09-storage-adls/README.md +264 -264
  63. package/{tech_hub_skills → .claude}/roles/azure/skills/10-networking/README.md +264 -264
  64. package/{tech_hub_skills → .claude}/roles/azure/skills/11-sql-cosmos/README.md +264 -264
  65. package/{tech_hub_skills → .claude}/roles/azure/skills/12-event-hubs/README.md +264 -264
  66. package/{tech_hub_skills → .claude}/roles/code-review/skills/01-automated-code-review/README.md +394 -394
  67. package/{tech_hub_skills → .claude}/roles/code-review/skills/02-pr-review-workflow/README.md +427 -427
  68. package/{tech_hub_skills → .claude}/roles/code-review/skills/03-code-quality-gates/README.md +518 -518
  69. package/{tech_hub_skills → .claude}/roles/code-review/skills/04-reviewer-assignment/README.md +504 -504
  70. package/{tech_hub_skills → .claude}/roles/code-review/skills/05-review-analytics/README.md +540 -540
  71. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/01-lakehouse-architecture/README.md +550 -550
  72. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/bronze_ingestion.py +337 -0
  73. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/medallion_queries.sql +300 -0
  74. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/02-etl-pipeline/README.md +580 -580
  75. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/03-data-quality/README.md +579 -579
  76. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/04-streaming-pipelines/README.md +608 -608
  77. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/05-performance-optimization/README.md +547 -547
  78. package/{tech_hub_skills → .claude}/roles/data-governance/skills/01-data-catalog/README.md +112 -112
  79. package/{tech_hub_skills → .claude}/roles/data-governance/skills/02-data-lineage/README.md +129 -129
  80. package/{tech_hub_skills → .claude}/roles/data-governance/skills/03-data-quality-framework/README.md +182 -182
  81. package/{tech_hub_skills → .claude}/roles/data-governance/skills/04-access-control/README.md +39 -39
  82. package/{tech_hub_skills → .claude}/roles/data-governance/skills/05-master-data-management/README.md +40 -40
  83. package/{tech_hub_skills → .claude}/roles/data-governance/skills/06-compliance-privacy/README.md +46 -46
  84. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/01-eda-automation/README.md +230 -230
  85. package/.claude/roles/data-scientist/skills/01-eda-automation/eda_generator.py +446 -0
  86. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/02-statistical-modeling/README.md +264 -264
  87. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/03-feature-engineering/README.md +264 -264
  88. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/04-predictive-modeling/README.md +264 -264
  89. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/05-customer-analytics/README.md +264 -264
  90. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/06-campaign-analysis/README.md +264 -264
  91. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/07-experimentation/README.md +264 -264
  92. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/08-data-visualization/README.md +264 -264
  93. package/{tech_hub_skills → .claude}/roles/devops/skills/01-cicd-pipeline/README.md +264 -264
  94. package/{tech_hub_skills → .claude}/roles/devops/skills/02-container-orchestration/README.md +264 -264
  95. package/{tech_hub_skills → .claude}/roles/devops/skills/03-infrastructure-as-code/README.md +264 -264
  96. package/{tech_hub_skills → .claude}/roles/devops/skills/04-gitops/README.md +264 -264
  97. package/{tech_hub_skills → .claude}/roles/devops/skills/05-environment-management/README.md +264 -264
  98. package/{tech_hub_skills → .claude}/roles/devops/skills/06-automated-testing/README.md +264 -264
  99. package/{tech_hub_skills → .claude}/roles/devops/skills/07-release-management/README.md +264 -264
  100. package/{tech_hub_skills → .claude}/roles/devops/skills/08-monitoring-alerting/README.md +264 -264
  101. package/{tech_hub_skills → .claude}/roles/devops/skills/09-devsecops/README.md +265 -265
  102. package/{tech_hub_skills → .claude}/roles/finops/skills/01-cost-visibility/README.md +264 -264
  103. package/{tech_hub_skills → .claude}/roles/finops/skills/02-resource-tagging/README.md +264 -264
  104. package/{tech_hub_skills → .claude}/roles/finops/skills/03-budget-management/README.md +264 -264
  105. package/{tech_hub_skills → .claude}/roles/finops/skills/04-reserved-instances/README.md +264 -264
  106. package/{tech_hub_skills → .claude}/roles/finops/skills/05-spot-optimization/README.md +264 -264
  107. package/{tech_hub_skills → .claude}/roles/finops/skills/06-storage-tiering/README.md +264 -264
  108. package/{tech_hub_skills → .claude}/roles/finops/skills/07-compute-rightsizing/README.md +264 -264
  109. package/{tech_hub_skills → .claude}/roles/finops/skills/08-chargeback/README.md +264 -264
  110. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/01-mlops-pipeline/README.md +566 -566
  111. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/02-feature-engineering/README.md +655 -655
  112. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/03-model-training/README.md +704 -704
  113. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/04-model-serving/README.md +845 -845
  114. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/05-model-monitoring/README.md +874 -874
  115. package/{tech_hub_skills → .claude}/roles/mlops/skills/01-ml-pipeline-orchestration/README.md +264 -264
  116. package/{tech_hub_skills → .claude}/roles/mlops/skills/02-experiment-tracking/README.md +264 -264
  117. package/{tech_hub_skills → .claude}/roles/mlops/skills/03-model-registry/README.md +264 -264
  118. package/{tech_hub_skills → .claude}/roles/mlops/skills/04-feature-store/README.md +264 -264
  119. package/{tech_hub_skills → .claude}/roles/mlops/skills/05-model-deployment/README.md +264 -264
  120. package/{tech_hub_skills → .claude}/roles/mlops/skills/06-model-observability/README.md +264 -264
  121. package/{tech_hub_skills → .claude}/roles/mlops/skills/07-data-versioning/README.md +264 -264
  122. package/{tech_hub_skills → .claude}/roles/mlops/skills/08-ab-testing/README.md +264 -264
  123. package/{tech_hub_skills → .claude}/roles/mlops/skills/09-automated-retraining/README.md +264 -264
  124. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/01-internal-developer-platform/README.md +153 -153
  125. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/02-self-service-infrastructure/README.md +57 -57
  126. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/03-slo-sli-management/README.md +59 -59
  127. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/04-developer-experience/README.md +57 -57
  128. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/05-incident-management/README.md +73 -73
  129. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/06-capacity-management/README.md +59 -59
  130. package/{tech_hub_skills → .claude}/roles/product-designer/skills/01-requirements-discovery/README.md +407 -407
  131. package/{tech_hub_skills → .claude}/roles/product-designer/skills/02-user-research/README.md +382 -382
  132. package/{tech_hub_skills → .claude}/roles/product-designer/skills/03-brainstorming-ideation/README.md +437 -437
  133. package/{tech_hub_skills → .claude}/roles/product-designer/skills/04-ux-design/README.md +496 -496
  134. package/{tech_hub_skills → .claude}/roles/product-designer/skills/05-product-market-fit/README.md +376 -376
  135. package/{tech_hub_skills → .claude}/roles/product-designer/skills/06-stakeholder-management/README.md +412 -412
  136. package/{tech_hub_skills → .claude}/roles/security-architect/skills/01-pii-detection/README.md +319 -319
  137. package/{tech_hub_skills → .claude}/roles/security-architect/skills/02-threat-modeling/README.md +264 -264
  138. package/{tech_hub_skills → .claude}/roles/security-architect/skills/03-infrastructure-security/README.md +264 -264
  139. package/{tech_hub_skills → .claude}/roles/security-architect/skills/04-iam/README.md +264 -264
  140. package/{tech_hub_skills → .claude}/roles/security-architect/skills/05-application-security/README.md +264 -264
  141. package/{tech_hub_skills → .claude}/roles/security-architect/skills/06-secrets-management/README.md +264 -264
  142. package/{tech_hub_skills → .claude}/roles/security-architect/skills/07-security-monitoring/README.md +264 -264
  143. package/{tech_hub_skills → .claude}/roles/system-design/skills/01-architecture-patterns/README.md +337 -337
  144. package/{tech_hub_skills → .claude}/roles/system-design/skills/02-requirements-engineering/README.md +264 -264
  145. package/{tech_hub_skills → .claude}/roles/system-design/skills/03-scalability/README.md +264 -264
  146. package/{tech_hub_skills → .claude}/roles/system-design/skills/04-high-availability/README.md +264 -264
  147. package/{tech_hub_skills → .claude}/roles/system-design/skills/05-cost-optimization-design/README.md +264 -264
  148. package/{tech_hub_skills → .claude}/roles/system-design/skills/06-api-design/README.md +264 -264
  149. package/{tech_hub_skills → .claude}/roles/system-design/skills/07-observability-architecture/README.md +264 -264
  150. package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/PROCESS_TEMPLATE.md +336 -336
  151. package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/README.md +521 -521
  152. package/.claude/roles/system-design/skills/08-process-automation/ai_prompt_generator.py +744 -0
  153. package/.claude/roles/system-design/skills/08-process-automation/automation_recommender.py +688 -0
  154. package/.claude/roles/system-design/skills/08-process-automation/plan_generator.py +679 -0
  155. package/.claude/roles/system-design/skills/08-process-automation/process_analyzer.py +528 -0
  156. package/.claude/roles/system-design/skills/08-process-automation/process_parser.py +684 -0
  157. package/.claude/roles/system-design/skills/08-process-automation/role_matcher.py +615 -0
  158. package/.claude/skills/README.md +336 -0
  159. package/.claude/skills/ai-engineer.md +104 -0
  160. package/.claude/skills/aws.md +143 -0
  161. package/.claude/skills/azure.md +149 -0
  162. package/.claude/skills/backend-developer.md +108 -0
  163. package/.claude/skills/code-review.md +399 -0
  164. package/.claude/skills/compliance-automation.md +747 -0
  165. package/.claude/skills/compliance-officer.md +108 -0
  166. package/.claude/skills/data-engineer.md +113 -0
  167. package/.claude/skills/data-governance.md +102 -0
  168. package/.claude/skills/data-scientist.md +123 -0
  169. package/.claude/skills/database-admin.md +109 -0
  170. package/.claude/skills/devops.md +160 -0
  171. package/.claude/skills/docker.md +160 -0
  172. package/.claude/skills/enterprise-dashboard.md +613 -0
  173. package/.claude/skills/finops.md +184 -0
  174. package/.claude/skills/frontend-developer.md +108 -0
  175. package/.claude/skills/gcp.md +143 -0
  176. package/.claude/skills/ml-engineer.md +115 -0
  177. package/.claude/skills/mlops.md +187 -0
  178. package/.claude/skills/network-engineer.md +109 -0
  179. package/.claude/skills/optimization-advisor.md +329 -0
  180. package/.claude/skills/orchestrator.md +623 -0
  181. package/.claude/skills/platform-engineer.md +102 -0
  182. package/.claude/skills/process-automation.md +226 -0
  183. package/.claude/skills/process-changelog.md +184 -0
  184. package/.claude/skills/process-documentation.md +484 -0
  185. package/.claude/skills/process-kanban.md +324 -0
  186. package/.claude/skills/process-versioning.md +214 -0
  187. package/.claude/skills/product-designer.md +104 -0
  188. package/.claude/skills/project-starter.md +443 -0
  189. package/.claude/skills/qa-engineer.md +109 -0
  190. package/.claude/skills/security-architect.md +135 -0
  191. package/.claude/skills/sre.md +109 -0
  192. package/.claude/skills/system-design.md +126 -0
  193. package/.claude/skills/technical-writer.md +101 -0
  194. package/.gitattributes +2 -0
  195. package/GITHUB_COPILOT.md +106 -0
  196. package/README.md +192 -291
  197. package/package.json +16 -46
  198. package/bin/cli.js +0 -241
package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/bronze_ingestion.py
@@ -0,0 +1,337 @@
+ """
+ Bronze Layer: Raw Data Ingestion
+ Ingest data from multiple sources with validation and error handling.
+ """
+
+ import json
+ import os
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Dict, Any, List, Optional, Union
+ import pandas as pd
+ from pyspark.sql import SparkSession, DataFrame
+ from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
+ from pyspark.sql import functions as F
+ import logging
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+
+ class BronzeLoader:
+     """
+     Bronze layer ingestion with schema validation and audit logging.
+
+     Bronze layer principles:
+     - Append-only (preserve full history)
+     - Raw data with minimal transformation
+     - Add metadata (ingestion timestamp, source, file name)
+     - Schema validation
+     - Error quarantine
+     """
+
+     def __init__(
+         self,
+         spark: Optional[SparkSession] = None,
+         bronze_path: str = "/lakehouse/bronze",
+         quarantine_path: str = "/lakehouse/quarantine"
+     ):
+         """
+         Initialize Bronze loader.
+
+         Args:
+             spark: SparkSession (creates one if not provided)
+             bronze_path: Path to bronze layer storage
+             quarantine_path: Path for invalid records
+         """
+         self.spark = spark or self._create_spark_session()
+         self.bronze_path = bronze_path
+         self.quarantine_path = quarantine_path
+
+         # Create directories if they don't exist
+         Path(bronze_path).mkdir(parents=True, exist_ok=True)
+         Path(quarantine_path).mkdir(parents=True, exist_ok=True)
+
+     def _create_spark_session(self) -> SparkSession:
+         """Create Spark session with Delta Lake support."""
+         return SparkSession.builder \
+             .appName("BronzeIngestion") \
+             .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
+             .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
+             .config("spark.databricks.delta.retentionDurationCheck.enabled", "false") \
+             .getOrCreate()
+
+     def ingest_from_source(
+         self,
+         source_path: str,
+         table_name: str,
+         source_format: str = "json",
+         schema: Optional[StructType] = None,
+         options: Optional[Dict[str, str]] = None
+     ) -> Dict[str, Any]:
+         """
+         Ingest data from source into Bronze layer.
+
+         Args:
+             source_path: Path to source data
+             table_name: Name for bronze table
+             source_format: Format (json, csv, parquet, etc.)
+             schema: Optional schema to enforce
+             options: Additional read options
+
+         Returns:
+             Ingestion metrics
+         """
+         logger.info(f"Starting ingestion: {table_name} from {source_path}")
+
+         try:
+             # Read source data
+             df = self._read_source(source_path, source_format, schema, options)
+
+             # Add bronze layer metadata
+             df_bronze = self._add_bronze_metadata(df, source_path, table_name)
+
+             # Validate schema if provided
+             if schema:
+                 df_bronze = self._validate_schema(df_bronze, schema)
+
+             # Write to bronze layer
+             bronze_table_path = f"{self.bronze_path}/{table_name}"
+
+             df_bronze.write \
+                 .format("delta") \
+                 .mode("append") \
+                 .option("mergeSchema", "true") \
+                 .save(bronze_table_path)
+
+             # Collect metrics
+             record_count = df_bronze.count()
+
+             metrics = {
+                 "status": "success",
+                 "table_name": table_name,
+                 "records_ingested": record_count,
+                 "source_path": source_path,
+                 "ingestion_timestamp": datetime.now().isoformat(),
+                 "bronze_path": bronze_table_path
+             }
+
+             logger.info(f"✅ Successfully ingested {record_count} records to {table_name}")
+
+             return metrics
+
+         except Exception as e:
+             logger.error(f"❌ Ingestion failed: {str(e)}")
+
+             return {
+                 "status": "failed",
+                 "table_name": table_name,
+                 "error": str(e),
+                 "ingestion_timestamp": datetime.now().isoformat()
+             }
+
+     def _read_source(
+         self,
+         source_path: str,
+         source_format: str,
+         schema: Optional[StructType] = None,
+         options: Optional[Dict[str, str]] = None
+     ) -> DataFrame:
+         """Read data from source."""
+         options = options or {}
+
+         reader = self.spark.read.format(source_format)
+
+         if schema:
+             reader = reader.schema(schema)
+
+         for key, value in options.items():
+             reader = reader.option(key, value)
+
+         return reader.load(source_path)
+
+     def _add_bronze_metadata(
+         self,
+         df: DataFrame,
+         source_path: str,
+         table_name: str
+     ) -> DataFrame:
+         """Add bronze layer audit columns."""
+         return df \
+             .withColumn("_bronze_ingestion_timestamp", F.current_timestamp()) \
+             .withColumn("_bronze_source_path", F.lit(source_path)) \
+             .withColumn("_bronze_table_name", F.lit(table_name)) \
+             .withColumn("_bronze_ingestion_date", F.current_date())
+
+     def _validate_schema(
+         self,
+         df: DataFrame,
+         expected_schema: StructType
+     ) -> DataFrame:
+         """
+         Validate DataFrame against expected schema.
+
+         Quarantine records that don't match schema.
+         """
+         # In production, implement sophisticated schema validation
+         # For now, we return the df as-is
+         return df
+
+     def ingest_csv(
+         self,
+         csv_path: str,
+         table_name: str,
+         delimiter: str = ",",
+         header: bool = True,
+         schema: Optional[StructType] = None
+     ) -> Dict[str, Any]:
+         """Convenience method for CSV ingestion."""
+         options = {
+             "delimiter": delimiter,
+             "header": str(header).lower(),
+             "inferSchema": "true" if schema is None else "false"
+         }
+
+         return self.ingest_from_source(
+             source_path=csv_path,
+             table_name=table_name,
+             source_format="csv",
+             schema=schema,
+             options=options
+         )
+
+     def ingest_json(
+         self,
+         json_path: str,
+         table_name: str,
+         multiline: bool = False,
+         schema: Optional[StructType] = None
+     ) -> Dict[str, Any]:
+         """Convenience method for JSON ingestion."""
+         options = {
+             "multiLine": str(multiline).lower()
+         }
+
+         return self.ingest_from_source(
+             source_path=json_path,
+             table_name=table_name,
+             source_format="json",
+             schema=schema,
+             options=options
+         )
+
+     def ingest_parquet(
+         self,
+         parquet_path: str,
+         table_name: str
+     ) -> Dict[str, Any]:
+         """Convenience method for Parquet ingestion."""
+         return self.ingest_from_source(
+             source_path=parquet_path,
+             table_name=table_name,
+             source_format="parquet"
+         )
+
+     def create_bronze_table(
+         self,
+         table_name: str,
+         schema: StructType,
+         partition_by: Optional[List[str]] = None
+     ) -> None:
+         """Create an empty bronze table with schema."""
+         bronze_table_path = f"{self.bronze_path}/{table_name}"
+
+         # Create empty DataFrame with schema
+         empty_df = self.spark.createDataFrame([], schema)
+
+         # Add bronze metadata columns
+         bronze_df = self._add_bronze_metadata(empty_df, "initialized", table_name)
+
+         # Write table
+         writer = bronze_df.write.format("delta").mode("overwrite")
+
+         if partition_by:
+             writer = writer.partitionBy(*partition_by)
+
+         writer.save(bronze_table_path)
+
+         logger.info(f"✅ Created bronze table: {table_name}")
+
+
+ # Example CRM schema
+ CRM_LEADS_SCHEMA = StructType([
+     StructField("lead_id", StringType(), False),
+     StructField("email", StringType(), True),
+     StructField("company", StringType(), True),
+     StructField("industry", StringType(), True),
+     StructField("company_size", StringType(), True),
+     StructField("job_title", StringType(), True),
+     StructField("lead_source", StringType(), True),
+     StructField("created_date", TimestampType(), True),
+     StructField("lead_score", IntegerType(), True),
+     StructField("status", StringType(), True)
+ ])
+
+
+ # Example usage
+ if __name__ == "__main__":
+     print("=" * 80)
+     print("Bronze Layer Ingestion Demo")
+     print("=" * 80)
+
+     # Create sample data
+     sample_data = [
+         {
+             "lead_id": "L001",
+             "email": "john@techcorp.com",
+             "company": "TechCorp",
+             "industry": "Software",
+             "company_size": "100-500",
+             "job_title": "Data Scientist",
+             "lead_source": "Website",
+             "created_date": "2025-01-15T10:30:00",
+             "lead_score": 85,
+             "status": "New"
+         },
+         {
+             "lead_id": "L002",
+             "email": "sarah@datainc.com",
+             "company": "Data Inc",
+             "industry": "Analytics",
+             "company_size": "50-100",
+             "job_title": "ML Engineer",
+             "lead_source": "LinkedIn",
+             "created_date": "2025-01-16T14:20:00",
+             "lead_score": 92,
+             "status": "Qualified"
+         }
+     ]
+
+     # Save as JSON
+     sample_path = "/tmp/sample_crm_leads.json"
+     with open(sample_path, 'w') as f:
+         json.dump(sample_data, f)
+
+     # Initialize Bronze loader
+     bronze = BronzeLoader(
+         bronze_path="./lakehouse/bronze",
+         quarantine_path="./lakehouse/quarantine"
+     )
+
+     # Ingest data
+     metrics = bronze.ingest_json(
+         json_path=sample_path,
+         table_name="crm_leads",
+         multiline=True,
+         schema=CRM_LEADS_SCHEMA
+     )
+
+     print("\n📊 Ingestion Metrics:")
+     print(json.dumps(metrics, indent=2))
+
+     # Query bronze table
+     print("\n📋 Bronze Table Sample:")
+     bronze_df = bronze.spark.read.format("delta").load("./lakehouse/bronze/crm_leads")
+     bronze_df.show(truncate=False)
+
+     print(f"\nBronze table row count: {bronze_df.count()}")
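Note on the file above: `_validate_schema` is a placeholder (it returns the DataFrame unchanged), and `quarantine_path` is created but never written to. A minimal sketch of how the quarantine step could be filled in, reusing the loader's own attributes and imports; this is illustrative, not part of the released package:

    # Sketch: a possible _validate_schema body that actually quarantines bad rows.
    # Assumes self.quarantine_path and the Delta-enabled session from BronzeLoader.__init__.
    def _validate_schema(self, df: DataFrame, expected_schema: StructType) -> DataFrame:
        """Quarantine rows missing required (non-nullable) fields; return the valid rows."""
        required = [f.name for f in expected_schema.fields
                    if not f.nullable and f.name in df.columns]
        if not required:
            return df
        # A row is invalid if any required column is NULL
        invalid_cond = None
        for col in required:
            cond = F.col(col).isNull()
            invalid_cond = cond if invalid_cond is None else (invalid_cond | cond)
        invalid_df = df.filter(invalid_cond)
        if invalid_df.limit(1).count() > 0:
            invalid_df.write.format("delta").mode("append").save(self.quarantine_path)
        return df.filter(~invalid_cond)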
package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/medallion_queries.sql
@@ -0,0 +1,300 @@
+ -- Medallion Architecture SQL Patterns
+ -- Bronze → Silver → Gold transformations for Data Lakehouse
+
+ -- ================================================================
+ -- BRONZE LAYER - Raw Data Ingestion
+ -- ================================================================
+
+ -- View bronze layer with metadata
+ SELECT
+     *,
+     _bronze_ingestion_timestamp,
+     _bronze_source_path,
+     _bronze_table_name
+ FROM bronze.crm_leads
+ WHERE _bronze_ingestion_date >= CURRENT_DATE - INTERVAL '7 days'
+ ORDER BY _bronze_ingestion_timestamp DESC;
+
+ -- Check for duplicate records in bronze
+ SELECT
+     lead_id,
+     COUNT(*) as duplicate_count,
+     MIN(_bronze_ingestion_timestamp) as first_seen,
+     MAX(_bronze_ingestion_timestamp) as last_seen
+ FROM bronze.crm_leads
+ GROUP BY lead_id
+ HAVING COUNT(*) > 1;
+
+ -- Bronze layer data quality check
+ SELECT
+     _bronze_ingestion_date,
+     COUNT(*) as total_records,
+     COUNT(DISTINCT lead_id) as unique_leads,
+     COUNT(*) - COUNT(DISTINCT lead_id) as duplicates,
+     COUNT(CASE WHEN email IS NULL THEN 1 END) as missing_email,
+     COUNT(CASE WHEN company IS NULL THEN 1 END) as missing_company
+ FROM bronze.crm_leads
+ GROUP BY _bronze_ingestion_date
+ ORDER BY _bronze_ingestion_date DESC;
+
+ -- ================================================================
+ -- SILVER LAYER - Cleaned & Standardized
+ -- ================================================================
+
+ -- Transform Bronze → Silver (Deduplication & Cleaning)
+ CREATE OR REPLACE TABLE silver.crm_leads_clean AS
+ WITH deduplicated AS (
+     SELECT *,
+         ROW_NUMBER() OVER (
+             PARTITION BY lead_id
+             ORDER BY _bronze_ingestion_timestamp DESC
+         ) as rn
+     FROM bronze.crm_leads
+ ),
+ cleaned AS (
+     SELECT
+         lead_id,
+         LOWER(TRIM(email)) as email,
+         TRIM(company) as company,
+         UPPER(industry) as industry,
+         company_size,
+         job_title,
+         lead_source,
+         created_date,
+         COALESCE(lead_score, 0) as lead_score,
+         UPPER(status) as status,
+         -- Silver metadata
+         CURRENT_TIMESTAMP() as _silver_processed_timestamp,
+         _bronze_ingestion_timestamp as _bronze_ingestion_timestamp
+     FROM deduplicated
+     WHERE rn = 1              -- Keep only most recent version
+       AND email IS NOT NULL   -- Basic validation
+       AND email LIKE '%@%'    -- Email format check
+ )
+ SELECT * FROM cleaned;
+
+ -- Silver layer quality metrics
+ SELECT
+     COUNT(*) as total_records,
+     COUNT(DISTINCT email) as unique_emails,
+     COUNT(CASE WHEN lead_score >= 80 THEN 1 END) as high_score_leads,
+     AVG(lead_score) as avg_lead_score,
+     COUNT(DISTINCT industry) as unique_industries,
+     COUNT(DISTINCT company) as unique_companies,
+     MAX(_silver_processed_timestamp) as last_processed
+ FROM silver.crm_leads_clean;
+
+ -- Schema drift detection (Silver)
+ SELECT
+     column_name,
+     data_type,
+     is_nullable,
+     COUNT(*) OVER () as total_columns
+ FROM information_schema.columns
+ WHERE table_schema = 'silver'
+   AND table_name = 'crm_leads_clean'
+ ORDER BY ordinal_position;
+
+ -- ================================================================
+ -- GOLD LAYER - Business Logic & Aggregations
+ -- ================================================================
+
+ -- Transform Silver → Gold (Lead Segmentation)
+ CREATE OR REPLACE TABLE gold.lead_segments AS
+ SELECT
+     lead_id,
+     email,
+     company,
+     industry,
+     company_size,
+     job_title,
+     lead_source,
+     created_date,
+     lead_score,
+     status,
+     -- Business logic: Lead segment
+     CASE
+         WHEN lead_score >= 90 THEN 'HOT'
+         WHEN lead_score >= 70 THEN 'WARM'
+         WHEN lead_score >= 50 THEN 'QUALIFIED'
+         ELSE 'COLD'
+     END as lead_segment,
+     -- Seniority level from job title
+     CASE
+         WHEN UPPER(job_title) LIKE '%VP%' OR UPPER(job_title) LIKE '%VICE PRESIDENT%' THEN 'VP+'
+         WHEN UPPER(job_title) LIKE '%DIRECTOR%' THEN 'Director'
+         WHEN UPPER(job_title) LIKE '%MANAGER%' THEN 'Manager'
+         WHEN UPPER(job_title) LIKE '%SENIOR%' OR UPPER(job_title) LIKE '%SR%' THEN 'Senior IC'
+         ELSE 'IC'
+     END as seniority_level,
+     -- Company size category
+     CASE
+         WHEN company_size IN ('1000+', '500-1000') THEN 'Enterprise'
+         WHEN company_size IN ('100-500', '50-100') THEN 'Mid-Market'
+         ELSE 'SMB'
+     END as company_category,
+     -- Days since creation
+     DATEDIFF(CURRENT_DATE, created_date) as days_since_created,
+     -- Gold metadata
+     CURRENT_TIMESTAMP() as _gold_created_timestamp
+ FROM silver.crm_leads_clean;
+
+ -- Gold Layer: Daily Lead Metrics
+ CREATE OR REPLACE TABLE gold.daily_lead_metrics AS
+ SELECT
+     DATE(created_date) as metric_date,
+     lead_source,
+     lead_segment,
+     company_category,
+     COUNT(*) as lead_count,
+     AVG(lead_score) as avg_lead_score,
+     COUNT(CASE WHEN lead_segment = 'HOT' THEN 1 END) as hot_leads,
+     COUNT(CASE WHEN status = 'QUALIFIED' THEN 1 END) as qualified_leads,
+     COUNT(DISTINCT company) as unique_companies,
+     COUNT(DISTINCT industry) as unique_industries
+ FROM gold.lead_segments
+ GROUP BY
+     DATE(created_date),
+     lead_source,
+     lead_segment,
+     company_category;
+
+ -- Gold Layer: Lead Source Performance
+ CREATE OR REPLACE VIEW gold.lead_source_performance AS
+ SELECT
+     lead_source,
+     COUNT(*) as total_leads,
+     AVG(lead_score) as avg_score,
+     COUNT(CASE WHEN lead_segment = 'HOT' THEN 1 END) as hot_leads,
+     COUNT(CASE WHEN lead_segment IN ('HOT', 'WARM') THEN 1 END) as quality_leads,
+     ROUND(100.0 * COUNT(CASE WHEN lead_segment IN ('HOT', 'WARM') THEN 1 END) / COUNT(*), 2) as quality_rate,
+     COUNT(DISTINCT company) as unique_companies,
+     MAX(created_date) as latest_lead_date,
+     DATEDIFF(CURRENT_DATE, MAX(created_date)) as days_since_last_lead
+ FROM gold.lead_segments
+ GROUP BY lead_source
+ ORDER BY quality_rate DESC;
+
+ -- Gold Layer: Industry Analysis
+ CREATE OR REPLACE VIEW gold.industry_analysis AS
+ SELECT
+     industry,
+     company_category,
+     COUNT(*) as lead_count,
+     AVG(lead_score) as avg_lead_score,
+     COUNT(CASE WHEN lead_segment = 'HOT' THEN 1 END) as hot_leads,
+     COUNT(CASE WHEN seniority_level IN ('VP+', 'Director') THEN 1 END) as senior_decision_makers,
+     COUNT(DISTINCT company) as unique_companies,
+     ROUND(AVG(days_since_created), 1) as avg_age_days
+ FROM gold.lead_segments
+ GROUP BY industry, company_category
+ HAVING COUNT(*) >= 10
+ ORDER BY hot_leads DESC, avg_lead_score DESC;
+
+ -- ================================================================
+ -- INCREMENTAL PROCESSING PATTERNS
+ -- ================================================================
+
+ -- Incremental load: Bronze to Silver (only new/updated records)
+ MERGE INTO silver.crm_leads_clean AS target
+ USING (
+     SELECT
+         lead_id,
+         LOWER(TRIM(email)) as email,
+         TRIM(company) as company,
+         UPPER(industry) as industry,
+         company_size,
+         job_title,
+         lead_source,
+         created_date,
+         COALESCE(lead_score, 0) as lead_score,
+         UPPER(status) as status,
+         _bronze_ingestion_timestamp
+     FROM (
+         SELECT *,
+             ROW_NUMBER() OVER (
+                 PARTITION BY lead_id
+                 ORDER BY _bronze_ingestion_timestamp DESC
+             ) as rn
+         FROM bronze.crm_leads
+         WHERE _bronze_ingestion_timestamp > (
+             SELECT COALESCE(MAX(_bronze_ingestion_timestamp), '1900-01-01')
+             FROM silver.crm_leads_clean
+         )
+     )
+     WHERE rn = 1
+       AND email IS NOT NULL
+       AND email LIKE '%@%'
+ ) AS source
+ ON target.lead_id = source.lead_id
+ WHEN MATCHED THEN
+     UPDATE SET
+         email = source.email,
+         company = source.company,
+         industry = source.industry,
+         company_size = source.company_size,
+         job_title = source.job_title,
+         lead_source = source.lead_source,
+         created_date = source.created_date,
+         lead_score = source.lead_score,
+         status = source.status,
+         _silver_processed_timestamp = CURRENT_TIMESTAMP(),
+         _bronze_ingestion_timestamp = source._bronze_ingestion_timestamp
+ WHEN NOT MATCHED THEN
+     INSERT (
+         lead_id, email, company, industry, company_size,
+         job_title, lead_source, created_date, lead_score, status,
+         _silver_processed_timestamp, _bronze_ingestion_timestamp
+     )
+     VALUES (
+         source.lead_id, source.email, source.company, source.industry,
+         source.company_size, source.job_title, source.lead_source,
+         source.created_date, source.lead_score, source.status,
+         CURRENT_TIMESTAMP(), source._bronze_ingestion_timestamp
+     );
+
+ -- ================================================================
+ -- DATA QUALITY MONITORING
+ -- ================================================================
+
+ -- Cross-layer data quality dashboard
+ SELECT
+     'Bronze' as layer,
+     COUNT(*) as record_count,
+     COUNT(DISTINCT lead_id) as unique_ids,
+     MAX(_bronze_ingestion_timestamp) as last_update
+ FROM bronze.crm_leads
+
+ UNION ALL
+
+ SELECT
+     'Silver' as layer,
+     COUNT(*) as record_count,
+     COUNT(DISTINCT lead_id) as unique_ids,
+     MAX(_silver_processed_timestamp) as last_update
+ FROM silver.crm_leads_clean
+
+ UNION ALL
+
+ SELECT
+     'Gold' as layer,
+     COUNT(*) as record_count,
+     COUNT(DISTINCT lead_id) as unique_ids,
+     MAX(_gold_created_timestamp) as last_update
+ FROM gold.lead_segments;
+
+ -- ================================================================
+ -- PERFORMANCE OPTIMIZATION
+ -- ================================================================
+
+ -- Optimize Silver table (Vacuum + Optimize)
+ -- OPTIMIZE silver.crm_leads_clean ZORDER BY (lead_id, created_date);
+ -- VACUUM silver.crm_leads_clean RETAIN 168 HOURS; -- 7 days
+
+ -- Optimize Gold table
+ -- OPTIMIZE gold.lead_segments ZORDER BY (lead_segment, created_date, company_category);
+ -- VACUUM gold.lead_segments RETAIN 168 HOURS;
+
+ -- Table statistics for query optimization
+ -- ANALYZE TABLE silver.crm_leads_clean COMPUTE STATISTICS;
+ -- ANALYZE TABLE gold.lead_segments COMPUTE STATISTICS FOR ALL COLUMNS;
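The OPTIMIZE / VACUUM / ANALYZE statements at the end of medallion_queries.sql ship commented out. If you do run them, one option is to issue them from the same Delta-enabled Spark session that the ingestion module creates; the snippet below is a sketch, not part of the package, and assumes the silver table is registered in the metastore under the name used in the SQL file:

    # Sketch: running the commented-out maintenance statements via spark.sql.
    from bronze_ingestion import BronzeLoader  # assumes the module is importable

    loader = BronzeLoader()
    for stmt in (
        "OPTIMIZE silver.crm_leads_clean ZORDER BY (lead_id, created_date)",
        "VACUUM silver.crm_leads_clean RETAIN 168 HOURS",   # keep 7 days of history
        "ANALYZE TABLE silver.crm_leads_clean COMPUTE STATISTICS",
    ):
        loader.spark.sql(stmt)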