tech-hub-skills 1.5.1 → 1.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197)
  1. package/.claude/LICENSE +21 -21
  2. package/.claude/README.md +291 -291
  3. package/.claude/bin/cli.js +266 -266
  4. package/.claude/bin/copilot.js +182 -182
  5. package/.claude/bin/postinstall.js +42 -42
  6. package/.claude/commands/README.md +336 -336
  7. package/.claude/commands/ai-engineer.md +104 -104
  8. package/.claude/commands/aws.md +143 -143
  9. package/.claude/commands/azure.md +149 -149
  10. package/.claude/commands/backend-developer.md +108 -108
  11. package/.claude/commands/code-review.md +399 -399
  12. package/.claude/commands/compliance-automation.md +747 -747
  13. package/.claude/commands/compliance-officer.md +108 -108
  14. package/.claude/commands/data-engineer.md +113 -113
  15. package/.claude/commands/data-governance.md +102 -102
  16. package/.claude/commands/data-scientist.md +123 -123
  17. package/.claude/commands/database-admin.md +109 -109
  18. package/.claude/commands/devops.md +160 -160
  19. package/.claude/commands/docker.md +160 -160
  20. package/.claude/commands/enterprise-dashboard.md +613 -613
  21. package/.claude/commands/finops.md +184 -184
  22. package/.claude/commands/frontend-developer.md +108 -108
  23. package/.claude/commands/gcp.md +143 -143
  24. package/.claude/commands/ml-engineer.md +115 -115
  25. package/.claude/commands/mlops.md +187 -187
  26. package/.claude/commands/network-engineer.md +109 -109
  27. package/.claude/commands/optimization-advisor.md +329 -329
  28. package/.claude/commands/orchestrator.md +623 -623
  29. package/.claude/commands/platform-engineer.md +102 -102
  30. package/.claude/commands/process-automation.md +226 -226
  31. package/.claude/commands/process-changelog.md +184 -184
  32. package/.claude/commands/process-documentation.md +484 -484
  33. package/.claude/commands/process-kanban.md +324 -324
  34. package/.claude/commands/process-versioning.md +214 -214
  35. package/.claude/commands/product-designer.md +104 -104
  36. package/.claude/commands/project-starter.md +443 -443
  37. package/.claude/commands/qa-engineer.md +109 -109
  38. package/.claude/commands/security-architect.md +135 -135
  39. package/.claude/commands/sre.md +109 -109
  40. package/.claude/commands/system-design.md +126 -126
  41. package/.claude/commands/technical-writer.md +101 -101
  42. package/.claude/package.json +46 -46
  43. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/README.md +252 -252
  44. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_ab_tester.py +356 -356
  45. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_template_manager.py +274 -274
  46. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/token_cost_estimator.py +324 -324
  47. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/README.md +448 -448
  48. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/document_chunker.py +336 -336
  49. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/rag_pipeline.sql +213 -213
  50. package/.claude/roles/ai-engineer/skills/03-agent-orchestration/README.md +599 -599
  51. package/.claude/roles/ai-engineer/skills/04-llm-guardrails/README.md +735 -735
  52. package/.claude/roles/ai-engineer/skills/05-vector-embeddings/README.md +711 -711
  53. package/.claude/roles/ai-engineer/skills/06-llm-evaluation/README.md +777 -777
  54. package/.claude/roles/azure/skills/01-infrastructure-fundamentals/README.md +264 -264
  55. package/.claude/roles/azure/skills/02-data-factory/README.md +264 -264
  56. package/.claude/roles/azure/skills/03-synapse-analytics/README.md +264 -264
  57. package/.claude/roles/azure/skills/04-databricks/README.md +264 -264
  58. package/.claude/roles/azure/skills/05-functions/README.md +264 -264
  59. package/.claude/roles/azure/skills/06-kubernetes-service/README.md +264 -264
  60. package/.claude/roles/azure/skills/07-openai-service/README.md +264 -264
  61. package/.claude/roles/azure/skills/08-machine-learning/README.md +264 -264
  62. package/.claude/roles/azure/skills/09-storage-adls/README.md +264 -264
  63. package/.claude/roles/azure/skills/10-networking/README.md +264 -264
  64. package/.claude/roles/azure/skills/11-sql-cosmos/README.md +264 -264
  65. package/.claude/roles/azure/skills/12-event-hubs/README.md +264 -264
  66. package/.claude/roles/code-review/skills/01-automated-code-review/README.md +394 -394
  67. package/.claude/roles/code-review/skills/02-pr-review-workflow/README.md +427 -427
  68. package/.claude/roles/code-review/skills/03-code-quality-gates/README.md +518 -518
  69. package/.claude/roles/code-review/skills/04-reviewer-assignment/README.md +504 -504
  70. package/.claude/roles/code-review/skills/05-review-analytics/README.md +540 -540
  71. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/README.md +550 -550
  72. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/bronze_ingestion.py +337 -337
  73. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/medallion_queries.sql +300 -300
  74. package/.claude/roles/data-engineer/skills/02-etl-pipeline/README.md +580 -580
  75. package/.claude/roles/data-engineer/skills/03-data-quality/README.md +579 -579
  76. package/.claude/roles/data-engineer/skills/04-streaming-pipelines/README.md +608 -608
  77. package/.claude/roles/data-engineer/skills/05-performance-optimization/README.md +547 -547
  78. package/.claude/roles/data-governance/skills/01-data-catalog/README.md +112 -112
  79. package/.claude/roles/data-governance/skills/02-data-lineage/README.md +129 -129
  80. package/.claude/roles/data-governance/skills/03-data-quality-framework/README.md +182 -182
  81. package/.claude/roles/data-governance/skills/04-access-control/README.md +39 -39
  82. package/.claude/roles/data-governance/skills/05-master-data-management/README.md +40 -40
  83. package/.claude/roles/data-governance/skills/06-compliance-privacy/README.md +46 -46
  84. package/.claude/roles/data-scientist/skills/01-eda-automation/README.md +230 -230
  85. package/.claude/roles/data-scientist/skills/01-eda-automation/eda_generator.py +446 -446
  86. package/.claude/roles/data-scientist/skills/02-statistical-modeling/README.md +264 -264
  87. package/.claude/roles/data-scientist/skills/03-feature-engineering/README.md +264 -264
  88. package/.claude/roles/data-scientist/skills/04-predictive-modeling/README.md +264 -264
  89. package/.claude/roles/data-scientist/skills/05-customer-analytics/README.md +264 -264
  90. package/.claude/roles/data-scientist/skills/06-campaign-analysis/README.md +264 -264
  91. package/.claude/roles/data-scientist/skills/07-experimentation/README.md +264 -264
  92. package/.claude/roles/data-scientist/skills/08-data-visualization/README.md +264 -264
  93. package/.claude/roles/devops/skills/01-cicd-pipeline/README.md +264 -264
  94. package/.claude/roles/devops/skills/02-container-orchestration/README.md +264 -264
  95. package/.claude/roles/devops/skills/03-infrastructure-as-code/README.md +264 -264
  96. package/.claude/roles/devops/skills/04-gitops/README.md +264 -264
  97. package/.claude/roles/devops/skills/05-environment-management/README.md +264 -264
  98. package/.claude/roles/devops/skills/06-automated-testing/README.md +264 -264
  99. package/.claude/roles/devops/skills/07-release-management/README.md +264 -264
  100. package/.claude/roles/devops/skills/08-monitoring-alerting/README.md +264 -264
  101. package/.claude/roles/devops/skills/09-devsecops/README.md +265 -265
  102. package/.claude/roles/finops/skills/01-cost-visibility/README.md +264 -264
  103. package/.claude/roles/finops/skills/02-resource-tagging/README.md +264 -264
  104. package/.claude/roles/finops/skills/03-budget-management/README.md +264 -264
  105. package/.claude/roles/finops/skills/04-reserved-instances/README.md +264 -264
  106. package/.claude/roles/finops/skills/05-spot-optimization/README.md +264 -264
  107. package/.claude/roles/finops/skills/06-storage-tiering/README.md +264 -264
  108. package/.claude/roles/finops/skills/07-compute-rightsizing/README.md +264 -264
  109. package/.claude/roles/finops/skills/08-chargeback/README.md +264 -264
  110. package/.claude/roles/ml-engineer/skills/01-mlops-pipeline/README.md +566 -566
  111. package/.claude/roles/ml-engineer/skills/02-feature-engineering/README.md +655 -655
  112. package/.claude/roles/ml-engineer/skills/03-model-training/README.md +704 -704
  113. package/.claude/roles/ml-engineer/skills/04-model-serving/README.md +845 -845
  114. package/.claude/roles/ml-engineer/skills/05-model-monitoring/README.md +874 -874
  115. package/.claude/roles/mlops/skills/01-ml-pipeline-orchestration/README.md +264 -264
  116. package/.claude/roles/mlops/skills/02-experiment-tracking/README.md +264 -264
  117. package/.claude/roles/mlops/skills/03-model-registry/README.md +264 -264
  118. package/.claude/roles/mlops/skills/04-feature-store/README.md +264 -264
  119. package/.claude/roles/mlops/skills/05-model-deployment/README.md +264 -264
  120. package/.claude/roles/mlops/skills/06-model-observability/README.md +264 -264
  121. package/.claude/roles/mlops/skills/07-data-versioning/README.md +264 -264
  122. package/.claude/roles/mlops/skills/08-ab-testing/README.md +264 -264
  123. package/.claude/roles/mlops/skills/09-automated-retraining/README.md +264 -264
  124. package/.claude/roles/platform-engineer/skills/01-internal-developer-platform/README.md +153 -153
  125. package/.claude/roles/platform-engineer/skills/02-self-service-infrastructure/README.md +57 -57
  126. package/.claude/roles/platform-engineer/skills/03-slo-sli-management/README.md +59 -59
  127. package/.claude/roles/platform-engineer/skills/04-developer-experience/README.md +57 -57
  128. package/.claude/roles/platform-engineer/skills/05-incident-management/README.md +73 -73
  129. package/.claude/roles/platform-engineer/skills/06-capacity-management/README.md +59 -59
  130. package/.claude/roles/product-designer/skills/01-requirements-discovery/README.md +407 -407
  131. package/.claude/roles/product-designer/skills/02-user-research/README.md +382 -382
  132. package/.claude/roles/product-designer/skills/03-brainstorming-ideation/README.md +437 -437
  133. package/.claude/roles/product-designer/skills/04-ux-design/README.md +496 -496
  134. package/.claude/roles/product-designer/skills/05-product-market-fit/README.md +376 -376
  135. package/.claude/roles/product-designer/skills/06-stakeholder-management/README.md +412 -412
  136. package/.claude/roles/security-architect/skills/01-pii-detection/README.md +319 -319
  137. package/.claude/roles/security-architect/skills/02-threat-modeling/README.md +264 -264
  138. package/.claude/roles/security-architect/skills/03-infrastructure-security/README.md +264 -264
  139. package/.claude/roles/security-architect/skills/04-iam/README.md +264 -264
  140. package/.claude/roles/security-architect/skills/05-application-security/README.md +264 -264
  141. package/.claude/roles/security-architect/skills/06-secrets-management/README.md +264 -264
  142. package/.claude/roles/security-architect/skills/07-security-monitoring/README.md +264 -264
  143. package/.claude/roles/system-design/skills/01-architecture-patterns/README.md +337 -337
  144. package/.claude/roles/system-design/skills/02-requirements-engineering/README.md +264 -264
  145. package/.claude/roles/system-design/skills/03-scalability/README.md +264 -264
  146. package/.claude/roles/system-design/skills/04-high-availability/README.md +264 -264
  147. package/.claude/roles/system-design/skills/05-cost-optimization-design/README.md +264 -264
  148. package/.claude/roles/system-design/skills/06-api-design/README.md +264 -264
  149. package/.claude/roles/system-design/skills/07-observability-architecture/README.md +264 -264
  150. package/.claude/roles/system-design/skills/08-process-automation/PROCESS_TEMPLATE.md +336 -336
  151. package/.claude/roles/system-design/skills/08-process-automation/README.md +521 -521
  152. package/.claude/roles/system-design/skills/08-process-automation/ai_prompt_generator.py +744 -744
  153. package/.claude/roles/system-design/skills/08-process-automation/automation_recommender.py +688 -688
  154. package/.claude/roles/system-design/skills/08-process-automation/plan_generator.py +679 -679
  155. package/.claude/roles/system-design/skills/08-process-automation/process_analyzer.py +528 -528
  156. package/.claude/roles/system-design/skills/08-process-automation/process_parser.py +684 -684
  157. package/.claude/roles/system-design/skills/08-process-automation/role_matcher.py +615 -615
  158. package/.claude/skills/README.md +336 -336
  159. package/.claude/skills/ai-engineer.md +104 -104
  160. package/.claude/skills/aws.md +143 -143
  161. package/.claude/skills/azure.md +149 -149
  162. package/.claude/skills/backend-developer.md +108 -108
  163. package/.claude/skills/code-review.md +399 -399
  164. package/.claude/skills/compliance-automation.md +747 -747
  165. package/.claude/skills/compliance-officer.md +108 -108
  166. package/.claude/skills/data-engineer.md +113 -113
  167. package/.claude/skills/data-governance.md +102 -102
  168. package/.claude/skills/data-scientist.md +123 -123
  169. package/.claude/skills/database-admin.md +109 -109
  170. package/.claude/skills/devops.md +160 -160
  171. package/.claude/skills/docker.md +160 -160
  172. package/.claude/skills/enterprise-dashboard.md +613 -613
  173. package/.claude/skills/finops.md +184 -184
  174. package/.claude/skills/frontend-developer.md +108 -108
  175. package/.claude/skills/gcp.md +143 -143
  176. package/.claude/skills/ml-engineer.md +115 -115
  177. package/.claude/skills/mlops.md +187 -187
  178. package/.claude/skills/network-engineer.md +109 -109
  179. package/.claude/skills/optimization-advisor.md +329 -329
  180. package/.claude/skills/orchestrator.md +623 -623
  181. package/.claude/skills/platform-engineer.md +102 -102
  182. package/.claude/skills/process-automation.md +226 -226
  183. package/.claude/skills/process-changelog.md +184 -184
  184. package/.claude/skills/process-documentation.md +484 -484
  185. package/.claude/skills/process-kanban.md +324 -324
  186. package/.claude/skills/process-versioning.md +214 -214
  187. package/.claude/skills/product-designer.md +104 -104
  188. package/.claude/skills/project-starter.md +443 -443
  189. package/.claude/skills/qa-engineer.md +109 -109
  190. package/.claude/skills/security-architect.md +135 -135
  191. package/.claude/skills/sre.md +109 -109
  192. package/.claude/skills/system-design.md +126 -126
  193. package/.claude/skills/technical-writer.md +101 -101
  194. package/.gitattributes +2 -2
  195. package/GITHUB_COPILOT.md +106 -106
  196. package/README.md +192 -184
  197. package/package.json +16 -8
@@ -1,337 +1,337 @@
"""
Bronze Layer: Raw Data Ingestion
Ingest data from multiple sources with validation and error handling.
"""

import json
import os
from datetime import datetime
from pathlib import Path
from typing import Dict, Any, List, Optional, Union
import pandas as pd
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
from pyspark.sql import functions as F
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class BronzeLoader:
    """
    Bronze layer ingestion with schema validation and audit logging.

    Bronze layer principles:
    - Append-only (preserve full history)
    - Raw data with minimal transformation
    - Add metadata (ingestion timestamp, source, file name)
    - Schema validation
    - Error quarantine
    """

    def __init__(
        self,
        spark: Optional[SparkSession] = None,
        bronze_path: str = "/lakehouse/bronze",
        quarantine_path: str = "/lakehouse/quarantine"
    ):
        """
        Initialize Bronze loader.

        Args:
            spark: SparkSession (creates one if not provided)
            bronze_path: Path to bronze layer storage
            quarantine_path: Path for invalid records
        """
        self.spark = spark or self._create_spark_session()
        self.bronze_path = bronze_path
        self.quarantine_path = quarantine_path

        # Create directories if they don't exist
        Path(bronze_path).mkdir(parents=True, exist_ok=True)
        Path(quarantine_path).mkdir(parents=True, exist_ok=True)

    def _create_spark_session(self) -> SparkSession:
        """Create Spark session with Delta Lake support."""
        return SparkSession.builder \
            .appName("BronzeIngestion") \
            .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
            .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
            .config("spark.databricks.delta.retentionDurationCheck.enabled", "false") \
            .getOrCreate()

    def ingest_from_source(
        self,
        source_path: str,
        table_name: str,
        source_format: str = "json",
        schema: Optional[StructType] = None,
        options: Optional[Dict[str, str]] = None
    ) -> Dict[str, Any]:
        """
        Ingest data from source into Bronze layer.

        Args:
            source_path: Path to source data
            table_name: Name for bronze table
            source_format: Format (json, csv, parquet, etc.)
            schema: Optional schema to enforce
            options: Additional read options

        Returns:
            Ingestion metrics
        """
        logger.info(f"Starting ingestion: {table_name} from {source_path}")

        try:
            # Read source data
            df = self._read_source(source_path, source_format, schema, options)

            # Add bronze layer metadata
            df_bronze = self._add_bronze_metadata(df, source_path, table_name)

            # Validate schema if provided
            if schema:
                df_bronze = self._validate_schema(df_bronze, schema)

            # Write to bronze layer
            bronze_table_path = f"{self.bronze_path}/{table_name}"

            df_bronze.write \
                .format("delta") \
                .mode("append") \
                .option("mergeSchema", "true") \
                .save(bronze_table_path)

            # Collect metrics
            record_count = df_bronze.count()

            metrics = {
                "status": "success",
                "table_name": table_name,
                "records_ingested": record_count,
                "source_path": source_path,
                "ingestion_timestamp": datetime.now().isoformat(),
                "bronze_path": bronze_table_path
            }

            logger.info(f"✅ Successfully ingested {record_count} records to {table_name}")

            return metrics

        except Exception as e:
            logger.error(f"❌ Ingestion failed: {str(e)}")

            return {
                "status": "failed",
                "table_name": table_name,
                "error": str(e),
                "ingestion_timestamp": datetime.now().isoformat()
            }

    def _read_source(
        self,
        source_path: str,
        source_format: str,
        schema: Optional[StructType] = None,
        options: Optional[Dict[str, str]] = None
    ) -> DataFrame:
        """Read data from source."""
        options = options or {}

        reader = self.spark.read.format(source_format)

        if schema:
            reader = reader.schema(schema)

        for key, value in options.items():
            reader = reader.option(key, value)

        return reader.load(source_path)

    def _add_bronze_metadata(
        self,
        df: DataFrame,
        source_path: str,
        table_name: str
    ) -> DataFrame:
        """Add bronze layer audit columns."""
        return df \
            .withColumn("_bronze_ingestion_timestamp", F.current_timestamp()) \
            .withColumn("_bronze_source_path", F.lit(source_path)) \
            .withColumn("_bronze_table_name", F.lit(table_name)) \
            .withColumn("_bronze_ingestion_date", F.current_date())

    def _validate_schema(
        self,
        df: DataFrame,
        expected_schema: StructType
    ) -> DataFrame:
        """
        Validate DataFrame against expected schema.

        Quarantine records that don't match schema.
        """
        # In production, implement sophisticated schema validation
        # For now, we return the df as-is
        return df

    def ingest_csv(
        self,
        csv_path: str,
        table_name: str,
        delimiter: str = ",",
        header: bool = True,
        schema: Optional[StructType] = None
    ) -> Dict[str, Any]:
        """Convenience method for CSV ingestion."""
        options = {
            "delimiter": delimiter,
            "header": str(header).lower(),
            "inferSchema": "true" if schema is None else "false"
        }

        return self.ingest_from_source(
            source_path=csv_path,
            table_name=table_name,
            source_format="csv",
            schema=schema,
            options=options
        )

    def ingest_json(
        self,
        json_path: str,
        table_name: str,
        multiline: bool = False,
        schema: Optional[StructType] = None
    ) -> Dict[str, Any]:
        """Convenience method for JSON ingestion."""
        options = {
            "multiLine": str(multiline).lower()
        }

        return self.ingest_from_source(
            source_path=json_path,
            table_name=table_name,
            source_format="json",
            schema=schema,
            options=options
        )

    def ingest_parquet(
        self,
        parquet_path: str,
        table_name: str
    ) -> Dict[str, Any]:
        """Convenience method for Parquet ingestion."""
        return self.ingest_from_source(
            source_path=parquet_path,
            table_name=table_name,
            source_format="parquet"
        )

    def create_bronze_table(
        self,
        table_name: str,
        schema: StructType,
        partition_by: Optional[List[str]] = None
    ) -> None:
        """Create an empty bronze table with schema."""
        bronze_table_path = f"{self.bronze_path}/{table_name}"

        # Create empty DataFrame with schema
        empty_df = self.spark.createDataFrame([], schema)

        # Add bronze metadata columns
        bronze_df = self._add_bronze_metadata(empty_df, "initialized", table_name)

        # Write table
        writer = bronze_df.write.format("delta").mode("overwrite")

        if partition_by:
            writer = writer.partitionBy(*partition_by)

        writer.save(bronze_table_path)

        logger.info(f"✅ Created bronze table: {table_name}")


# Example CRM schema
CRM_LEADS_SCHEMA = StructType([
    StructField("lead_id", StringType(), False),
    StructField("email", StringType(), True),
    StructField("company", StringType(), True),
    StructField("industry", StringType(), True),
    StructField("company_size", StringType(), True),
    StructField("job_title", StringType(), True),
    StructField("lead_source", StringType(), True),
    StructField("created_date", TimestampType(), True),
    StructField("lead_score", IntegerType(), True),
    StructField("status", StringType(), True)
])


# Example usage
if __name__ == "__main__":
    print("=" * 80)
    print("Bronze Layer Ingestion Demo")
    print("=" * 80)

    # Create sample data
    sample_data = [
        {
            "lead_id": "L001",
            "email": "john@techcorp.com",
            "company": "TechCorp",
            "industry": "Software",
            "company_size": "100-500",
            "job_title": "Data Scientist",
            "lead_source": "Website",
            "created_date": "2025-01-15T10:30:00",
            "lead_score": 85,
            "status": "New"
        },
        {
            "lead_id": "L002",
            "email": "sarah@datainc.com",
            "company": "Data Inc",
            "industry": "Analytics",
            "company_size": "50-100",
            "job_title": "ML Engineer",
            "lead_source": "LinkedIn",
            "created_date": "2025-01-16T14:20:00",
            "lead_score": 92,
            "status": "Qualified"
        }
    ]

    # Save as JSON
    sample_path = "/tmp/sample_crm_leads.json"
    with open(sample_path, 'w') as f:
        json.dump(sample_data, f)

    # Initialize Bronze loader
    bronze = BronzeLoader(
        bronze_path="./lakehouse/bronze",
        quarantine_path="./lakehouse/quarantine"
    )

    # Ingest data
    metrics = bronze.ingest_json(
        json_path=sample_path,
        table_name="crm_leads",
        multiline=True,
        schema=CRM_LEADS_SCHEMA
    )

    print("\n📊 Ingestion Metrics:")
    print(json.dumps(metrics, indent=2))

    # Query bronze table
    print("\n📋 Bronze Table Sample:")
    bronze_df = bronze.spark.read.format("delta").load("./lakehouse/bronze/crm_leads")
    bronze_df.show(truncate=False)

    print(f"\nBronze table row count: {bronze_df.count()}")
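A note on running the demo: `_create_spark_session` enables the Delta SQL extensions but does not put the Delta Lake jars on the classpath, so a bare local run of this script may fail with a ClassNotFoundException on a plain PySpark install. A minimal sketch of one way to build an equivalent session locally, assuming the `delta-spark` pip package is installed (the `configure_spark_with_delta_pip` helper comes from that package, not from tech-hub-skills):

from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession

# Same app name and Delta settings as BronzeLoader._create_spark_session.
builder = (
    SparkSession.builder.appName("BronzeIngestion")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog",
            "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# configure_spark_with_delta_pip injects the matching Delta jars
# via spark.jars.packages before the session is created.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Pass the session in explicitly so BronzeLoader skips its own builder:
# bronze = BronzeLoader(spark=spark, bronze_path="./lakehouse/bronze",
#                       quarantine_path="./lakehouse/quarantine")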
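`_validate_schema` ships as a pass-through stub ("In production, implement sophisticated schema validation"). A minimal sketch of the quarantine pattern its docstring promises, assuming "invalid" means a non-nullable field is null; the function name and that rule are illustrative, not part of the published package:

from functools import reduce

from pyspark.sql import DataFrame
from pyspark.sql import functions as F
from pyspark.sql.types import StructType


def validate_and_quarantine(df: DataFrame, expected_schema: StructType,
                            quarantine_path: str) -> DataFrame:
    """Quarantine rows whose required (non-nullable) fields are null."""
    required = [
        f.name for f in expected_schema.fields
        if not f.nullable and f.name in df.columns
    ]
    if not required:
        return df

    # A row is invalid if any required column is null.
    is_invalid = reduce(lambda a, b: a | b,
                        [F.col(c).isNull() for c in required])

    invalid_df = df.filter(is_invalid)
    if invalid_df.limit(1).count() > 0:
        # Preserve bad rows for inspection instead of dropping them.
        invalid_df.write.format("delta").mode("append").save(quarantine_path)

    return df.filter(~is_invalid)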
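The `_bronze_*` audit columns make batch-level operations on the append-only table straightforward. A short usage sketch against the demo table, assuming the `delta-spark` package for `DeltaTable`; the path and date are placeholders taken from the demo above:

from delta.tables import DeltaTable
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()  # assumes a Delta-enabled session, as above
bronze_path = "./lakehouse/bronze/crm_leads"  # demo path from __main__

# Inspect everything that landed on a given ingestion date.
day = (spark.read.format("delta").load(bronze_path)
       .where("_bronze_ingestion_date = DATE'2025-01-15'"))
day.show(truncate=False)

# Replay a bad batch: delete that date's rows, then re-run ingest_json for it.
DeltaTable.forPath(spark, bronze_path).delete(
    "_bronze_ingestion_date = DATE'2025-01-15'"
)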