tech-hub-skills 1.2.0 → 1.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. package/.claude/README.md +291 -0
  2. package/.claude/bin/cli.js +266 -0
  3. package/.claude/package.json +46 -0
  4. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_ab_tester.py +356 -0
  5. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_template_manager.py +274 -0
  6. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/token_cost_estimator.py +324 -0
  7. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/document_chunker.py +336 -0
  8. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/rag_pipeline.sql +213 -0
  9. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/bronze_ingestion.py +337 -0
  10. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/medallion_queries.sql +300 -0
  11. package/.claude/roles/data-scientist/skills/01-eda-automation/eda_generator.py +446 -0
  12. package/.claude/roles/system-design/skills/08-process-automation/ai_prompt_generator.py +744 -0
  13. package/.claude/roles/system-design/skills/08-process-automation/automation_recommender.py +688 -0
  14. package/.claude/roles/system-design/skills/08-process-automation/plan_generator.py +679 -0
  15. package/.claude/roles/system-design/skills/08-process-automation/process_analyzer.py +528 -0
  16. package/.claude/roles/system-design/skills/08-process-automation/process_parser.py +684 -0
  17. package/.claude/roles/system-design/skills/08-process-automation/role_matcher.py +615 -0
  18. package/.claude/skills/README.md +336 -0
  19. package/.claude/skills/ai-engineer.md +104 -0
  20. package/.claude/skills/aws.md +143 -0
  21. package/.claude/skills/azure.md +149 -0
  22. package/.claude/skills/backend-developer.md +108 -0
  23. package/.claude/skills/code-review.md +399 -0
  24. package/.claude/skills/compliance-automation.md +747 -0
  25. package/.claude/skills/compliance-officer.md +108 -0
  26. package/.claude/skills/data-engineer.md +113 -0
  27. package/.claude/skills/data-governance.md +102 -0
  28. package/.claude/skills/data-scientist.md +123 -0
  29. package/.claude/skills/database-admin.md +109 -0
  30. package/.claude/skills/devops.md +160 -0
  31. package/.claude/skills/docker.md +160 -0
  32. package/.claude/skills/enterprise-dashboard.md +613 -0
  33. package/.claude/skills/finops.md +184 -0
  34. package/.claude/skills/frontend-developer.md +108 -0
  35. package/.claude/skills/gcp.md +143 -0
  36. package/.claude/skills/ml-engineer.md +115 -0
  37. package/.claude/skills/mlops.md +187 -0
  38. package/.claude/skills/network-engineer.md +109 -0
  39. package/.claude/skills/optimization-advisor.md +329 -0
  40. package/.claude/skills/orchestrator.md +623 -0
  41. package/.claude/skills/platform-engineer.md +102 -0
  42. package/.claude/skills/process-automation.md +226 -0
  43. package/.claude/skills/process-changelog.md +184 -0
  44. package/.claude/skills/process-documentation.md +484 -0
  45. package/.claude/skills/process-kanban.md +324 -0
  46. package/.claude/skills/process-versioning.md +214 -0
  47. package/.claude/skills/product-designer.md +104 -0
  48. package/.claude/skills/project-starter.md +443 -0
  49. package/.claude/skills/qa-engineer.md +109 -0
  50. package/.claude/skills/security-architect.md +135 -0
  51. package/.claude/skills/sre.md +109 -0
  52. package/.claude/skills/system-design.md +126 -0
  53. package/.claude/skills/technical-writer.md +101 -0
  54. package/.gitattributes +2 -0
  55. package/GITHUB_COPILOT.md +106 -0
  56. package/README.md +117 -224
  57. package/package.json +4 -42
  58. package/bin/cli.js +0 -241
  59. /package/{LICENSE → .claude/LICENSE} +0 -0
  60. /package/{bin → .claude/bin}/copilot.js +0 -0
  61. /package/{bin → .claude/bin}/postinstall.js +0 -0
  62. /package/{tech_hub_skills/skills → .claude/commands}/README.md +0 -0
  63. /package/{tech_hub_skills/skills → .claude/commands}/ai-engineer.md +0 -0
  64. /package/{tech_hub_skills/skills → .claude/commands}/aws.md +0 -0
  65. /package/{tech_hub_skills/skills → .claude/commands}/azure.md +0 -0
  66. /package/{tech_hub_skills/skills → .claude/commands}/backend-developer.md +0 -0
  67. /package/{tech_hub_skills/skills → .claude/commands}/code-review.md +0 -0
  68. /package/{tech_hub_skills/skills → .claude/commands}/compliance-automation.md +0 -0
  69. /package/{tech_hub_skills/skills → .claude/commands}/compliance-officer.md +0 -0
  70. /package/{tech_hub_skills/skills → .claude/commands}/data-engineer.md +0 -0
  71. /package/{tech_hub_skills/skills → .claude/commands}/data-governance.md +0 -0
  72. /package/{tech_hub_skills/skills → .claude/commands}/data-scientist.md +0 -0
  73. /package/{tech_hub_skills/skills → .claude/commands}/database-admin.md +0 -0
  74. /package/{tech_hub_skills/skills → .claude/commands}/devops.md +0 -0
  75. /package/{tech_hub_skills/skills → .claude/commands}/docker.md +0 -0
  76. /package/{tech_hub_skills/skills → .claude/commands}/enterprise-dashboard.md +0 -0
  77. /package/{tech_hub_skills/skills → .claude/commands}/finops.md +0 -0
  78. /package/{tech_hub_skills/skills → .claude/commands}/frontend-developer.md +0 -0
  79. /package/{tech_hub_skills/skills → .claude/commands}/gcp.md +0 -0
  80. /package/{tech_hub_skills/skills → .claude/commands}/ml-engineer.md +0 -0
  81. /package/{tech_hub_skills/skills → .claude/commands}/mlops.md +0 -0
  82. /package/{tech_hub_skills/skills → .claude/commands}/network-engineer.md +0 -0
  83. /package/{tech_hub_skills/skills → .claude/commands}/optimization-advisor.md +0 -0
  84. /package/{tech_hub_skills/skills → .claude/commands}/orchestrator.md +0 -0
  85. /package/{tech_hub_skills/skills → .claude/commands}/platform-engineer.md +0 -0
  86. /package/{tech_hub_skills/skills → .claude/commands}/process-automation.md +0 -0
  87. /package/{tech_hub_skills/skills → .claude/commands}/process-changelog.md +0 -0
  88. /package/{tech_hub_skills/skills → .claude/commands}/process-documentation.md +0 -0
  89. /package/{tech_hub_skills/skills → .claude/commands}/process-kanban.md +0 -0
  90. /package/{tech_hub_skills/skills → .claude/commands}/process-versioning.md +0 -0
  91. /package/{tech_hub_skills/skills → .claude/commands}/product-designer.md +0 -0
  92. /package/{tech_hub_skills/skills → .claude/commands}/project-starter.md +0 -0
  93. /package/{tech_hub_skills/skills → .claude/commands}/qa-engineer.md +0 -0
  94. /package/{tech_hub_skills/skills → .claude/commands}/security-architect.md +0 -0
  95. /package/{tech_hub_skills/skills → .claude/commands}/sre.md +0 -0
  96. /package/{tech_hub_skills/skills → .claude/commands}/system-design.md +0 -0
  97. /package/{tech_hub_skills/skills → .claude/commands}/technical-writer.md +0 -0
  98. /package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/01-prompt-engineering/README.md +0 -0
  99. /package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/02-rag-pipeline/README.md +0 -0
  100. /package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/03-agent-orchestration/README.md +0 -0
  101. /package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/04-llm-guardrails/README.md +0 -0
  102. /package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/05-vector-embeddings/README.md +0 -0
  103. /package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/06-llm-evaluation/README.md +0 -0
  104. /package/{tech_hub_skills → .claude}/roles/azure/skills/01-infrastructure-fundamentals/README.md +0 -0
  105. /package/{tech_hub_skills → .claude}/roles/azure/skills/02-data-factory/README.md +0 -0
  106. /package/{tech_hub_skills → .claude}/roles/azure/skills/03-synapse-analytics/README.md +0 -0
  107. /package/{tech_hub_skills → .claude}/roles/azure/skills/04-databricks/README.md +0 -0
  108. /package/{tech_hub_skills → .claude}/roles/azure/skills/05-functions/README.md +0 -0
  109. /package/{tech_hub_skills → .claude}/roles/azure/skills/06-kubernetes-service/README.md +0 -0
  110. /package/{tech_hub_skills → .claude}/roles/azure/skills/07-openai-service/README.md +0 -0
  111. /package/{tech_hub_skills → .claude}/roles/azure/skills/08-machine-learning/README.md +0 -0
  112. /package/{tech_hub_skills → .claude}/roles/azure/skills/09-storage-adls/README.md +0 -0
  113. /package/{tech_hub_skills → .claude}/roles/azure/skills/10-networking/README.md +0 -0
  114. /package/{tech_hub_skills → .claude}/roles/azure/skills/11-sql-cosmos/README.md +0 -0
  115. /package/{tech_hub_skills → .claude}/roles/azure/skills/12-event-hubs/README.md +0 -0
  116. /package/{tech_hub_skills → .claude}/roles/code-review/skills/01-automated-code-review/README.md +0 -0
  117. /package/{tech_hub_skills → .claude}/roles/code-review/skills/02-pr-review-workflow/README.md +0 -0
  118. /package/{tech_hub_skills → .claude}/roles/code-review/skills/03-code-quality-gates/README.md +0 -0
  119. /package/{tech_hub_skills → .claude}/roles/code-review/skills/04-reviewer-assignment/README.md +0 -0
  120. /package/{tech_hub_skills → .claude}/roles/code-review/skills/05-review-analytics/README.md +0 -0
  121. /package/{tech_hub_skills → .claude}/roles/data-engineer/skills/01-lakehouse-architecture/README.md +0 -0
  122. /package/{tech_hub_skills → .claude}/roles/data-engineer/skills/02-etl-pipeline/README.md +0 -0
  123. /package/{tech_hub_skills → .claude}/roles/data-engineer/skills/03-data-quality/README.md +0 -0
  124. /package/{tech_hub_skills → .claude}/roles/data-engineer/skills/04-streaming-pipelines/README.md +0 -0
  125. /package/{tech_hub_skills → .claude}/roles/data-engineer/skills/05-performance-optimization/README.md +0 -0
  126. /package/{tech_hub_skills → .claude}/roles/data-governance/skills/01-data-catalog/README.md +0 -0
  127. /package/{tech_hub_skills → .claude}/roles/data-governance/skills/02-data-lineage/README.md +0 -0
  128. /package/{tech_hub_skills → .claude}/roles/data-governance/skills/03-data-quality-framework/README.md +0 -0
  129. /package/{tech_hub_skills → .claude}/roles/data-governance/skills/04-access-control/README.md +0 -0
  130. /package/{tech_hub_skills → .claude}/roles/data-governance/skills/05-master-data-management/README.md +0 -0
  131. /package/{tech_hub_skills → .claude}/roles/data-governance/skills/06-compliance-privacy/README.md +0 -0
  132. /package/{tech_hub_skills → .claude}/roles/data-scientist/skills/01-eda-automation/README.md +0 -0
  133. /package/{tech_hub_skills → .claude}/roles/data-scientist/skills/02-statistical-modeling/README.md +0 -0
  134. /package/{tech_hub_skills → .claude}/roles/data-scientist/skills/03-feature-engineering/README.md +0 -0
  135. /package/{tech_hub_skills → .claude}/roles/data-scientist/skills/04-predictive-modeling/README.md +0 -0
  136. /package/{tech_hub_skills → .claude}/roles/data-scientist/skills/05-customer-analytics/README.md +0 -0
  137. /package/{tech_hub_skills → .claude}/roles/data-scientist/skills/06-campaign-analysis/README.md +0 -0
  138. /package/{tech_hub_skills → .claude}/roles/data-scientist/skills/07-experimentation/README.md +0 -0
  139. /package/{tech_hub_skills → .claude}/roles/data-scientist/skills/08-data-visualization/README.md +0 -0
  140. /package/{tech_hub_skills → .claude}/roles/devops/skills/01-cicd-pipeline/README.md +0 -0
  141. /package/{tech_hub_skills → .claude}/roles/devops/skills/02-container-orchestration/README.md +0 -0
  142. /package/{tech_hub_skills → .claude}/roles/devops/skills/03-infrastructure-as-code/README.md +0 -0
  143. /package/{tech_hub_skills → .claude}/roles/devops/skills/04-gitops/README.md +0 -0
  144. /package/{tech_hub_skills → .claude}/roles/devops/skills/05-environment-management/README.md +0 -0
  145. /package/{tech_hub_skills → .claude}/roles/devops/skills/06-automated-testing/README.md +0 -0
  146. /package/{tech_hub_skills → .claude}/roles/devops/skills/07-release-management/README.md +0 -0
  147. /package/{tech_hub_skills → .claude}/roles/devops/skills/08-monitoring-alerting/README.md +0 -0
  148. /package/{tech_hub_skills → .claude}/roles/devops/skills/09-devsecops/README.md +0 -0
  149. /package/{tech_hub_skills → .claude}/roles/finops/skills/01-cost-visibility/README.md +0 -0
  150. /package/{tech_hub_skills → .claude}/roles/finops/skills/02-resource-tagging/README.md +0 -0
  151. /package/{tech_hub_skills → .claude}/roles/finops/skills/03-budget-management/README.md +0 -0
  152. /package/{tech_hub_skills → .claude}/roles/finops/skills/04-reserved-instances/README.md +0 -0
  153. /package/{tech_hub_skills → .claude}/roles/finops/skills/05-spot-optimization/README.md +0 -0
  154. /package/{tech_hub_skills → .claude}/roles/finops/skills/06-storage-tiering/README.md +0 -0
  155. /package/{tech_hub_skills → .claude}/roles/finops/skills/07-compute-rightsizing/README.md +0 -0
  156. /package/{tech_hub_skills → .claude}/roles/finops/skills/08-chargeback/README.md +0 -0
  157. /package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/01-mlops-pipeline/README.md +0 -0
  158. /package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/02-feature-engineering/README.md +0 -0
  159. /package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/03-model-training/README.md +0 -0
  160. /package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/04-model-serving/README.md +0 -0
  161. /package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/05-model-monitoring/README.md +0 -0
  162. /package/{tech_hub_skills → .claude}/roles/mlops/skills/01-ml-pipeline-orchestration/README.md +0 -0
  163. /package/{tech_hub_skills → .claude}/roles/mlops/skills/02-experiment-tracking/README.md +0 -0
  164. /package/{tech_hub_skills → .claude}/roles/mlops/skills/03-model-registry/README.md +0 -0
  165. /package/{tech_hub_skills → .claude}/roles/mlops/skills/04-feature-store/README.md +0 -0
  166. /package/{tech_hub_skills → .claude}/roles/mlops/skills/05-model-deployment/README.md +0 -0
  167. /package/{tech_hub_skills → .claude}/roles/mlops/skills/06-model-observability/README.md +0 -0
  168. /package/{tech_hub_skills → .claude}/roles/mlops/skills/07-data-versioning/README.md +0 -0
  169. /package/{tech_hub_skills → .claude}/roles/mlops/skills/08-ab-testing/README.md +0 -0
  170. /package/{tech_hub_skills → .claude}/roles/mlops/skills/09-automated-retraining/README.md +0 -0
  171. /package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/01-internal-developer-platform/README.md +0 -0
  172. /package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/02-self-service-infrastructure/README.md +0 -0
  173. /package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/03-slo-sli-management/README.md +0 -0
  174. /package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/04-developer-experience/README.md +0 -0
  175. /package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/05-incident-management/README.md +0 -0
  176. /package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/06-capacity-management/README.md +0 -0
  177. /package/{tech_hub_skills → .claude}/roles/product-designer/skills/01-requirements-discovery/README.md +0 -0
  178. /package/{tech_hub_skills → .claude}/roles/product-designer/skills/02-user-research/README.md +0 -0
  179. /package/{tech_hub_skills → .claude}/roles/product-designer/skills/03-brainstorming-ideation/README.md +0 -0
  180. /package/{tech_hub_skills → .claude}/roles/product-designer/skills/04-ux-design/README.md +0 -0
  181. /package/{tech_hub_skills → .claude}/roles/product-designer/skills/05-product-market-fit/README.md +0 -0
  182. /package/{tech_hub_skills → .claude}/roles/product-designer/skills/06-stakeholder-management/README.md +0 -0
  183. /package/{tech_hub_skills → .claude}/roles/security-architect/skills/01-pii-detection/README.md +0 -0
  184. /package/{tech_hub_skills → .claude}/roles/security-architect/skills/02-threat-modeling/README.md +0 -0
  185. /package/{tech_hub_skills → .claude}/roles/security-architect/skills/03-infrastructure-security/README.md +0 -0
  186. /package/{tech_hub_skills → .claude}/roles/security-architect/skills/04-iam/README.md +0 -0
  187. /package/{tech_hub_skills → .claude}/roles/security-architect/skills/05-application-security/README.md +0 -0
  188. /package/{tech_hub_skills → .claude}/roles/security-architect/skills/06-secrets-management/README.md +0 -0
  189. /package/{tech_hub_skills → .claude}/roles/security-architect/skills/07-security-monitoring/README.md +0 -0
  190. /package/{tech_hub_skills → .claude}/roles/system-design/skills/01-architecture-patterns/README.md +0 -0
  191. /package/{tech_hub_skills → .claude}/roles/system-design/skills/02-requirements-engineering/README.md +0 -0
  192. /package/{tech_hub_skills → .claude}/roles/system-design/skills/03-scalability/README.md +0 -0
  193. /package/{tech_hub_skills → .claude}/roles/system-design/skills/04-high-availability/README.md +0 -0
  194. /package/{tech_hub_skills → .claude}/roles/system-design/skills/05-cost-optimization-design/README.md +0 -0
  195. /package/{tech_hub_skills → .claude}/roles/system-design/skills/06-api-design/README.md +0 -0
  196. /package/{tech_hub_skills → .claude}/roles/system-design/skills/07-observability-architecture/README.md +0 -0
  197. /package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/PROCESS_TEMPLATE.md +0 -0
  198. /package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/README.md +0 -0
package/.claude/roles/ai-engineer/skills/02-rag-pipeline/rag_pipeline.sql
@@ -0,0 +1,213 @@
+ -- RAG Pipeline Analytics Queries
+ -- Track knowledge base usage, query patterns, and performance
+
+ -- ================================================================
+ -- 1. KNOWLEDGE BASE INVENTORY
+ -- ================================================================
+
+ -- Count documents by source
+ SELECT
+     source_type,
+     COUNT(*) as document_count,
+     SUM(chunk_count) as total_chunks,
+     AVG(chunk_count) as avg_chunks_per_doc,
+     MAX(last_updated) as latest_update
+ FROM knowledge_base_documents
+ GROUP BY source_type
+ ORDER BY document_count DESC;
+
+ -- ================================================================
+ -- 2. QUERY ANALYTICS
+ -- ================================================================
+
+ -- Top queries by frequency (last 30 days)
+ SELECT
+     query_text,
+     COUNT(*) as query_count,
+     AVG(latency_ms) as avg_latency_ms,
+     AVG(relevance_score) as avg_relevance,
+     COUNT(DISTINCT user_id) as unique_users
+ FROM rag_query_log
+ WHERE query_timestamp >= CURRENT_DATE - INTERVAL '30 days'
+ GROUP BY query_text
+ HAVING COUNT(*) > 5
+ ORDER BY query_count DESC
+ LIMIT 20;
+
+ -- ================================================================
+ -- 3. RETRIEVAL PERFORMANCE
+ -- ================================================================
+
+ -- Retrieval performance by top_k setting
+ SELECT
+     top_k,
+     COUNT(*) as query_count,
+     AVG(latency_ms) as avg_latency_ms,
+     PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY latency_ms) as p50_latency,
+     PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY latency_ms) as p95_latency,
+     PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY latency_ms) as p99_latency,
+     AVG(relevance_score) as avg_relevance_score
+ FROM rag_query_log
+ WHERE query_timestamp >= CURRENT_DATE - INTERVAL '7 days'
+ GROUP BY top_k
+ ORDER BY top_k;
+
+ -- ================================================================
+ -- 4. SOURCE ATTRIBUTION
+ -- ================================================================
+
+ -- Which documents are most frequently retrieved?
+ SELECT
+     d.document_id,
+     d.title,
+     d.source_type,
+     COUNT(*) as retrieval_count,
+     AVG(r.relevance_score) as avg_relevance,
+     MAX(r.query_timestamp) as last_retrieved
+ FROM rag_retrievals r
+ JOIN knowledge_base_documents d ON r.document_id = d.document_id
+ WHERE r.query_timestamp >= CURRENT_DATE - INTERVAL '30 days'
+ GROUP BY d.document_id, d.title, d.source_type
+ ORDER BY retrieval_count DESC
+ LIMIT 50;
+
+ -- ================================================================
+ -- 5. USER ENGAGEMENT
+ -- ================================================================
+
+ -- User engagement with RAG system
+ SELECT
+     DATE_TRUNC('day', query_timestamp) as query_date,
+     COUNT(DISTINCT user_id) as unique_users,
+     COUNT(*) as total_queries,
+     COUNT(*) / COUNT(DISTINCT user_id) as queries_per_user,
+     AVG(relevance_score) as avg_relevance
+ FROM rag_query_log
+ WHERE query_timestamp >= CURRENT_DATE - INTERVAL '90 days'
+ GROUP BY DATE_TRUNC('day', query_timestamp)
+ ORDER BY query_date DESC;
+
+ -- ================================================================
+ -- 6. CHUNK PERFORMANCE
+ -- ================================================================
+
+ -- Which chunk size performs best?
+ SELECT
+     c.chunk_size_range,
+     COUNT(DISTINCT r.query_id) as query_count,
+     AVG(r.relevance_score) as avg_relevance,
+     AVG(r.rank_position) as avg_rank
+ FROM rag_retrievals r
+ JOIN knowledge_base_chunks c ON r.chunk_id = c.chunk_id
+ WHERE r.query_timestamp >= CURRENT_DATE - INTERVAL '30 days'
+ GROUP BY c.chunk_size_range
+ ORDER BY avg_relevance DESC;
+
+ -- ================================================================
+ -- 7. FAILED QUERIES
+ -- ================================================================
+
+ -- Queries with low relevance (need improvement)
+ SELECT
+     query_text,
+     COUNT(*) as failure_count,
+     AVG(relevance_score) as avg_relevance,
+     MIN(relevance_score) as min_relevance,
+     MAX(query_timestamp) as last_failed
+ FROM rag_query_log
+ WHERE relevance_score < 0.5
+   AND query_timestamp >= CURRENT_DATE - INTERVAL '7 days'
+ GROUP BY query_text
+ HAVING COUNT(*) > 2
+ ORDER BY failure_count DESC
+ LIMIT 30;
+
+ -- ================================================================
+ -- 8. EMBEDDING MODEL PERFORMANCE
+ -- ================================================================
+
+ -- Compare performance across embedding models
+ SELECT
+     embedding_model,
+     COUNT(*) as query_count,
+     AVG(embedding_latency_ms) as avg_embedding_latency,
+     AVG(retrieval_latency_ms) as avg_retrieval_latency,
+     AVG(relevance_score) as avg_relevance
+ FROM rag_query_log
+ WHERE query_timestamp >= CURRENT_DATE - INTERVAL '30 days'
+ GROUP BY embedding_model
+ ORDER BY avg_relevance DESC;
+
+ -- ================================================================
+ -- 9. KNOWLEDGE GAPS
+ -- ================================================================
+
+ -- Identify topics with no good answers
+ WITH poor_coverage AS (
+     SELECT
+         query_text,
+         COUNT(*) as frequency,
+         AVG(relevance_score) as avg_relevance
+     FROM rag_query_log
+     WHERE query_timestamp >= CURRENT_DATE - INTERVAL '30 days'
+     GROUP BY query_text
+     HAVING AVG(relevance_score) < 0.6 AND COUNT(*) > 3
+ )
+ SELECT
+     query_text,
+     frequency,
+     avg_relevance,
+     'Add documentation' as recommendation
+ FROM poor_coverage
+ ORDER BY frequency DESC;
+
+ -- ================================================================
+ -- 10. RAG PIPELINE HEALTH
+ -- ================================================================
+
+ -- Daily RAG pipeline health metrics
+ SELECT
+     DATE(query_timestamp) as date,
+     COUNT(*) as total_queries,
+     AVG(total_latency_ms) as avg_latency_ms,
+     AVG(relevance_score) as avg_relevance,
+     PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY total_latency_ms) as p95_latency,
+     COUNT(CASE WHEN total_latency_ms > 1000 THEN 1 END) as slow_queries,
+     COUNT(CASE WHEN relevance_score < 0.5 THEN 1 END) as low_relevance_queries
+ FROM rag_query_log
+ WHERE query_timestamp >= CURRENT_DATE - INTERVAL '14 days'
+ GROUP BY DATE(query_timestamp)
+ ORDER BY date DESC;
+
+ -- ================================================================
+ -- 11. VECTOR DATABASE STATISTICS
+ -- ================================================================
+
+ -- Vector database usage statistics
+ SELECT
+     collection_name,
+     COUNT(DISTINCT vector_id) as total_vectors,
+     AVG(vector_dimension) as avg_dimension,
+     MAX(last_updated) as last_updated,
+     SUM(storage_bytes) / (1024*1024) as storage_mb
+ FROM vector_database_collections
+ GROUP BY collection_name
+ ORDER BY total_vectors DESC;
+
+ -- ================================================================
+ -- 12. COST TRACKING
+ -- ================================================================
+
+ -- Estimated costs by provider
+ SELECT
+     DATE(query_timestamp) as date,
+     llm_provider,
+     COUNT(*) as query_count,
+     SUM(input_tokens) as total_input_tokens,
+     SUM(output_tokens) as total_output_tokens,
+     SUM(estimated_cost) as total_cost,
+     AVG(estimated_cost) as avg_cost_per_query
+ FROM rag_query_log
+ WHERE query_timestamp >= CURRENT_DATE - INTERVAL '30 days'
+ GROUP BY DATE(query_timestamp), llm_provider
+ ORDER BY date DESC, total_cost DESC;
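
Note: the queries above reference logging tables (rag_query_log, rag_retrievals, knowledge_base_documents, knowledge_base_chunks, vector_database_collections) whose DDL is not included in this diff. A minimal sketch of the rag_query_log schema they appear to assume, with illustrative column types only, might look like:

-- Hypothetical DDL, not part of the package; names and types are inferred from the queries above
CREATE TABLE IF NOT EXISTS rag_query_log (
    query_id              BIGINT,
    user_id               TEXT,
    query_text            TEXT,
    query_timestamp       TIMESTAMP,
    top_k                 INTEGER,
    embedding_model       TEXT,
    llm_provider          TEXT,
    latency_ms            DOUBLE PRECISION,
    embedding_latency_ms  DOUBLE PRECISION,
    retrieval_latency_ms  DOUBLE PRECISION,
    total_latency_ms      DOUBLE PRECISION,
    relevance_score       DOUBLE PRECISION,
    input_tokens          INTEGER,
    output_tokens         INTEGER,
    estimated_cost        NUMERIC(12, 6)
);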
package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/bronze_ingestion.py
@@ -0,0 +1,337 @@
+ """
+ Bronze Layer: Raw Data Ingestion
+ Ingest data from multiple sources with validation and error handling.
+ """
+
+ import json
+ import os
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Dict, Any, List, Optional, Union
+ import pandas as pd
+ from pyspark.sql import SparkSession, DataFrame
+ from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
+ from pyspark.sql import functions as F
+ import logging
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+
+ class BronzeLoader:
+     """
+     Bronze layer ingestion with schema validation and audit logging.
+
+     Bronze layer principles:
+     - Append-only (preserve full history)
+     - Raw data with minimal transformation
+     - Add metadata (ingestion timestamp, source, file name)
+     - Schema validation
+     - Error quarantine
+     """
+
+     def __init__(
+         self,
+         spark: Optional[SparkSession] = None,
+         bronze_path: str = "/lakehouse/bronze",
+         quarantine_path: str = "/lakehouse/quarantine"
+     ):
+         """
+         Initialize Bronze loader.
+
+         Args:
+             spark: SparkSession (creates one if not provided)
+             bronze_path: Path to bronze layer storage
+             quarantine_path: Path for invalid records
+         """
+         self.spark = spark or self._create_spark_session()
+         self.bronze_path = bronze_path
+         self.quarantine_path = quarantine_path
+
+         # Create directories if they don't exist
+         Path(bronze_path).mkdir(parents=True, exist_ok=True)
+         Path(quarantine_path).mkdir(parents=True, exist_ok=True)
+
+     def _create_spark_session(self) -> SparkSession:
+         """Create Spark session with Delta Lake support."""
+         return SparkSession.builder \
+             .appName("BronzeIngestion") \
+             .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
+             .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
+             .config("spark.databricks.delta.retentionDurationCheck.enabled", "false") \
+             .getOrCreate()
+
+     def ingest_from_source(
+         self,
+         source_path: str,
+         table_name: str,
+         source_format: str = "json",
+         schema: Optional[StructType] = None,
+         options: Optional[Dict[str, str]] = None
+     ) -> Dict[str, Any]:
+         """
+         Ingest data from source into Bronze layer.
+
+         Args:
+             source_path: Path to source data
+             table_name: Name for bronze table
+             source_format: Format (json, csv, parquet, etc.)
+             schema: Optional schema to enforce
+             options: Additional read options
+
+         Returns:
+             Ingestion metrics
+         """
+         logger.info(f"Starting ingestion: {table_name} from {source_path}")
+
+         try:
+             # Read source data
+             df = self._read_source(source_path, source_format, schema, options)
+
+             # Add bronze layer metadata
+             df_bronze = self._add_bronze_metadata(df, source_path, table_name)
+
+             # Validate schema if provided
+             if schema:
+                 df_bronze = self._validate_schema(df_bronze, schema)
+
+             # Write to bronze layer
+             bronze_table_path = f"{self.bronze_path}/{table_name}"
+
+             df_bronze.write \
+                 .format("delta") \
+                 .mode("append") \
+                 .option("mergeSchema", "true") \
+                 .save(bronze_table_path)
+
+             # Collect metrics
+             record_count = df_bronze.count()
+
+             metrics = {
+                 "status": "success",
+                 "table_name": table_name,
+                 "records_ingested": record_count,
+                 "source_path": source_path,
+                 "ingestion_timestamp": datetime.now().isoformat(),
+                 "bronze_path": bronze_table_path
+             }
+
+             logger.info(f"✅ Successfully ingested {record_count} records to {table_name}")
+
+             return metrics
+
+         except Exception as e:
+             logger.error(f"❌ Ingestion failed: {str(e)}")
+
+             return {
+                 "status": "failed",
+                 "table_name": table_name,
+                 "error": str(e),
+                 "ingestion_timestamp": datetime.now().isoformat()
+             }
+
+     def _read_source(
+         self,
+         source_path: str,
+         source_format: str,
+         schema: Optional[StructType] = None,
+         options: Optional[Dict[str, str]] = None
+     ) -> DataFrame:
+         """Read data from source."""
+         options = options or {}
+
+         reader = self.spark.read.format(source_format)
+
+         if schema:
+             reader = reader.schema(schema)
+
+         for key, value in options.items():
+             reader = reader.option(key, value)
+
+         return reader.load(source_path)
+
+     def _add_bronze_metadata(
+         self,
+         df: DataFrame,
+         source_path: str,
+         table_name: str
+     ) -> DataFrame:
+         """Add bronze layer audit columns."""
+         return df \
+             .withColumn("_bronze_ingestion_timestamp", F.current_timestamp()) \
+             .withColumn("_bronze_source_path", F.lit(source_path)) \
+             .withColumn("_bronze_table_name", F.lit(table_name)) \
+             .withColumn("_bronze_ingestion_date", F.current_date())
+
+     def _validate_schema(
+         self,
+         df: DataFrame,
+         expected_schema: StructType
+     ) -> DataFrame:
+         """
+         Validate DataFrame against expected schema.
+
+         Quarantine records that don't match schema.
+         """
+         # In production, implement sophisticated schema validation
+         # For now, we return the df as-is
+         return df
+
+     def ingest_csv(
+         self,
+         csv_path: str,
+         table_name: str,
+         delimiter: str = ",",
+         header: bool = True,
+         schema: Optional[StructType] = None
+     ) -> Dict[str, Any]:
+         """Convenience method for CSV ingestion."""
+         options = {
+             "delimiter": delimiter,
+             "header": str(header).lower(),
+             "inferSchema": "true" if schema is None else "false"
+         }
+
+         return self.ingest_from_source(
+             source_path=csv_path,
+             table_name=table_name,
+             source_format="csv",
+             schema=schema,
+             options=options
+         )
+
+     def ingest_json(
+         self,
+         json_path: str,
+         table_name: str,
+         multiline: bool = False,
+         schema: Optional[StructType] = None
+     ) -> Dict[str, Any]:
+         """Convenience method for JSON ingestion."""
+         options = {
+             "multiLine": str(multiline).lower()
+         }
+
+         return self.ingest_from_source(
+             source_path=json_path,
+             table_name=table_name,
+             source_format="json",
+             schema=schema,
+             options=options
+         )
+
+     def ingest_parquet(
+         self,
+         parquet_path: str,
+         table_name: str
+     ) -> Dict[str, Any]:
+         """Convenience method for Parquet ingestion."""
+         return self.ingest_from_source(
+             source_path=parquet_path,
+             table_name=table_name,
+             source_format="parquet"
+         )
+
+     def create_bronze_table(
+         self,
+         table_name: str,
+         schema: StructType,
+         partition_by: Optional[List[str]] = None
+     ) -> None:
+         """Create an empty bronze table with schema."""
+         bronze_table_path = f"{self.bronze_path}/{table_name}"
+
+         # Create empty DataFrame with schema
+         empty_df = self.spark.createDataFrame([], schema)
+
+         # Add bronze metadata columns
+         bronze_df = self._add_bronze_metadata(empty_df, "initialized", table_name)
+
+         # Write table
+         writer = bronze_df.write.format("delta").mode("overwrite")
+
+         if partition_by:
+             writer = writer.partitionBy(*partition_by)
+
+         writer.save(bronze_table_path)
+
+         logger.info(f"✅ Created bronze table: {table_name}")
+
+
+ # Example CRM schema
+ CRM_LEADS_SCHEMA = StructType([
+     StructField("lead_id", StringType(), False),
+     StructField("email", StringType(), True),
+     StructField("company", StringType(), True),
+     StructField("industry", StringType(), True),
+     StructField("company_size", StringType(), True),
+     StructField("job_title", StringType(), True),
+     StructField("lead_source", StringType(), True),
+     StructField("created_date", TimestampType(), True),
+     StructField("lead_score", IntegerType(), True),
+     StructField("status", StringType(), True)
+ ])
+
+
+ # Example usage
+ if __name__ == "__main__":
+     print("=" * 80)
+     print("Bronze Layer Ingestion Demo")
+     print("=" * 80)
+
+     # Create sample data
+     sample_data = [
+         {
+             "lead_id": "L001",
+             "email": "john@techcorp.com",
+             "company": "TechCorp",
+             "industry": "Software",
+             "company_size": "100-500",
+             "job_title": "Data Scientist",
+             "lead_source": "Website",
+             "created_date": "2025-01-15T10:30:00",
+             "lead_score": 85,
+             "status": "New"
+         },
+         {
+             "lead_id": "L002",
+             "email": "sarah@datainc.com",
+             "company": "Data Inc",
+             "industry": "Analytics",
+             "company_size": "50-100",
+             "job_title": "ML Engineer",
+             "lead_source": "LinkedIn",
+             "created_date": "2025-01-16T14:20:00",
+             "lead_score": 92,
+             "status": "Qualified"
+         }
+     ]
+
+     # Save as JSON
+     sample_path = "/tmp/sample_crm_leads.json"
+     with open(sample_path, 'w') as f:
+         json.dump(sample_data, f)
+
+     # Initialize Bronze loader
+     bronze = BronzeLoader(
+         bronze_path="./lakehouse/bronze",
+         quarantine_path="./lakehouse/quarantine"
+     )
+
+     # Ingest data
+     metrics = bronze.ingest_json(
+         json_path=sample_path,
+         table_name="crm_leads",
+         multiline=True,
+         schema=CRM_LEADS_SCHEMA
+     )
+
+     print("\n📊 Ingestion Metrics:")
+     print(json.dumps(metrics, indent=2))
+
+     # Query bronze table
+     print("\n📋 Bronze Table Sample:")
+     bronze_df = bronze.spark.read.format("delta").load("./lakehouse/bronze/crm_leads")
+     bronze_df.show(truncate=False)
+
+     print(f"\nBronze table row count: {bronze_df.count()}")