tech-hub-skills 1.2.0 → 1.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198) hide show
  1. package/{LICENSE → .claude/LICENSE} +21 -21
  2. package/.claude/README.md +291 -0
  3. package/.claude/bin/cli.js +266 -0
  4. package/{bin → .claude/bin}/copilot.js +182 -182
  5. package/{bin → .claude/bin}/postinstall.js +42 -42
  6. package/{tech_hub_skills/skills → .claude/commands}/README.md +336 -336
  7. package/{tech_hub_skills/skills → .claude/commands}/ai-engineer.md +104 -104
  8. package/{tech_hub_skills/skills → .claude/commands}/aws.md +143 -143
  9. package/{tech_hub_skills/skills → .claude/commands}/azure.md +149 -149
  10. package/{tech_hub_skills/skills → .claude/commands}/backend-developer.md +108 -108
  11. package/{tech_hub_skills/skills → .claude/commands}/code-review.md +399 -399
  12. package/{tech_hub_skills/skills → .claude/commands}/compliance-automation.md +747 -747
  13. package/{tech_hub_skills/skills → .claude/commands}/compliance-officer.md +108 -108
  14. package/{tech_hub_skills/skills → .claude/commands}/data-engineer.md +113 -113
  15. package/{tech_hub_skills/skills → .claude/commands}/data-governance.md +102 -102
  16. package/{tech_hub_skills/skills → .claude/commands}/data-scientist.md +123 -123
  17. package/{tech_hub_skills/skills → .claude/commands}/database-admin.md +109 -109
  18. package/{tech_hub_skills/skills → .claude/commands}/devops.md +160 -160
  19. package/{tech_hub_skills/skills → .claude/commands}/docker.md +160 -160
  20. package/{tech_hub_skills/skills → .claude/commands}/enterprise-dashboard.md +613 -613
  21. package/{tech_hub_skills/skills → .claude/commands}/finops.md +184 -184
  22. package/{tech_hub_skills/skills → .claude/commands}/frontend-developer.md +108 -108
  23. package/{tech_hub_skills/skills → .claude/commands}/gcp.md +143 -143
  24. package/{tech_hub_skills/skills → .claude/commands}/ml-engineer.md +115 -115
  25. package/{tech_hub_skills/skills → .claude/commands}/mlops.md +187 -187
  26. package/{tech_hub_skills/skills → .claude/commands}/network-engineer.md +109 -109
  27. package/{tech_hub_skills/skills → .claude/commands}/optimization-advisor.md +329 -329
  28. package/{tech_hub_skills/skills → .claude/commands}/orchestrator.md +623 -623
  29. package/{tech_hub_skills/skills → .claude/commands}/platform-engineer.md +102 -102
  30. package/{tech_hub_skills/skills → .claude/commands}/process-automation.md +226 -226
  31. package/{tech_hub_skills/skills → .claude/commands}/process-changelog.md +184 -184
  32. package/{tech_hub_skills/skills → .claude/commands}/process-documentation.md +484 -484
  33. package/{tech_hub_skills/skills → .claude/commands}/process-kanban.md +324 -324
  34. package/{tech_hub_skills/skills → .claude/commands}/process-versioning.md +214 -214
  35. package/{tech_hub_skills/skills → .claude/commands}/product-designer.md +104 -104
  36. package/{tech_hub_skills/skills → .claude/commands}/project-starter.md +443 -443
  37. package/{tech_hub_skills/skills → .claude/commands}/qa-engineer.md +109 -109
  38. package/{tech_hub_skills/skills → .claude/commands}/security-architect.md +135 -135
  39. package/{tech_hub_skills/skills → .claude/commands}/sre.md +109 -109
  40. package/{tech_hub_skills/skills → .claude/commands}/system-design.md +126 -126
  41. package/{tech_hub_skills/skills → .claude/commands}/technical-writer.md +101 -101
  42. package/.claude/package.json +46 -0
  43. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/01-prompt-engineering/README.md +252 -252
  44. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_ab_tester.py +356 -0
  45. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_template_manager.py +274 -0
  46. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/token_cost_estimator.py +324 -0
  47. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/02-rag-pipeline/README.md +448 -448
  48. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/document_chunker.py +336 -0
  49. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/rag_pipeline.sql +213 -0
  50. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/03-agent-orchestration/README.md +599 -599
  51. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/04-llm-guardrails/README.md +735 -735
  52. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/05-vector-embeddings/README.md +711 -711
  53. package/{tech_hub_skills → .claude}/roles/ai-engineer/skills/06-llm-evaluation/README.md +777 -777
  54. package/{tech_hub_skills → .claude}/roles/azure/skills/01-infrastructure-fundamentals/README.md +264 -264
  55. package/{tech_hub_skills → .claude}/roles/azure/skills/02-data-factory/README.md +264 -264
  56. package/{tech_hub_skills → .claude}/roles/azure/skills/03-synapse-analytics/README.md +264 -264
  57. package/{tech_hub_skills → .claude}/roles/azure/skills/04-databricks/README.md +264 -264
  58. package/{tech_hub_skills → .claude}/roles/azure/skills/05-functions/README.md +264 -264
  59. package/{tech_hub_skills → .claude}/roles/azure/skills/06-kubernetes-service/README.md +264 -264
  60. package/{tech_hub_skills → .claude}/roles/azure/skills/07-openai-service/README.md +264 -264
  61. package/{tech_hub_skills → .claude}/roles/azure/skills/08-machine-learning/README.md +264 -264
  62. package/{tech_hub_skills → .claude}/roles/azure/skills/09-storage-adls/README.md +264 -264
  63. package/{tech_hub_skills → .claude}/roles/azure/skills/10-networking/README.md +264 -264
  64. package/{tech_hub_skills → .claude}/roles/azure/skills/11-sql-cosmos/README.md +264 -264
  65. package/{tech_hub_skills → .claude}/roles/azure/skills/12-event-hubs/README.md +264 -264
  66. package/{tech_hub_skills → .claude}/roles/code-review/skills/01-automated-code-review/README.md +394 -394
  67. package/{tech_hub_skills → .claude}/roles/code-review/skills/02-pr-review-workflow/README.md +427 -427
  68. package/{tech_hub_skills → .claude}/roles/code-review/skills/03-code-quality-gates/README.md +518 -518
  69. package/{tech_hub_skills → .claude}/roles/code-review/skills/04-reviewer-assignment/README.md +504 -504
  70. package/{tech_hub_skills → .claude}/roles/code-review/skills/05-review-analytics/README.md +540 -540
  71. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/01-lakehouse-architecture/README.md +550 -550
  72. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/bronze_ingestion.py +337 -0
  73. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/medallion_queries.sql +300 -0
  74. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/02-etl-pipeline/README.md +580 -580
  75. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/03-data-quality/README.md +579 -579
  76. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/04-streaming-pipelines/README.md +608 -608
  77. package/{tech_hub_skills → .claude}/roles/data-engineer/skills/05-performance-optimization/README.md +547 -547
  78. package/{tech_hub_skills → .claude}/roles/data-governance/skills/01-data-catalog/README.md +112 -112
  79. package/{tech_hub_skills → .claude}/roles/data-governance/skills/02-data-lineage/README.md +129 -129
  80. package/{tech_hub_skills → .claude}/roles/data-governance/skills/03-data-quality-framework/README.md +182 -182
  81. package/{tech_hub_skills → .claude}/roles/data-governance/skills/04-access-control/README.md +39 -39
  82. package/{tech_hub_skills → .claude}/roles/data-governance/skills/05-master-data-management/README.md +40 -40
  83. package/{tech_hub_skills → .claude}/roles/data-governance/skills/06-compliance-privacy/README.md +46 -46
  84. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/01-eda-automation/README.md +230 -230
  85. package/.claude/roles/data-scientist/skills/01-eda-automation/eda_generator.py +446 -0
  86. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/02-statistical-modeling/README.md +264 -264
  87. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/03-feature-engineering/README.md +264 -264
  88. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/04-predictive-modeling/README.md +264 -264
  89. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/05-customer-analytics/README.md +264 -264
  90. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/06-campaign-analysis/README.md +264 -264
  91. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/07-experimentation/README.md +264 -264
  92. package/{tech_hub_skills → .claude}/roles/data-scientist/skills/08-data-visualization/README.md +264 -264
  93. package/{tech_hub_skills → .claude}/roles/devops/skills/01-cicd-pipeline/README.md +264 -264
  94. package/{tech_hub_skills → .claude}/roles/devops/skills/02-container-orchestration/README.md +264 -264
  95. package/{tech_hub_skills → .claude}/roles/devops/skills/03-infrastructure-as-code/README.md +264 -264
  96. package/{tech_hub_skills → .claude}/roles/devops/skills/04-gitops/README.md +264 -264
  97. package/{tech_hub_skills → .claude}/roles/devops/skills/05-environment-management/README.md +264 -264
  98. package/{tech_hub_skills → .claude}/roles/devops/skills/06-automated-testing/README.md +264 -264
  99. package/{tech_hub_skills → .claude}/roles/devops/skills/07-release-management/README.md +264 -264
  100. package/{tech_hub_skills → .claude}/roles/devops/skills/08-monitoring-alerting/README.md +264 -264
  101. package/{tech_hub_skills → .claude}/roles/devops/skills/09-devsecops/README.md +265 -265
  102. package/{tech_hub_skills → .claude}/roles/finops/skills/01-cost-visibility/README.md +264 -264
  103. package/{tech_hub_skills → .claude}/roles/finops/skills/02-resource-tagging/README.md +264 -264
  104. package/{tech_hub_skills → .claude}/roles/finops/skills/03-budget-management/README.md +264 -264
  105. package/{tech_hub_skills → .claude}/roles/finops/skills/04-reserved-instances/README.md +264 -264
  106. package/{tech_hub_skills → .claude}/roles/finops/skills/05-spot-optimization/README.md +264 -264
  107. package/{tech_hub_skills → .claude}/roles/finops/skills/06-storage-tiering/README.md +264 -264
  108. package/{tech_hub_skills → .claude}/roles/finops/skills/07-compute-rightsizing/README.md +264 -264
  109. package/{tech_hub_skills → .claude}/roles/finops/skills/08-chargeback/README.md +264 -264
  110. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/01-mlops-pipeline/README.md +566 -566
  111. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/02-feature-engineering/README.md +655 -655
  112. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/03-model-training/README.md +704 -704
  113. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/04-model-serving/README.md +845 -845
  114. package/{tech_hub_skills → .claude}/roles/ml-engineer/skills/05-model-monitoring/README.md +874 -874
  115. package/{tech_hub_skills → .claude}/roles/mlops/skills/01-ml-pipeline-orchestration/README.md +264 -264
  116. package/{tech_hub_skills → .claude}/roles/mlops/skills/02-experiment-tracking/README.md +264 -264
  117. package/{tech_hub_skills → .claude}/roles/mlops/skills/03-model-registry/README.md +264 -264
  118. package/{tech_hub_skills → .claude}/roles/mlops/skills/04-feature-store/README.md +264 -264
  119. package/{tech_hub_skills → .claude}/roles/mlops/skills/05-model-deployment/README.md +264 -264
  120. package/{tech_hub_skills → .claude}/roles/mlops/skills/06-model-observability/README.md +264 -264
  121. package/{tech_hub_skills → .claude}/roles/mlops/skills/07-data-versioning/README.md +264 -264
  122. package/{tech_hub_skills → .claude}/roles/mlops/skills/08-ab-testing/README.md +264 -264
  123. package/{tech_hub_skills → .claude}/roles/mlops/skills/09-automated-retraining/README.md +264 -264
  124. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/01-internal-developer-platform/README.md +153 -153
  125. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/02-self-service-infrastructure/README.md +57 -57
  126. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/03-slo-sli-management/README.md +59 -59
  127. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/04-developer-experience/README.md +57 -57
  128. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/05-incident-management/README.md +73 -73
  129. package/{tech_hub_skills → .claude}/roles/platform-engineer/skills/06-capacity-management/README.md +59 -59
  130. package/{tech_hub_skills → .claude}/roles/product-designer/skills/01-requirements-discovery/README.md +407 -407
  131. package/{tech_hub_skills → .claude}/roles/product-designer/skills/02-user-research/README.md +382 -382
  132. package/{tech_hub_skills → .claude}/roles/product-designer/skills/03-brainstorming-ideation/README.md +437 -437
  133. package/{tech_hub_skills → .claude}/roles/product-designer/skills/04-ux-design/README.md +496 -496
  134. package/{tech_hub_skills → .claude}/roles/product-designer/skills/05-product-market-fit/README.md +376 -376
  135. package/{tech_hub_skills → .claude}/roles/product-designer/skills/06-stakeholder-management/README.md +412 -412
  136. package/{tech_hub_skills → .claude}/roles/security-architect/skills/01-pii-detection/README.md +319 -319
  137. package/{tech_hub_skills → .claude}/roles/security-architect/skills/02-threat-modeling/README.md +264 -264
  138. package/{tech_hub_skills → .claude}/roles/security-architect/skills/03-infrastructure-security/README.md +264 -264
  139. package/{tech_hub_skills → .claude}/roles/security-architect/skills/04-iam/README.md +264 -264
  140. package/{tech_hub_skills → .claude}/roles/security-architect/skills/05-application-security/README.md +264 -264
  141. package/{tech_hub_skills → .claude}/roles/security-architect/skills/06-secrets-management/README.md +264 -264
  142. package/{tech_hub_skills → .claude}/roles/security-architect/skills/07-security-monitoring/README.md +264 -264
  143. package/{tech_hub_skills → .claude}/roles/system-design/skills/01-architecture-patterns/README.md +337 -337
  144. package/{tech_hub_skills → .claude}/roles/system-design/skills/02-requirements-engineering/README.md +264 -264
  145. package/{tech_hub_skills → .claude}/roles/system-design/skills/03-scalability/README.md +264 -264
  146. package/{tech_hub_skills → .claude}/roles/system-design/skills/04-high-availability/README.md +264 -264
  147. package/{tech_hub_skills → .claude}/roles/system-design/skills/05-cost-optimization-design/README.md +264 -264
  148. package/{tech_hub_skills → .claude}/roles/system-design/skills/06-api-design/README.md +264 -264
  149. package/{tech_hub_skills → .claude}/roles/system-design/skills/07-observability-architecture/README.md +264 -264
  150. package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/PROCESS_TEMPLATE.md +336 -336
  151. package/{tech_hub_skills → .claude}/roles/system-design/skills/08-process-automation/README.md +521 -521
  152. package/.claude/roles/system-design/skills/08-process-automation/ai_prompt_generator.py +744 -0
  153. package/.claude/roles/system-design/skills/08-process-automation/automation_recommender.py +688 -0
  154. package/.claude/roles/system-design/skills/08-process-automation/plan_generator.py +679 -0
  155. package/.claude/roles/system-design/skills/08-process-automation/process_analyzer.py +528 -0
  156. package/.claude/roles/system-design/skills/08-process-automation/process_parser.py +684 -0
  157. package/.claude/roles/system-design/skills/08-process-automation/role_matcher.py +615 -0
  158. package/.claude/skills/README.md +336 -0
  159. package/.claude/skills/ai-engineer.md +104 -0
  160. package/.claude/skills/aws.md +143 -0
  161. package/.claude/skills/azure.md +149 -0
  162. package/.claude/skills/backend-developer.md +108 -0
  163. package/.claude/skills/code-review.md +399 -0
  164. package/.claude/skills/compliance-automation.md +747 -0
  165. package/.claude/skills/compliance-officer.md +108 -0
  166. package/.claude/skills/data-engineer.md +113 -0
  167. package/.claude/skills/data-governance.md +102 -0
  168. package/.claude/skills/data-scientist.md +123 -0
  169. package/.claude/skills/database-admin.md +109 -0
  170. package/.claude/skills/devops.md +160 -0
  171. package/.claude/skills/docker.md +160 -0
  172. package/.claude/skills/enterprise-dashboard.md +613 -0
  173. package/.claude/skills/finops.md +184 -0
  174. package/.claude/skills/frontend-developer.md +108 -0
  175. package/.claude/skills/gcp.md +143 -0
  176. package/.claude/skills/ml-engineer.md +115 -0
  177. package/.claude/skills/mlops.md +187 -0
  178. package/.claude/skills/network-engineer.md +109 -0
  179. package/.claude/skills/optimization-advisor.md +329 -0
  180. package/.claude/skills/orchestrator.md +623 -0
  181. package/.claude/skills/platform-engineer.md +102 -0
  182. package/.claude/skills/process-automation.md +226 -0
  183. package/.claude/skills/process-changelog.md +184 -0
  184. package/.claude/skills/process-documentation.md +484 -0
  185. package/.claude/skills/process-kanban.md +324 -0
  186. package/.claude/skills/process-versioning.md +214 -0
  187. package/.claude/skills/product-designer.md +104 -0
  188. package/.claude/skills/project-starter.md +443 -0
  189. package/.claude/skills/qa-engineer.md +109 -0
  190. package/.claude/skills/security-architect.md +135 -0
  191. package/.claude/skills/sre.md +109 -0
  192. package/.claude/skills/system-design.md +126 -0
  193. package/.claude/skills/technical-writer.md +101 -0
  194. package/.gitattributes +2 -0
  195. package/GITHUB_COPILOT.md +106 -0
  196. package/README.md +192 -291
  197. package/package.json +16 -46
  198. package/bin/cli.js +0 -241
@@ -0,0 +1,336 @@
1
+ """
2
+ Advanced Document Chunking for RAG Systems
3
+ Supports semantic, recursive, and fixed-size chunking strategies.
4
+ """
5
+
6
+ from typing import List, Dict, Any, Optional
7
+ from dataclasses import dataclass
8
+ from enum import Enum
9
+ import re
10
+ from langchain.text_splitter import (
11
+ RecursiveCharacterTextSplitter,
12
+ CharacterTextSplitter,
13
+ TokenTextSplitter
14
+ )
15
+
16
+
17
class ChunkStrategy(Enum):
    """Enumerates the splitting approaches a chunker can be configured with."""

    # Pieces of a fixed character/token size
    FIXED = "fixed"
    # Split along semantic boundaries (paragraphs, sentences)
    SEMANTIC = "semantic"
    # Recursive splitting that tries multiple separators
    RECURSIVE = "recursive"
    # Overlapping windows
    SLIDING_WINDOW = "sliding_window"
23
+
24
+
25
@dataclass
class Chunk:
    """
    A document chunk with metadata.

    Attributes:
        content: Text of the chunk.
        chunk_id: Identifier of this chunk.
        document_id: Identifier of the document the chunk was cut from.
        chunk_index: Position of the chunk within its document.
        metadata: Arbitrary key/value metadata attached to the chunk.
        char_count: Number of characters in ``content``. May be omitted (or
            passed as 0), in which case it is derived from ``content``.
        token_count: Optional token count; nothing in this class computes it.
    """
    content: str
    chunk_id: str
    document_id: str
    chunk_index: int
    metadata: Dict[str, Any]
    # Defaults to 0 so that __post_init__ can fill it in from the content;
    # callers that pass an explicit value keep working unchanged.
    char_count: int = 0
    token_count: Optional[int] = None

    def __post_init__(self):
        # 0 is treated as "not provided" and replaced by the real length.
        if self.char_count == 0:
            self.char_count = len(self.content)
39
+
40
+
41
class DocumentChunker:
    """
    Document chunker with multiple strategies.

    RECURSIVE and FIXED delegate to LangChain text splitters; SEMANTIC and
    SLIDING_WINDOW are implemented by this class.
    """

    def __init__(
        self,
        strategy: ChunkStrategy = ChunkStrategy.RECURSIVE,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        separators: Optional[List[str]] = None
    ):
        """
        Initialize document chunker.

        Args:
            strategy: Chunking strategy to use
            chunk_size: Target chunk size (characters or tokens)
            chunk_overlap: Overlap between chunks
            separators: Custom separators for recursive splitting
        """
        self.strategy = strategy
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.separators = separators or ["\n\n", "\n", ". ", " ", ""]

        self._init_splitter()

    def _init_splitter(self):
        """
        Initialize the text splitter for the configured strategy.

        Only RECURSIVE and FIXED use a LangChain splitter. SEMANTIC and
        SLIDING_WINDOW use the custom methods of this class, so no splitter
        is built for them and ``self.splitter`` is None.
        """
        if self.strategy == ChunkStrategy.RECURSIVE:
            self.splitter = RecursiveCharacterTextSplitter(
                chunk_size=self.chunk_size,
                chunk_overlap=self.chunk_overlap,
                separators=self.separators,
                length_function=len
            )
        elif self.strategy == ChunkStrategy.FIXED:
            self.splitter = CharacterTextSplitter(
                chunk_size=self.chunk_size,
                chunk_overlap=self.chunk_overlap,
                separator="\n"
            )
        else:
            # chunk_document never consults the splitter for these strategies.
            self.splitter = None

    def chunk_document(
        self,
        text: str,
        document_id: str,
        metadata: Optional[Dict[str, Any]] = None
    ) -> List[Chunk]:
        """
        Chunk a document into smaller pieces.

        Args:
            text: Document text
            document_id: Unique document identifier
            metadata: Additional metadata; copied into every chunk together
                with a "strategy" key holding the strategy value

        Returns:
            List of Chunk objects, with ids of the form
            "<document_id>_chunk_<index>"
        """
        metadata = metadata or {}

        if self.strategy == ChunkStrategy.SEMANTIC:
            text_chunks = self._semantic_chunking(text)
        elif self.strategy == ChunkStrategy.SLIDING_WINDOW:
            text_chunks = self._sliding_window_chunking(text)
        else:
            text_chunks = self.splitter.split_text(text)

        return [
            Chunk(
                content=chunk_text,
                chunk_id=f"{document_id}_chunk_{idx}",
                document_id=document_id,
                chunk_index=idx,
                # Fresh dict per chunk so chunks never share a metadata object.
                metadata={**metadata, "strategy": self.strategy.value},
                char_count=len(chunk_text)
            )
            for idx, chunk_text in enumerate(text_chunks)
        ]

    @staticmethod
    def _pack_pieces(pieces: List[str], joiner: str, limit: int) -> List[str]:
        """
        Greedily join consecutive pieces so each joined string fits in ``limit``.

        The joiner's length is counted, so a joined result only exceeds
        ``limit`` when a single piece is itself longer than ``limit``; such a
        piece is emitted on its own.
        """
        packed = []
        bucket = []
        bucket_len = 0

        for piece in pieces:
            if bucket:
                projected = bucket_len + len(joiner) + len(piece)
            else:
                projected = len(piece)

            if bucket and projected > limit:
                packed.append(joiner.join(bucket))
                bucket = [piece]
                bucket_len = len(piece)
            else:
                bucket.append(piece)
                bucket_len = projected

        if bucket:
            packed.append(joiner.join(bucket))

        return packed

    def _semantic_chunking(self, text: str) -> List[str]:
        """
        Chunk by semantic boundaries (paragraphs with context).

        This strategy:
        1. Splits on paragraph boundaries (blank lines)
        2. Combines consecutive small paragraphs up to chunk_size
        3. Splits a paragraph longer than chunk_size into sentences and
           combines those up to chunk_size

        A single sentence longer than chunk_size is not split further and is
        emitted as one oversized chunk. chunk_overlap is not used here.
        """
        paragraphs = [p.strip() for p in re.split(r'\n\s*\n', text)]

        chunks = []
        pending = []  # run of consecutive paragraphs that each fit on their own

        for para in paragraphs:
            if not para:
                continue

            if len(para) <= self.chunk_size:
                pending.append(para)
                continue

            # Oversized paragraph: flush what came before it to preserve
            # document order, then pack its sentences.
            chunks.extend(self._pack_pieces(pending, "\n\n", self.chunk_size))
            pending = []

            sentences = re.split(r'(?<=[.!?])\s+', para)
            chunks.extend(self._pack_pieces(sentences, " ", self.chunk_size))

        chunks.extend(self._pack_pieces(pending, "\n\n", self.chunk_size))

        return chunks

    def _sliding_window_chunking(self, text: str) -> List[str]:
        """
        Create overlapping chunks with sliding window.

        Each window is at most chunk_size characters. A window that is not
        the last one is shortened to end just after its last ". " when that
        position lies in the second half of the window. The next window
        starts chunk_overlap characters before the end of the current one.

        Raises:
            ValueError: If chunk_size is not positive, or chunk_overlap is
                negative or not smaller than chunk_size (the window could
                not advance and the loop would never finish).
        """
        if self.chunk_size <= 0:
            raise ValueError("chunk_size must be positive for sliding-window chunking")
        if not 0 <= self.chunk_overlap < self.chunk_size:
            raise ValueError(
                "chunk_overlap must be >= 0 and smaller than chunk_size "
                "for sliding-window chunking"
            )

        chunks = []
        text_length = len(text)
        start = 0

        while start < text_length:
            end = start + self.chunk_size
            window = text[start:end]

            # Try to end at sentence boundary
            if end < text_length:
                last_period = window.rfind('. ')
                # Only trim when the trimmed window is still longer than the
                # overlap; otherwise the next start would not move forward.
                if (
                    last_period > self.chunk_size // 2
                    and last_period + 1 > self.chunk_overlap
                ):
                    window = window[:last_period + 1]
                    end = start + last_period + 1

            window = window.strip()
            if window:  # skip whitespace-only windows
                chunks.append(window)

            # The window reached the end of the text; another pass would only
            # repeat the tail already contained in this chunk.
            if end >= text_length:
                break

            # Move start forward (with overlap)
            start = end - self.chunk_overlap

        return chunks

    def chunk_multiple_documents(
        self,
        documents: List[Dict[str, Any]]
    ) -> List[Chunk]:
        """
        Chunk multiple documents.

        Args:
            documents: List of dicts with 'id', 'text', and optional 'metadata'

        Returns:
            List of all chunks, in document order
        """
        all_chunks = []

        for doc in documents:
            all_chunks.extend(
                self.chunk_document(
                    text=doc['text'],
                    document_id=doc['id'],
                    metadata=doc.get('metadata', {})
                )
            )

        return all_chunks

    def get_chunk_statistics(self, chunks: List[Chunk]) -> Dict[str, Any]:
        """
        Get statistics about chunks.

        Returns:
            An empty dict when ``chunks`` is empty; otherwise a dict with
            total_chunks, total_characters, avg_chunk_size, min_chunk_size,
            max_chunk_size, unique_documents and strategy.
        """
        if not chunks:
            return {}

        char_counts = [c.char_count for c in chunks]
        total_characters = sum(char_counts)

        return {
            "total_chunks": len(chunks),
            "total_characters": total_characters,
            "avg_chunk_size": total_characters / len(chunks),
            "min_chunk_size": min(char_counts),
            "max_chunk_size": max(char_counts),
            "unique_documents": len(set(c.document_id for c in chunks)),
            "strategy": self.strategy.value
        }
267
+
268
+
269
+ # Example usage
270
if __name__ == "__main__":
    # Sample document used to compare the chunking strategies below.
    sample_doc = """
Marketing Campaign Analysis Best Practices

Effective marketing campaign analysis requires a systematic approach to data collection and interpretation.

Data Collection
First, ensure you're tracking the right metrics. Common KPIs include impression count, click-through rates (CTR), conversion rates, and return on ad spend (ROAS). Use tracking pixels and UTM parameters to accurately attribute conversions.

Campaign Segmentation
Break down your analysis by campaign type, channel, audience segment, and time period. This granular view helps identify what's working and what isn't. For example, email campaigns might perform better with certain demographics, while social media ads resonate with others.

Performance Benchmarking
Compare your results against industry benchmarks and historical data. A 2% CTR might seem low in isolation, but could be excellent for your industry. Track performance over time to identify trends and seasonality.

Attribution Modeling
Understand the customer journey. Did they convert after the first touchpoint or after multiple interactions? Multi-touch attribution helps allocate credit appropriately across channels.

A/B Testing
Never stop testing. Test subject lines, ad copy, images, calls-to-action, and landing pages. Use statistical significance testing to ensure your results are valid.

Reporting and Insights
Create actionable reports that tell a story. Don't just show numbers—explain what they mean and what actions should be taken. Use visualizations to make data accessible.

Continuous Optimization
Marketing is iterative. Use insights from each campaign to improve the next one. Build a knowledge base of what works for your audience.
"""

    print("=" * 80)
    print("Document Chunking Demonstrations")
    print("=" * 80)

    # Test different chunking strategies
    strategies = [
        (ChunkStrategy.RECURSIVE, "Recursive (smart boundaries)"),
        (ChunkStrategy.SEMANTIC, "Semantic (paragraph-based)"),
        (ChunkStrategy.SLIDING_WINDOW, "Sliding Window (overlapping)"),
        (ChunkStrategy.FIXED, "Fixed Size")
    ]

    for strategy, description in strategies:
        print(f"\n📄 Strategy: {description}")
        print("-" * 80)

        chunker = DocumentChunker(
            strategy=strategy,
            chunk_size=300,
            chunk_overlap=50
        )

        chunks = chunker.chunk_document(
            text=sample_doc,
            document_id="campaign_analysis_guide",
            metadata={"category": "marketing", "author": "Tech Hub"}
        )

        # get_chunk_statistics returns an empty dict for an empty chunk list,
        # so the stats lookups and chunks[0] below would raise; skip instead.
        if not chunks:
            print("No chunks produced")
            continue

        stats = chunker.get_chunk_statistics(chunks)

        print(f"Total chunks: {stats['total_chunks']}")
        print(f"Avg chunk size: {stats['avg_chunk_size']:.0f} chars")
        print(f"Size range: {stats['min_chunk_size']}-{stats['max_chunk_size']} chars")

        print("\nFirst chunk preview:")
        print(f"{chunks[0].content[:200]}...")

        print(f"\nChunk IDs: {[c.chunk_id for c in chunks]}")
@@ -0,0 +1,213 @@
1
+ -- RAG Pipeline Analytics Queries
2
+ -- Track knowledge base usage, query patterns, and performance
3
+
4
+ -- ================================================================
5
+ -- 1. KNOWLEDGE BASE INVENTORY
6
+ -- ================================================================
7
+
8
+ -- Knowledge base inventory: one row per source_type with document/chunk totals, most documents first
9
+ SELECT
10
+     kbd.source_type,
11
+     COUNT(*) AS document_count,
12
+     SUM(kbd.chunk_count) AS total_chunks,
13
+     AVG(kbd.chunk_count) AS avg_chunks_per_doc,
14
+     MAX(kbd.last_updated) AS latest_update
15
+ FROM knowledge_base_documents AS kbd
16
+ GROUP BY kbd.source_type
17
+ ORDER BY document_count DESC;
18
+
19
+ -- ================================================================
20
+ -- 2. QUERY ANALYTICS
21
+ -- ================================================================
22
+
23
+ -- Top 20 query texts by frequency over the last 30 days; only texts seen more than 5 times are listed
24
+ SELECT
25
+     query_text,
26
+     COUNT(*) AS query_count,
27
+     AVG(latency_ms) AS avg_latency_ms,
28
+     AVG(relevance_score) AS avg_relevance,
29
+     COUNT(DISTINCT user_id) AS unique_users
30
+ FROM rag_query_log
31
+ WHERE query_timestamp >= CURRENT_DATE - INTERVAL '30 days'
32
+ GROUP BY query_text
33
+ HAVING COUNT(*) > 5
34
+ ORDER BY query_count DESC, query_text -- tie-break on the grouping key so LIMIT returns a stable set
35
+ LIMIT 20;
36
+
37
+ -- ================================================================
38
+ -- 3. RETRIEVAL PERFORMANCE
39
+ -- ================================================================
40
+
41
+ -- Latency distribution (mean, p50, p95, p99) and mean relevance per top_k value, last 7 days
42
+ SELECT
43
+     q.top_k,
44
+     COUNT(*) AS query_count,
45
+     AVG(q.latency_ms) AS avg_latency_ms,
46
+     PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY q.latency_ms) AS p50_latency,
47
+     PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY q.latency_ms) AS p95_latency,
48
+     PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY q.latency_ms) AS p99_latency,
49
+     AVG(q.relevance_score) AS avg_relevance_score
50
+ FROM rag_query_log AS q
51
+ WHERE q.query_timestamp >= CURRENT_DATE - INTERVAL '7 days'
52
+ GROUP BY q.top_k
53
+ ORDER BY q.top_k;
54
+
55
+ -- ================================================================
56
+ -- 4. SOURCE ATTRIBUTION
57
+ -- ================================================================
58
+
59
+ -- Source attribution: the 50 documents retrieved most often in the last 30 days, with mean relevance and last retrieval time
60
+ SELECT
61
+     d.document_id,
62
+     d.title,
63
+     d.source_type,
64
+     COUNT(*) AS retrieval_count,
65
+     AVG(r.relevance_score) AS avg_relevance,
66
+     MAX(r.query_timestamp) AS last_retrieved
67
+ FROM rag_retrievals AS r
68
+ INNER JOIN knowledge_base_documents AS d ON r.document_id = d.document_id
69
+ WHERE r.query_timestamp >= CURRENT_DATE - INTERVAL '30 days'
70
+ GROUP BY d.document_id, d.title, d.source_type
71
+ ORDER BY retrieval_count DESC, d.document_id -- tie-break so the LIMIT cut-off is deterministic
72
+ LIMIT 50;
73
+
74
+ -- ================================================================
75
+ -- 5. USER ENGAGEMENT
76
+ -- ================================================================
77
+
78
+ -- Daily user engagement over the last 90 days: distinct users, query volume, queries per user and mean relevance
79
+ SELECT
80
+     DATE_TRUNC('day', query_timestamp) AS query_date,
81
+     COUNT(DISTINCT user_id) AS unique_users,
82
+     COUNT(*) AS total_queries,
83
+     CAST(COUNT(*) AS NUMERIC) / NULLIF(COUNT(DISTINCT user_id), 0) AS queries_per_user, -- cast avoids integer truncation; NULLIF yields NULL instead of a divide-by-zero when every user_id that day is NULL
84
+     AVG(relevance_score) AS avg_relevance
85
+ FROM rag_query_log
86
+ WHERE query_timestamp >= CURRENT_DATE - INTERVAL '90 days'
87
+ GROUP BY DATE_TRUNC('day', query_timestamp)
88
+ ORDER BY query_date DESC;
89
+
90
+ -- ================================================================
91
+ -- 6. CHUNK PERFORMANCE
92
+ -- ================================================================
93
+
94
+ -- Compare chunk size ranges by mean relevance and mean rank of their retrievals (last 30 days), best relevance first
95
+ SELECT
96
+     chk.chunk_size_range,
97
+     COUNT(DISTINCT ret.query_id) AS query_count,
98
+     AVG(ret.relevance_score) AS avg_relevance,
99
+     AVG(ret.rank_position) AS avg_rank
100
+ FROM rag_retrievals AS ret
101
+ INNER JOIN knowledge_base_chunks AS chk ON ret.chunk_id = chk.chunk_id
102
+ WHERE ret.query_timestamp >= CURRENT_DATE - INTERVAL '30 days'
103
+ GROUP BY chk.chunk_size_range
104
+ ORDER BY avg_relevance DESC;
105
+
106
+ -- ================================================================
107
+ -- 7. FAILED QUERIES
108
+ -- ================================================================
109
+
110
+ -- Query texts that scored below 0.5 relevance more than twice in the last 7 days (candidates for improvement)
111
+ SELECT
112
+     query_text,
113
+     COUNT(*) AS failure_count,
114
+     AVG(relevance_score) AS avg_relevance,
115
+     MIN(relevance_score) AS min_relevance,
116
+     MAX(query_timestamp) AS last_failed
117
+ FROM rag_query_log
118
+ WHERE relevance_score < 0.5 -- row-level filter: only low-relevance executions are counted as failures
119
+     AND query_timestamp >= CURRENT_DATE - INTERVAL '7 days'
120
+ GROUP BY query_text
121
+ HAVING COUNT(*) > 2
122
+ ORDER BY failure_count DESC, query_text -- tie-break on the grouping key so LIMIT returns a stable set
123
+ LIMIT 30;
124
+
125
+ -- ================================================================
126
+ -- 8. EMBEDDING MODEL PERFORMANCE
127
+ -- ================================================================
128
+
129
+ -- Per embedding model over the last 30 days: query volume, mean embedding/retrieval latency and mean relevance
130
+ SELECT
131
+     q.embedding_model,
132
+     COUNT(*) AS query_count,
133
+     AVG(q.embedding_latency_ms) AS avg_embedding_latency,
134
+     AVG(q.retrieval_latency_ms) AS avg_retrieval_latency,
135
+     AVG(q.relevance_score) AS avg_relevance
136
+ FROM rag_query_log AS q
137
+ WHERE q.query_timestamp >= CURRENT_DATE - INTERVAL '30 days'
138
+ GROUP BY q.embedding_model
139
+ ORDER BY avg_relevance DESC;
140
+
141
+ -- ================================================================
142
+ -- 9. KNOWLEDGE GAPS
143
+ -- ================================================================
144
+
145
+ -- Knowledge gaps: query texts asked more than 3 times in the last 30 days whose mean relevance is below 0.6
146
+ WITH low_coverage_queries AS (
147
+     SELECT
148
+         q.query_text,
149
+         COUNT(*) AS frequency,
150
+         AVG(q.relevance_score) AS avg_relevance
151
+     FROM rag_query_log AS q
152
+     WHERE q.query_timestamp >= CURRENT_DATE - INTERVAL '30 days'
153
+     GROUP BY q.query_text
154
+     HAVING COUNT(*) > 3 AND AVG(q.relevance_score) < 0.6
155
+ )
156
+ SELECT
157
+     gaps.query_text,
158
+     gaps.frequency,
159
+     gaps.avg_relevance,
160
+     'Add documentation' AS recommendation
161
+ FROM low_coverage_queries AS gaps
162
+ ORDER BY gaps.frequency DESC;
163
+
164
+ -- ================================================================
165
+ -- 10. RAG PIPELINE HEALTH
166
+ -- ================================================================
167
+
168
+ -- Daily health snapshot for the last 14 days: volume, latency (mean and p95), relevance, and counts of slow / low-relevance queries
169
+ SELECT
170
+     CAST(query_timestamp AS DATE) AS date,
171
+     COUNT(*) AS total_queries,
172
+     AVG(total_latency_ms) AS avg_latency_ms,
173
+     AVG(relevance_score) AS avg_relevance,
174
+     PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY total_latency_ms) AS p95_latency,
175
+     COUNT(CASE WHEN total_latency_ms > 1000 THEN 1 ELSE NULL END) AS slow_queries,
176
+     COUNT(CASE WHEN relevance_score < 0.5 THEN 1 ELSE NULL END) AS low_relevance_queries
177
+ FROM rag_query_log
178
+ WHERE query_timestamp >= CURRENT_DATE - INTERVAL '14 days'
179
+ GROUP BY CAST(query_timestamp AS DATE)
180
+ ORDER BY date DESC;
181
+
182
+ -- ================================================================
183
+ -- 11. VECTOR DATABASE STATISTICS
184
+ -- ================================================================
185
+
186
+ -- Per-collection vector store statistics: distinct vectors, mean dimension, latest update and storage in MB (bytes / 1024^2)
187
+ SELECT
188
+     collection_name,
189
+     COUNT(DISTINCT vector_id) AS total_vectors,
190
+     AVG(vector_dimension) AS avg_dimension,
191
+     MAX(last_updated) AS last_updated,
192
+     SUM(storage_bytes) / (1024.0 * 1024.0) AS storage_mb -- decimal divisor keeps fractional MB; an integer divisor can truncate small collections to 0
193
+ FROM vector_database_collections
194
+ GROUP BY collection_name
195
+ ORDER BY total_vectors DESC;
196
+
197
+ -- ================================================================
198
+ -- 12. COST TRACKING
199
+ -- ================================================================
200
+
201
+ -- Daily token usage and estimated cost per LLM provider over the last 30 days, newest day and highest cost first
202
+ SELECT
203
+     CAST(q.query_timestamp AS DATE) AS date,
204
+     q.llm_provider,
205
+     COUNT(*) AS query_count,
206
+     SUM(q.input_tokens) AS total_input_tokens,
207
+     SUM(q.output_tokens) AS total_output_tokens,
208
+     SUM(q.estimated_cost) AS total_cost,
209
+     AVG(q.estimated_cost) AS avg_cost_per_query
210
+ FROM rag_query_log AS q
211
+ WHERE q.query_timestamp >= CURRENT_DATE - INTERVAL '30 days'
212
+ GROUP BY CAST(q.query_timestamp AS DATE), q.llm_provider
213
+ ORDER BY date DESC, total_cost DESC;