tech-hub-skills 1.5.1 → 1.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. package/.claude/LICENSE +21 -21
  2. package/.claude/README.md +291 -291
  3. package/.claude/bin/cli.js +266 -266
  4. package/.claude/bin/copilot.js +182 -182
  5. package/.claude/bin/postinstall.js +42 -42
  6. package/.claude/commands/README.md +336 -336
  7. package/.claude/commands/ai-engineer.md +104 -104
  8. package/.claude/commands/aws.md +143 -143
  9. package/.claude/commands/azure.md +149 -149
  10. package/.claude/commands/backend-developer.md +108 -108
  11. package/.claude/commands/code-review.md +399 -399
  12. package/.claude/commands/compliance-automation.md +747 -747
  13. package/.claude/commands/compliance-officer.md +108 -108
  14. package/.claude/commands/data-engineer.md +113 -113
  15. package/.claude/commands/data-governance.md +102 -102
  16. package/.claude/commands/data-scientist.md +123 -123
  17. package/.claude/commands/database-admin.md +109 -109
  18. package/.claude/commands/devops.md +160 -160
  19. package/.claude/commands/docker.md +160 -160
  20. package/.claude/commands/enterprise-dashboard.md +613 -613
  21. package/.claude/commands/finops.md +184 -184
  22. package/.claude/commands/frontend-developer.md +108 -108
  23. package/.claude/commands/gcp.md +143 -143
  24. package/.claude/commands/ml-engineer.md +115 -115
  25. package/.claude/commands/mlops.md +187 -187
  26. package/.claude/commands/network-engineer.md +109 -109
  27. package/.claude/commands/optimization-advisor.md +329 -329
  28. package/.claude/commands/orchestrator.md +623 -623
  29. package/.claude/commands/platform-engineer.md +102 -102
  30. package/.claude/commands/process-automation.md +226 -226
  31. package/.claude/commands/process-changelog.md +184 -184
  32. package/.claude/commands/process-documentation.md +484 -484
  33. package/.claude/commands/process-kanban.md +324 -324
  34. package/.claude/commands/process-versioning.md +214 -214
  35. package/.claude/commands/product-designer.md +104 -104
  36. package/.claude/commands/project-starter.md +443 -443
  37. package/.claude/commands/qa-engineer.md +109 -109
  38. package/.claude/commands/security-architect.md +135 -135
  39. package/.claude/commands/sre.md +109 -109
  40. package/.claude/commands/system-design.md +126 -126
  41. package/.claude/commands/technical-writer.md +101 -101
  42. package/.claude/package.json +46 -46
  43. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/README.md +252 -252
  44. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_ab_tester.py +356 -356
  45. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_template_manager.py +274 -274
  46. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/token_cost_estimator.py +324 -324
  47. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/README.md +448 -448
  48. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/document_chunker.py +336 -336
  49. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/rag_pipeline.sql +213 -213
  50. package/.claude/roles/ai-engineer/skills/03-agent-orchestration/README.md +599 -599
  51. package/.claude/roles/ai-engineer/skills/04-llm-guardrails/README.md +735 -735
  52. package/.claude/roles/ai-engineer/skills/05-vector-embeddings/README.md +711 -711
  53. package/.claude/roles/ai-engineer/skills/06-llm-evaluation/README.md +777 -777
  54. package/.claude/roles/azure/skills/01-infrastructure-fundamentals/README.md +264 -264
  55. package/.claude/roles/azure/skills/02-data-factory/README.md +264 -264
  56. package/.claude/roles/azure/skills/03-synapse-analytics/README.md +264 -264
  57. package/.claude/roles/azure/skills/04-databricks/README.md +264 -264
  58. package/.claude/roles/azure/skills/05-functions/README.md +264 -264
  59. package/.claude/roles/azure/skills/06-kubernetes-service/README.md +264 -264
  60. package/.claude/roles/azure/skills/07-openai-service/README.md +264 -264
  61. package/.claude/roles/azure/skills/08-machine-learning/README.md +264 -264
  62. package/.claude/roles/azure/skills/09-storage-adls/README.md +264 -264
  63. package/.claude/roles/azure/skills/10-networking/README.md +264 -264
  64. package/.claude/roles/azure/skills/11-sql-cosmos/README.md +264 -264
  65. package/.claude/roles/azure/skills/12-event-hubs/README.md +264 -264
  66. package/.claude/roles/code-review/skills/01-automated-code-review/README.md +394 -394
  67. package/.claude/roles/code-review/skills/02-pr-review-workflow/README.md +427 -427
  68. package/.claude/roles/code-review/skills/03-code-quality-gates/README.md +518 -518
  69. package/.claude/roles/code-review/skills/04-reviewer-assignment/README.md +504 -504
  70. package/.claude/roles/code-review/skills/05-review-analytics/README.md +540 -540
  71. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/README.md +550 -550
  72. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/bronze_ingestion.py +337 -337
  73. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/medallion_queries.sql +300 -300
  74. package/.claude/roles/data-engineer/skills/02-etl-pipeline/README.md +580 -580
  75. package/.claude/roles/data-engineer/skills/03-data-quality/README.md +579 -579
  76. package/.claude/roles/data-engineer/skills/04-streaming-pipelines/README.md +608 -608
  77. package/.claude/roles/data-engineer/skills/05-performance-optimization/README.md +547 -547
  78. package/.claude/roles/data-governance/skills/01-data-catalog/README.md +112 -112
  79. package/.claude/roles/data-governance/skills/02-data-lineage/README.md +129 -129
  80. package/.claude/roles/data-governance/skills/03-data-quality-framework/README.md +182 -182
  81. package/.claude/roles/data-governance/skills/04-access-control/README.md +39 -39
  82. package/.claude/roles/data-governance/skills/05-master-data-management/README.md +40 -40
  83. package/.claude/roles/data-governance/skills/06-compliance-privacy/README.md +46 -46
  84. package/.claude/roles/data-scientist/skills/01-eda-automation/README.md +230 -230
  85. package/.claude/roles/data-scientist/skills/01-eda-automation/eda_generator.py +446 -446
  86. package/.claude/roles/data-scientist/skills/02-statistical-modeling/README.md +264 -264
  87. package/.claude/roles/data-scientist/skills/03-feature-engineering/README.md +264 -264
  88. package/.claude/roles/data-scientist/skills/04-predictive-modeling/README.md +264 -264
  89. package/.claude/roles/data-scientist/skills/05-customer-analytics/README.md +264 -264
  90. package/.claude/roles/data-scientist/skills/06-campaign-analysis/README.md +264 -264
  91. package/.claude/roles/data-scientist/skills/07-experimentation/README.md +264 -264
  92. package/.claude/roles/data-scientist/skills/08-data-visualization/README.md +264 -264
  93. package/.claude/roles/devops/skills/01-cicd-pipeline/README.md +264 -264
  94. package/.claude/roles/devops/skills/02-container-orchestration/README.md +264 -264
  95. package/.claude/roles/devops/skills/03-infrastructure-as-code/README.md +264 -264
  96. package/.claude/roles/devops/skills/04-gitops/README.md +264 -264
  97. package/.claude/roles/devops/skills/05-environment-management/README.md +264 -264
  98. package/.claude/roles/devops/skills/06-automated-testing/README.md +264 -264
  99. package/.claude/roles/devops/skills/07-release-management/README.md +264 -264
  100. package/.claude/roles/devops/skills/08-monitoring-alerting/README.md +264 -264
  101. package/.claude/roles/devops/skills/09-devsecops/README.md +265 -265
  102. package/.claude/roles/finops/skills/01-cost-visibility/README.md +264 -264
  103. package/.claude/roles/finops/skills/02-resource-tagging/README.md +264 -264
  104. package/.claude/roles/finops/skills/03-budget-management/README.md +264 -264
  105. package/.claude/roles/finops/skills/04-reserved-instances/README.md +264 -264
  106. package/.claude/roles/finops/skills/05-spot-optimization/README.md +264 -264
  107. package/.claude/roles/finops/skills/06-storage-tiering/README.md +264 -264
  108. package/.claude/roles/finops/skills/07-compute-rightsizing/README.md +264 -264
  109. package/.claude/roles/finops/skills/08-chargeback/README.md +264 -264
  110. package/.claude/roles/ml-engineer/skills/01-mlops-pipeline/README.md +566 -566
  111. package/.claude/roles/ml-engineer/skills/02-feature-engineering/README.md +655 -655
  112. package/.claude/roles/ml-engineer/skills/03-model-training/README.md +704 -704
  113. package/.claude/roles/ml-engineer/skills/04-model-serving/README.md +845 -845
  114. package/.claude/roles/ml-engineer/skills/05-model-monitoring/README.md +874 -874
  115. package/.claude/roles/mlops/skills/01-ml-pipeline-orchestration/README.md +264 -264
  116. package/.claude/roles/mlops/skills/02-experiment-tracking/README.md +264 -264
  117. package/.claude/roles/mlops/skills/03-model-registry/README.md +264 -264
  118. package/.claude/roles/mlops/skills/04-feature-store/README.md +264 -264
  119. package/.claude/roles/mlops/skills/05-model-deployment/README.md +264 -264
  120. package/.claude/roles/mlops/skills/06-model-observability/README.md +264 -264
  121. package/.claude/roles/mlops/skills/07-data-versioning/README.md +264 -264
  122. package/.claude/roles/mlops/skills/08-ab-testing/README.md +264 -264
  123. package/.claude/roles/mlops/skills/09-automated-retraining/README.md +264 -264
  124. package/.claude/roles/platform-engineer/skills/01-internal-developer-platform/README.md +153 -153
  125. package/.claude/roles/platform-engineer/skills/02-self-service-infrastructure/README.md +57 -57
  126. package/.claude/roles/platform-engineer/skills/03-slo-sli-management/README.md +59 -59
  127. package/.claude/roles/platform-engineer/skills/04-developer-experience/README.md +57 -57
  128. package/.claude/roles/platform-engineer/skills/05-incident-management/README.md +73 -73
  129. package/.claude/roles/platform-engineer/skills/06-capacity-management/README.md +59 -59
  130. package/.claude/roles/product-designer/skills/01-requirements-discovery/README.md +407 -407
  131. package/.claude/roles/product-designer/skills/02-user-research/README.md +382 -382
  132. package/.claude/roles/product-designer/skills/03-brainstorming-ideation/README.md +437 -437
  133. package/.claude/roles/product-designer/skills/04-ux-design/README.md +496 -496
  134. package/.claude/roles/product-designer/skills/05-product-market-fit/README.md +376 -376
  135. package/.claude/roles/product-designer/skills/06-stakeholder-management/README.md +412 -412
  136. package/.claude/roles/security-architect/skills/01-pii-detection/README.md +319 -319
  137. package/.claude/roles/security-architect/skills/02-threat-modeling/README.md +264 -264
  138. package/.claude/roles/security-architect/skills/03-infrastructure-security/README.md +264 -264
  139. package/.claude/roles/security-architect/skills/04-iam/README.md +264 -264
  140. package/.claude/roles/security-architect/skills/05-application-security/README.md +264 -264
  141. package/.claude/roles/security-architect/skills/06-secrets-management/README.md +264 -264
  142. package/.claude/roles/security-architect/skills/07-security-monitoring/README.md +264 -264
  143. package/.claude/roles/system-design/skills/01-architecture-patterns/README.md +337 -337
  144. package/.claude/roles/system-design/skills/02-requirements-engineering/README.md +264 -264
  145. package/.claude/roles/system-design/skills/03-scalability/README.md +264 -264
  146. package/.claude/roles/system-design/skills/04-high-availability/README.md +264 -264
  147. package/.claude/roles/system-design/skills/05-cost-optimization-design/README.md +264 -264
  148. package/.claude/roles/system-design/skills/06-api-design/README.md +264 -264
  149. package/.claude/roles/system-design/skills/07-observability-architecture/README.md +264 -264
  150. package/.claude/roles/system-design/skills/08-process-automation/PROCESS_TEMPLATE.md +336 -336
  151. package/.claude/roles/system-design/skills/08-process-automation/README.md +521 -521
  152. package/.claude/roles/system-design/skills/08-process-automation/ai_prompt_generator.py +744 -744
  153. package/.claude/roles/system-design/skills/08-process-automation/automation_recommender.py +688 -688
  154. package/.claude/roles/system-design/skills/08-process-automation/plan_generator.py +679 -679
  155. package/.claude/roles/system-design/skills/08-process-automation/process_analyzer.py +528 -528
  156. package/.claude/roles/system-design/skills/08-process-automation/process_parser.py +684 -684
  157. package/.claude/roles/system-design/skills/08-process-automation/role_matcher.py +615 -615
  158. package/.claude/skills/README.md +336 -336
  159. package/.claude/skills/ai-engineer.md +104 -104
  160. package/.claude/skills/aws.md +143 -143
  161. package/.claude/skills/azure.md +149 -149
  162. package/.claude/skills/backend-developer.md +108 -108
  163. package/.claude/skills/code-review.md +399 -399
  164. package/.claude/skills/compliance-automation.md +747 -747
  165. package/.claude/skills/compliance-officer.md +108 -108
  166. package/.claude/skills/data-engineer.md +113 -113
  167. package/.claude/skills/data-governance.md +102 -102
  168. package/.claude/skills/data-scientist.md +123 -123
  169. package/.claude/skills/database-admin.md +109 -109
  170. package/.claude/skills/devops.md +160 -160
  171. package/.claude/skills/docker.md +160 -160
  172. package/.claude/skills/enterprise-dashboard.md +613 -613
  173. package/.claude/skills/finops.md +184 -184
  174. package/.claude/skills/frontend-developer.md +108 -108
  175. package/.claude/skills/gcp.md +143 -143
  176. package/.claude/skills/ml-engineer.md +115 -115
  177. package/.claude/skills/mlops.md +187 -187
  178. package/.claude/skills/network-engineer.md +109 -109
  179. package/.claude/skills/optimization-advisor.md +329 -329
  180. package/.claude/skills/orchestrator.md +623 -623
  181. package/.claude/skills/platform-engineer.md +102 -102
  182. package/.claude/skills/process-automation.md +226 -226
  183. package/.claude/skills/process-changelog.md +184 -184
  184. package/.claude/skills/process-documentation.md +484 -484
  185. package/.claude/skills/process-kanban.md +324 -324
  186. package/.claude/skills/process-versioning.md +214 -214
  187. package/.claude/skills/product-designer.md +104 -104
  188. package/.claude/skills/project-starter.md +443 -443
  189. package/.claude/skills/qa-engineer.md +109 -109
  190. package/.claude/skills/security-architect.md +135 -135
  191. package/.claude/skills/sre.md +109 -109
  192. package/.claude/skills/system-design.md +126 -126
  193. package/.claude/skills/technical-writer.md +101 -101
  194. package/.gitattributes +2 -2
  195. package/GITHUB_COPILOT.md +106 -106
  196. package/README.md +192 -184
  197. package/package.json +16 -8
@@ -1,336 +1,336 @@
1
- """
2
- Advanced Document Chunking for RAG Systems
3
- Supports semantic, recursive, and fixed-size chunking strategies.
4
- """
5
-
6
- from typing import List, Dict, Any, Optional
7
- from dataclasses import dataclass
8
- from enum import Enum
9
- import re
10
- from langchain.text_splitter import (
11
- RecursiveCharacterTextSplitter,
12
- CharacterTextSplitter,
13
- TokenTextSplitter
14
- )
15
-
16
-
17
- class ChunkStrategy(Enum):
18
- """Available chunking strategies."""
19
- FIXED = "fixed" # Fixed character/token size
20
- SEMANTIC = "semantic" # Semantic boundaries (paragraphs, sentences)
21
- RECURSIVE = "recursive" # Recursive splitting with multiple separators
22
- SLIDING_WINDOW = "sliding_window" # Overlapping windows
23
-
24
-
25
- @dataclass
26
- class Chunk:
27
- """A document chunk with metadata."""
28
- content: str
29
- chunk_id: str
30
- document_id: str
31
- chunk_index: int
32
- metadata: Dict[str, Any]
33
- char_count: int
34
- token_count: Optional[int] = None
35
-
36
- def __post_init__(self):
37
- if self.char_count == 0:
38
- self.char_count = len(self.content)
39
-
40
-
41
- class DocumentChunker:
42
- """Advanced document chunker with multiple strategies."""
43
-
44
- def __init__(
45
- self,
46
- strategy: ChunkStrategy = ChunkStrategy.RECURSIVE,
47
- chunk_size: int = 1000,
48
- chunk_overlap: int = 200,
49
- separators: Optional[List[str]] = None
50
- ):
51
- """
52
- Initialize document chunker.
53
-
54
- Args:
55
- strategy: Chunking strategy to use
56
- chunk_size: Target chunk size (characters or tokens)
57
- chunk_overlap: Overlap between chunks
58
- separators: Custom separators for recursive splitting
59
- """
60
- self.strategy = strategy
61
- self.chunk_size = chunk_size
62
- self.chunk_overlap = chunk_overlap
63
- self.separators = separators or ["\n\n", "\n", ". ", " ", ""]
64
-
65
- self._init_splitter()
66
-
67
- def _init_splitter(self):
68
- """Initialize the appropriate text splitter."""
69
- if self.strategy == ChunkStrategy.RECURSIVE:
70
- self.splitter = RecursiveCharacterTextSplitter(
71
- chunk_size=self.chunk_size,
72
- chunk_overlap=self.chunk_overlap,
73
- separators=self.separators,
74
- length_function=len
75
- )
76
- elif self.strategy == ChunkStrategy.FIXED:
77
- self.splitter = CharacterTextSplitter(
78
- chunk_size=self.chunk_size,
79
- chunk_overlap=self.chunk_overlap,
80
- separator="\n"
81
- )
82
- elif self.strategy == ChunkStrategy.SEMANTIC:
83
- # For semantic chunking, we'll use custom logic
84
- self.splitter = None
85
- else:
86
- self.splitter = RecursiveCharacterTextSplitter(
87
- chunk_size=self.chunk_size,
88
- chunk_overlap=self.chunk_overlap
89
- )
90
-
91
- def chunk_document(
92
- self,
93
- text: str,
94
- document_id: str,
95
- metadata: Optional[Dict[str, Any]] = None
96
- ) -> List[Chunk]:
97
- """
98
- Chunk a document into smaller pieces.
99
-
100
- Args:
101
- text: Document text
102
- document_id: Unique document identifier
103
- metadata: Additional metadata
104
-
105
- Returns:
106
- List of Chunk objects
107
- """
108
- metadata = metadata or {}
109
-
110
- if self.strategy == ChunkStrategy.SEMANTIC:
111
- text_chunks = self._semantic_chunking(text)
112
- elif self.strategy == ChunkStrategy.SLIDING_WINDOW:
113
- text_chunks = self._sliding_window_chunking(text)
114
- else:
115
- text_chunks = self.splitter.split_text(text)
116
-
117
- chunks = []
118
- for idx, chunk_text in enumerate(text_chunks):
119
- chunk = Chunk(
120
- content=chunk_text,
121
- chunk_id=f"{document_id}_chunk_{idx}",
122
- document_id=document_id,
123
- chunk_index=idx,
124
- metadata={**metadata, "strategy": self.strategy.value},
125
- char_count=len(chunk_text)
126
- )
127
- chunks.append(chunk)
128
-
129
- return chunks
130
-
131
- def _semantic_chunking(self, text: str) -> List[str]:
132
- """
133
- Chunk by semantic boundaries (paragraphs with context).
134
-
135
- This strategy:
136
- 1. Splits on paragraph boundaries
137
- 2. Combines small paragraphs
138
- 3. Ensures chunks don't exceed max size
139
- """
140
- # Split into paragraphs
141
- paragraphs = re.split(r'\n\s*\n', text)
142
-
143
- chunks = []
144
- current_chunk = []
145
- current_length = 0
146
-
147
- for para in paragraphs:
148
- para = para.strip()
149
- if not para:
150
- continue
151
-
152
- para_length = len(para)
153
-
154
- # If paragraph alone exceeds chunk size, split it
155
- if para_length > self.chunk_size:
156
- # Save current chunk if exists
157
- if current_chunk:
158
- chunks.append("\n\n".join(current_chunk))
159
- current_chunk = []
160
- current_length = 0
161
-
162
- # Split large paragraph
163
- sentences = re.split(r'(?<=[.!?])\s+', para)
164
- temp_chunk = []
165
- temp_length = 0
166
-
167
- for sentence in sentences:
168
- sent_length = len(sentence)
169
- if temp_length + sent_length > self.chunk_size:
170
- if temp_chunk:
171
- chunks.append(" ".join(temp_chunk))
172
- temp_chunk = [sentence]
173
- temp_length = sent_length
174
- else:
175
- temp_chunk.append(sentence)
176
- temp_length += sent_length + 1
177
-
178
- if temp_chunk:
179
- chunks.append(" ".join(temp_chunk))
180
-
181
- # If adding paragraph exceeds chunk size, save current chunk
182
- elif current_length + para_length > self.chunk_size:
183
- if current_chunk:
184
- chunks.append("\n\n".join(current_chunk))
185
- current_chunk = [para]
186
- current_length = para_length
187
-
188
- # Otherwise, add to current chunk
189
- else:
190
- current_chunk.append(para)
191
- current_length += para_length + 2 # +2 for \n\n
192
-
193
- # Add remaining chunk
194
- if current_chunk:
195
- chunks.append("\n\n".join(current_chunk))
196
-
197
- return chunks
198
-
199
- def _sliding_window_chunking(self, text: str) -> List[str]:
200
- """
201
- Create overlapping chunks with sliding window.
202
-
203
- Useful for ensuring important content at chunk boundaries isn't lost.
204
- """
205
- chunks = []
206
- start = 0
207
-
208
- while start < len(text):
209
- end = start + self.chunk_size
210
- chunk = text[start:end]
211
-
212
- # Try to end at sentence boundary
213
- if end < len(text):
214
- last_period = chunk.rfind('. ')
215
- if last_period > self.chunk_size // 2:
216
- chunk = chunk[:last_period + 1]
217
- end = start + last_period + 1
218
-
219
- chunks.append(chunk.strip())
220
-
221
- # Move start forward (with overlap)
222
- start = end - self.chunk_overlap
223
-
224
- return chunks
225
-
226
- def chunk_multiple_documents(
227
- self,
228
- documents: List[Dict[str, Any]]
229
- ) -> List[Chunk]:
230
- """
231
- Chunk multiple documents.
232
-
233
- Args:
234
- documents: List of dicts with 'id', 'text', and optional 'metadata'
235
-
236
- Returns:
237
- List of all chunks
238
- """
239
- all_chunks = []
240
-
241
- for doc in documents:
242
- chunks = self.chunk_document(
243
- text=doc['text'],
244
- document_id=doc['id'],
245
- metadata=doc.get('metadata', {})
246
- )
247
- all_chunks.extend(chunks)
248
-
249
- return all_chunks
250
-
251
- def get_chunk_statistics(self, chunks: List[Chunk]) -> Dict[str, Any]:
252
- """Get statistics about chunks."""
253
- if not chunks:
254
- return {}
255
-
256
- char_counts = [c.char_count for c in chunks]
257
-
258
- return {
259
- "total_chunks": len(chunks),
260
- "total_characters": sum(char_counts),
261
- "avg_chunk_size": sum(char_counts) / len(chunks),
262
- "min_chunk_size": min(char_counts),
263
- "max_chunk_size": max(char_counts),
264
- "unique_documents": len(set(c.document_id for c in chunks)),
265
- "strategy": self.strategy.value
266
- }
267
-
268
-
269
- # Example usage
270
- if __name__ == "__main__":
271
- # Sample document
272
- sample_doc = """
273
- Marketing Campaign Analysis Best Practices
274
-
275
- Effective marketing campaign analysis requires a systematic approach to data collection and interpretation.
276
-
277
- Data Collection
278
- First, ensure you're tracking the right metrics. Common KPIs include impression count, click-through rates (CTR), conversion rates, and return on ad spend (ROAS). Use tracking pixels and UTM parameters to accurately attribute conversions.
279
-
280
- Campaign Segmentation
281
- Break down your analysis by campaign type, channel, audience segment, and time period. This granular view helps identify what's working and what isn't. For example, email campaigns might perform better with certain demographics, while social media ads resonate with others.
282
-
283
- Performance Benchmarking
284
- Compare your results against industry benchmarks and historical data. A 2% CTR might seem low in isolation, but could be excellent for your industry. Track performance over time to identify trends and seasonality.
285
-
286
- Attribution Modeling
287
- Understand the customer journey. Did they convert after the first touchpoint or after multiple interactions? Multi-touch attribution helps allocate credit appropriately across channels.
288
-
289
- A/B Testing
290
- Never stop testing. Test subject lines, ad copy, images, calls-to-action, and landing pages. Use statistical significance testing to ensure your results are valid.
291
-
292
- Reporting and Insights
293
- Create actionable reports that tell a story. Don't just show numbers—explain what they mean and what actions should be taken. Use visualizations to make data accessible.
294
-
295
- Continuous Optimization
296
- Marketing is iterative. Use insights from each campaign to improve the next one. Build a knowledge base of what works for your audience.
297
- """
298
-
299
- print("=" * 80)
300
- print("Document Chunking Demonstrations")
301
- print("=" * 80)
302
-
303
- # Test different chunking strategies
304
- strategies = [
305
- (ChunkStrategy.RECURSIVE, "Recursive (smart boundaries)"),
306
- (ChunkStrategy.SEMANTIC, "Semantic (paragraph-based)"),
307
- (ChunkStrategy.SLIDING_WINDOW, "Sliding Window (overlapping)"),
308
- (ChunkStrategy.FIXED, "Fixed Size")
309
- ]
310
-
311
- for strategy, description in strategies:
312
- print(f"\n📄 Strategy: {description}")
313
- print("-" * 80)
314
-
315
- chunker = DocumentChunker(
316
- strategy=strategy,
317
- chunk_size=300,
318
- chunk_overlap=50
319
- )
320
-
321
- chunks = chunker.chunk_document(
322
- text=sample_doc,
323
- document_id="campaign_analysis_guide",
324
- metadata={"category": "marketing", "author": "Tech Hub"}
325
- )
326
-
327
- stats = chunker.get_chunk_statistics(chunks)
328
-
329
- print(f"Total chunks: {stats['total_chunks']}")
330
- print(f"Avg chunk size: {stats['avg_chunk_size']:.0f} chars")
331
- print(f"Size range: {stats['min_chunk_size']}-{stats['max_chunk_size']} chars")
332
-
333
- print(f"\nFirst chunk preview:")
334
- print(f"{chunks[0].content[:200]}...")
335
-
336
- print(f"\nChunk IDs: {[c.chunk_id for c in chunks]}")
1
+ """
2
+ Advanced Document Chunking for RAG Systems
3
+ Supports semantic, recursive, and fixed-size chunking strategies.
4
+ """
5
+
6
+ from typing import List, Dict, Any, Optional
7
+ from dataclasses import dataclass
8
+ from enum import Enum
9
+ import re
10
+ from langchain.text_splitter import (
11
+ RecursiveCharacterTextSplitter,
12
+ CharacterTextSplitter,
13
+ TokenTextSplitter
14
+ )
15
+
16
+
17
+ class ChunkStrategy(Enum):
18
+ """Available chunking strategies."""
19
+ FIXED = "fixed" # Fixed character/token size
20
+ SEMANTIC = "semantic" # Semantic boundaries (paragraphs, sentences)
21
+ RECURSIVE = "recursive" # Recursive splitting with multiple separators
22
+ SLIDING_WINDOW = "sliding_window" # Overlapping windows
23
+
24
+
25
+ @dataclass
26
+ class Chunk:
27
+ """A document chunk with metadata."""
28
+ content: str
29
+ chunk_id: str
30
+ document_id: str
31
+ chunk_index: int
32
+ metadata: Dict[str, Any]
33
+ char_count: int
34
+ token_count: Optional[int] = None
35
+
36
+ def __post_init__(self):
37
+ if self.char_count == 0:
38
+ self.char_count = len(self.content)
39
+
40
+
41
+ class DocumentChunker:
42
+ """Advanced document chunker with multiple strategies."""
43
+
44
+ def __init__(
45
+ self,
46
+ strategy: ChunkStrategy = ChunkStrategy.RECURSIVE,
47
+ chunk_size: int = 1000,
48
+ chunk_overlap: int = 200,
49
+ separators: Optional[List[str]] = None
50
+ ):
51
+ """
52
+ Initialize document chunker.
53
+
54
+ Args:
55
+ strategy: Chunking strategy to use
56
+ chunk_size: Target chunk size (characters or tokens)
57
+ chunk_overlap: Overlap between chunks
58
+ separators: Custom separators for recursive splitting
59
+ """
60
+ self.strategy = strategy
61
+ self.chunk_size = chunk_size
62
+ self.chunk_overlap = chunk_overlap
63
+ self.separators = separators or ["\n\n", "\n", ". ", " ", ""]
64
+
65
+ self._init_splitter()
66
+
67
+ def _init_splitter(self):
68
+ """Initialize the appropriate text splitter."""
69
+ if self.strategy == ChunkStrategy.RECURSIVE:
70
+ self.splitter = RecursiveCharacterTextSplitter(
71
+ chunk_size=self.chunk_size,
72
+ chunk_overlap=self.chunk_overlap,
73
+ separators=self.separators,
74
+ length_function=len
75
+ )
76
+ elif self.strategy == ChunkStrategy.FIXED:
77
+ self.splitter = CharacterTextSplitter(
78
+ chunk_size=self.chunk_size,
79
+ chunk_overlap=self.chunk_overlap,
80
+ separator="\n"
81
+ )
82
+ elif self.strategy == ChunkStrategy.SEMANTIC:
83
+ # For semantic chunking, we'll use custom logic
84
+ self.splitter = None
85
+ else:
86
+ self.splitter = RecursiveCharacterTextSplitter(
87
+ chunk_size=self.chunk_size,
88
+ chunk_overlap=self.chunk_overlap
89
+ )
90
+
91
+ def chunk_document(
92
+ self,
93
+ text: str,
94
+ document_id: str,
95
+ metadata: Optional[Dict[str, Any]] = None
96
+ ) -> List[Chunk]:
97
+ """
98
+ Chunk a document into smaller pieces.
99
+
100
+ Args:
101
+ text: Document text
102
+ document_id: Unique document identifier
103
+ metadata: Additional metadata
104
+
105
+ Returns:
106
+ List of Chunk objects
107
+ """
108
+ metadata = metadata or {}
109
+
110
+ if self.strategy == ChunkStrategy.SEMANTIC:
111
+ text_chunks = self._semantic_chunking(text)
112
+ elif self.strategy == ChunkStrategy.SLIDING_WINDOW:
113
+ text_chunks = self._sliding_window_chunking(text)
114
+ else:
115
+ text_chunks = self.splitter.split_text(text)
116
+
117
+ chunks = []
118
+ for idx, chunk_text in enumerate(text_chunks):
119
+ chunk = Chunk(
120
+ content=chunk_text,
121
+ chunk_id=f"{document_id}_chunk_{idx}",
122
+ document_id=document_id,
123
+ chunk_index=idx,
124
+ metadata={**metadata, "strategy": self.strategy.value},
125
+ char_count=len(chunk_text)
126
+ )
127
+ chunks.append(chunk)
128
+
129
+ return chunks
130
+
131
+ def _semantic_chunking(self, text: str) -> List[str]:
132
+ """
133
+ Chunk by semantic boundaries (paragraphs with context).
134
+
135
+ This strategy:
136
+ 1. Splits on paragraph boundaries
137
+ 2. Combines small paragraphs
138
+ 3. Ensures chunks don't exceed max size
139
+ """
140
+ # Split into paragraphs
141
+ paragraphs = re.split(r'\n\s*\n', text)
142
+
143
+ chunks = []
144
+ current_chunk = []
145
+ current_length = 0
146
+
147
+ for para in paragraphs:
148
+ para = para.strip()
149
+ if not para:
150
+ continue
151
+
152
+ para_length = len(para)
153
+
154
+ # If paragraph alone exceeds chunk size, split it
155
+ if para_length > self.chunk_size:
156
+ # Save current chunk if exists
157
+ if current_chunk:
158
+ chunks.append("\n\n".join(current_chunk))
159
+ current_chunk = []
160
+ current_length = 0
161
+
162
+ # Split large paragraph
163
+ sentences = re.split(r'(?<=[.!?])\s+', para)
164
+ temp_chunk = []
165
+ temp_length = 0
166
+
167
+ for sentence in sentences:
168
+ sent_length = len(sentence)
169
+ if temp_length + sent_length > self.chunk_size:
170
+ if temp_chunk:
171
+ chunks.append(" ".join(temp_chunk))
172
+ temp_chunk = [sentence]
173
+ temp_length = sent_length
174
+ else:
175
+ temp_chunk.append(sentence)
176
+ temp_length += sent_length + 1
177
+
178
+ if temp_chunk:
179
+ chunks.append(" ".join(temp_chunk))
180
+
181
+ # If adding paragraph exceeds chunk size, save current chunk
182
+ elif current_length + para_length > self.chunk_size:
183
+ if current_chunk:
184
+ chunks.append("\n\n".join(current_chunk))
185
+ current_chunk = [para]
186
+ current_length = para_length
187
+
188
+ # Otherwise, add to current chunk
189
+ else:
190
+ current_chunk.append(para)
191
+ current_length += para_length + 2 # +2 for \n\n
192
+
193
+ # Add remaining chunk
194
+ if current_chunk:
195
+ chunks.append("\n\n".join(current_chunk))
196
+
197
+ return chunks
198
+
199
+ def _sliding_window_chunking(self, text: str) -> List[str]:
200
+ """
201
+ Create overlapping chunks with sliding window.
202
+
203
+ Useful for ensuring important content at chunk boundaries isn't lost.
204
+ """
205
+ chunks = []
206
+ start = 0
207
+
208
+ while start < len(text):
209
+ end = start + self.chunk_size
210
+ chunk = text[start:end]
211
+
212
+ # Try to end at sentence boundary
213
+ if end < len(text):
214
+ last_period = chunk.rfind('. ')
215
+ if last_period > self.chunk_size // 2:
216
+ chunk = chunk[:last_period + 1]
217
+ end = start + last_period + 1
218
+
219
+ chunks.append(chunk.strip())
220
+
221
+ # Move start forward (with overlap)
222
+ start = end - self.chunk_overlap
223
+
224
+ return chunks
225
+
226
def chunk_multiple_documents(
    self,
    documents: List[Dict[str, Any]]
) -> List[Chunk]:
    """
    Chunk every document in a batch and flatten the results.

    Each document is passed to ``self.chunk_document``; the chunks it
    returns are concatenated in input order.

    Args:
        documents: List of dicts with 'id', 'text', and optional 'metadata'
            (an empty dict is passed when 'metadata' is absent)

    Returns:
        List of all chunks
    """
    return [
        piece
        for entry in documents
        for piece in self.chunk_document(
            text=entry['text'],
            document_id=entry['id'],
            metadata=entry.get('metadata', {}),
        )
    ]
250
+
251
def get_chunk_statistics(self, chunks: List[Chunk]) -> Dict[str, Any]:
    """
    Summarize a list of chunks.

    Args:
        chunks: Chunks to summarize; each is read for ``char_count`` and
            ``document_id``.

    Returns:
        An empty dict when ``chunks`` is empty; otherwise a dict with the
        chunk count, total/average/min/max character counts, the number of
        distinct document ids, and the value of ``self.strategy``.
    """
    if not chunks:
        return {}

    sizes = [chunk.char_count for chunk in chunks]
    chunk_total = len(chunks)
    char_total = sum(sizes)
    document_ids = {chunk.document_id for chunk in chunks}

    summary = {
        "total_chunks": chunk_total,
        "total_characters": char_total,
        "avg_chunk_size": char_total / chunk_total,
        "min_chunk_size": min(sizes),
        "max_chunk_size": max(sizes),
        "unique_documents": len(document_ids),
        "strategy": self.strategy.value,
    }
    return summary
267
+
268
+
269
# Example usage: run every chunking strategy over one sample document and
# print a short report for each.
if __name__ == "__main__":
    # Sample document
    sample_doc = """
    Marketing Campaign Analysis Best Practices

    Effective marketing campaign analysis requires a systematic approach to data collection and interpretation.

    Data Collection
    First, ensure you're tracking the right metrics. Common KPIs include impression count, click-through rates (CTR), conversion rates, and return on ad spend (ROAS). Use tracking pixels and UTM parameters to accurately attribute conversions.

    Campaign Segmentation
    Break down your analysis by campaign type, channel, audience segment, and time period. This granular view helps identify what's working and what isn't. For example, email campaigns might perform better with certain demographics, while social media ads resonate with others.

    Performance Benchmarking
    Compare your results against industry benchmarks and historical data. A 2% CTR might seem low in isolation, but could be excellent for your industry. Track performance over time to identify trends and seasonality.

    Attribution Modeling
    Understand the customer journey. Did they convert after the first touchpoint or after multiple interactions? Multi-touch attribution helps allocate credit appropriately across channels.

    A/B Testing
    Never stop testing. Test subject lines, ad copy, images, calls-to-action, and landing pages. Use statistical significance testing to ensure your results are valid.

    Reporting and Insights
    Create actionable reports that tell a story. Don't just show numbers—explain what they mean and what actions should be taken. Use visualizations to make data accessible.

    Continuous Optimization
    Marketing is iterative. Use insights from each campaign to improve the next one. Build a knowledge base of what works for your audience.
    """

    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print(heavy_rule)
    print("Document Chunking Demonstrations")
    print(heavy_rule)

    # Each strategy is paired with the label printed in its report header
    demo_strategies = (
        (ChunkStrategy.RECURSIVE, "Recursive (smart boundaries)"),
        (ChunkStrategy.SEMANTIC, "Semantic (paragraph-based)"),
        (ChunkStrategy.SLIDING_WINDOW, "Sliding Window (overlapping)"),
        (ChunkStrategy.FIXED, "Fixed Size"),
    )

    for strategy, label in demo_strategies:
        print(f"\n📄 Strategy: {label}")
        print(light_rule)

        demo_chunker = DocumentChunker(
            strategy=strategy,
            chunk_size=300,
            chunk_overlap=50
        )
        demo_chunks = demo_chunker.chunk_document(
            text=sample_doc,
            document_id="campaign_analysis_guide",
            metadata={"category": "marketing", "author": "Tech Hub"}
        )
        report = demo_chunker.get_chunk_statistics(demo_chunks)

        print(f"Total chunks: {report['total_chunks']}")
        print(f"Avg chunk size: {report['avg_chunk_size']:.0f} chars")
        print(f"Size range: {report['min_chunk_size']}-{report['max_chunk_size']} chars")

        # Show only the first 200 characters of the first chunk
        preview = demo_chunks[0].content[:200]
        print("\nFirst chunk preview:")
        print(f"{preview}...")

        chunk_ids = [c.chunk_id for c in demo_chunks]
        print(f"\nChunk IDs: {chunk_ids}")