@sylix/coworker 2.0.11 → 2.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169) hide show
  1. package/dist/commands/slash/config.d.ts.map +1 -1
  2. package/dist/commands/slash/config.js +22 -4
  3. package/dist/commands/slash/config.js.map +1 -1
  4. package/dist/core/CoWorkerAgent.d.ts.map +1 -1
  5. package/dist/core/CoWorkerAgent.js +6 -3
  6. package/dist/core/CoWorkerAgent.js.map +1 -1
  7. package/dist/skills/defaults/accessibility/screen-reader-testing.md +545 -0
  8. package/dist/skills/defaults/accessibility/wcag-audit-patterns.md +555 -0
  9. package/dist/skills/defaults/ai-ml/rag.md +276 -0
  10. package/dist/skills/defaults/backend-development/api-design-principles.md +528 -0
  11. package/dist/skills/defaults/backend-development/api-design.md +285 -0
  12. package/dist/skills/defaults/backend-development/architecture-patterns.md +494 -0
  13. package/dist/skills/defaults/backend-development/async-python.md +237 -0
  14. package/dist/skills/defaults/backend-development/auth-implementation-patterns.md +638 -0
  15. package/dist/skills/defaults/backend-development/bazel-build-optimization.md +387 -0
  16. package/dist/skills/defaults/backend-development/billing-automation/SKILL.md +566 -0
  17. package/dist/skills/defaults/backend-development/code-review-excellence.md +538 -0
  18. package/dist/skills/defaults/backend-development/cqrs-implementation.md +554 -0
  19. package/dist/skills/defaults/backend-development/database-design.md +305 -0
  20. package/dist/skills/defaults/backend-development/debugging-strategies.md +536 -0
  21. package/dist/skills/defaults/backend-development/e2e-testing-patterns.md +544 -0
  22. package/dist/skills/defaults/backend-development/error-handling-patterns.md +641 -0
  23. package/dist/skills/defaults/backend-development/fastapi-templates.md +559 -0
  24. package/dist/skills/defaults/backend-development/fastapi.md +309 -0
  25. package/dist/skills/defaults/backend-development/git-advanced-workflows.md +405 -0
  26. package/dist/skills/defaults/backend-development/microservices-patterns.md +595 -0
  27. package/dist/skills/defaults/backend-development/microservices.md +284 -0
  28. package/dist/skills/defaults/backend-development/monorepo-management.md +623 -0
  29. package/dist/skills/defaults/backend-development/nodejs-backend-patterns.md +1048 -0
  30. package/dist/skills/defaults/backend-development/nx-workspace-patterns.md +457 -0
  31. package/dist/skills/defaults/backend-development/paypal-integration/SKILL.md +478 -0
  32. package/dist/skills/defaults/backend-development/pci-compliance/SKILL.md +480 -0
  33. package/dist/skills/defaults/backend-development/python-anti-patterns.md +349 -0
  34. package/dist/skills/defaults/backend-development/python-background-jobs.md +364 -0
  35. package/dist/skills/defaults/backend-development/python-code-style.md +360 -0
  36. package/dist/skills/defaults/backend-development/python-configuration.md +368 -0
  37. package/dist/skills/defaults/backend-development/python-design-patterns.md +296 -0
  38. package/dist/skills/defaults/backend-development/python-error-handling.md +323 -0
  39. package/dist/skills/defaults/backend-development/python-packaging.md +887 -0
  40. package/dist/skills/defaults/backend-development/python-performance-optimization.md +874 -0
  41. package/dist/skills/defaults/backend-development/python-project-structure.md +252 -0
  42. package/dist/skills/defaults/backend-development/python-resilience.md +376 -0
  43. package/dist/skills/defaults/backend-development/python-resource-management.md +421 -0
  44. package/dist/skills/defaults/backend-development/python-type-safety.md +428 -0
  45. package/dist/skills/defaults/backend-development/sql-optimization-patterns.md +509 -0
  46. package/dist/skills/defaults/backend-development/stripe-integration/SKILL.md +522 -0
  47. package/dist/skills/defaults/backend-development/turborepo-caching.md +376 -0
  48. package/dist/skills/defaults/blockchain/defi-protocol-templates.md +430 -0
  49. package/dist/skills/defaults/blockchain/nft-standards.md +364 -0
  50. package/dist/skills/defaults/blockchain/solidity-security.md +514 -0
  51. package/dist/skills/defaults/blockchain/web3-testing.md +360 -0
  52. package/dist/skills/defaults/business/competitive-landscape/SKILL.md +527 -0
  53. package/dist/skills/defaults/business/market-sizing-analysis/SKILL.md +451 -0
  54. package/dist/skills/defaults/business/startup-financial-modeling/SKILL.md +494 -0
  55. package/dist/skills/defaults/business/startup-metrics-framework/SKILL.md +564 -0
  56. package/dist/skills/defaults/business/team-composition-analysis.md +437 -0
  57. package/dist/skills/defaults/compliance/employment-contract-templates/SKILL.md +527 -0
  58. package/dist/skills/defaults/compliance/gdpr-data-handling/SKILL.md +630 -0
  59. package/dist/skills/defaults/data-engineering/airflow-dag-patterns.md +436 -0
  60. package/dist/skills/defaults/data-engineering/airflow.md +519 -0
  61. package/dist/skills/defaults/data-engineering/data-quality.md +583 -0
  62. package/dist/skills/defaults/data-engineering/dbt-transformation-patterns.md +482 -0
  63. package/dist/skills/defaults/data-engineering/dbt.md +556 -0
  64. package/dist/skills/defaults/data-engineering/ml-pipeline-workflow/SKILL.md +247 -0
  65. package/dist/skills/defaults/data-engineering/spark-optimization.md +348 -0
  66. package/dist/skills/defaults/data-engineering/spark.md +411 -0
  67. package/dist/skills/defaults/database/postgresql.md +202 -0
  68. package/dist/skills/defaults/debugging/systematic-debugging.md +249 -0
  69. package/dist/skills/defaults/devops/architecture-decision-records.md +448 -0
  70. package/dist/skills/defaults/devops/changelog-automation.md +580 -0
  71. package/dist/skills/defaults/devops/cicd.md +314 -0
  72. package/dist/skills/defaults/devops/cloud.md +263 -0
  73. package/dist/skills/defaults/devops/code-review-excellence.md +299 -0
  74. package/dist/skills/defaults/devops/cost-optimization.md +295 -0
  75. package/dist/skills/defaults/devops/deployment-pipeline-design.md +356 -0
  76. package/dist/skills/defaults/devops/docker.md +281 -0
  77. package/dist/skills/defaults/devops/git-workflows.md +205 -0
  78. package/dist/skills/defaults/devops/github-actions.md +311 -0
  79. package/dist/skills/defaults/devops/gitlab-ci-patterns.md +266 -0
  80. package/dist/skills/defaults/devops/hybrid-cloud-networking.md +241 -0
  81. package/dist/skills/defaults/devops/istio-traffic-management.md +327 -0
  82. package/dist/skills/defaults/devops/kubernetes.md +339 -0
  83. package/dist/skills/defaults/devops/linkerd-patterns.md +311 -0
  84. package/dist/skills/defaults/devops/multi-cloud-architecture.md +181 -0
  85. package/dist/skills/defaults/devops/observability.md +243 -0
  86. package/dist/skills/defaults/devops/openapi-spec-generation.md +1024 -0
  87. package/dist/skills/defaults/devops/postmortem-writing.md +396 -0
  88. package/dist/skills/defaults/devops/prometheus-configuration.md +265 -0
  89. package/dist/skills/defaults/devops/secrets-management.md +341 -0
  90. package/dist/skills/defaults/devops/service-mesh-observability.md +385 -0
  91. package/dist/skills/defaults/devops/terraform-module-library.md +244 -0
  92. package/dist/skills/defaults/finance/backtesting-frameworks/SKILL.md +663 -0
  93. package/dist/skills/defaults/finance/risk-metrics-calculation/SKILL.md +557 -0
  94. package/dist/skills/defaults/frontend/accessibility-compliance.md +420 -0
  95. package/dist/skills/defaults/frontend/design-system-patterns.md +337 -0
  96. package/dist/skills/defaults/frontend/interaction-design.md +327 -0
  97. package/dist/skills/defaults/frontend/javascript.md +311 -0
  98. package/dist/skills/defaults/frontend/modern-javascript-patterns.md +927 -0
  99. package/dist/skills/defaults/frontend/react-native-design.md +440 -0
  100. package/dist/skills/defaults/frontend/react.md +345 -0
  101. package/dist/skills/defaults/frontend/responsive-design.md +472 -0
  102. package/dist/skills/defaults/frontend/tailwind-design-system.md +337 -0
  103. package/dist/skills/defaults/frontend/typescript-advanced-types.md +724 -0
  104. package/dist/skills/defaults/frontend/typescript.md +334 -0
  105. package/dist/skills/defaults/frontend/visual-design-foundations.md +326 -0
  106. package/dist/skills/defaults/frontend/web-component-design.md +279 -0
  107. package/dist/skills/defaults/game-development/godot-gdscript-patterns.md +188 -0
  108. package/dist/skills/defaults/game-development/unity-ecs-patterns.md +594 -0
  109. package/dist/skills/defaults/kubernetes/gitops-workflow.md +285 -0
  110. package/dist/skills/defaults/kubernetes/gitops.md +280 -0
  111. package/dist/skills/defaults/kubernetes/helm-chart-scaffolding.md +553 -0
  112. package/dist/skills/defaults/kubernetes/helm.md +343 -0
  113. package/dist/skills/defaults/kubernetes/k8s-manifest-generator.md +501 -0
  114. package/dist/skills/defaults/kubernetes/k8s-security-policies.md +342 -0
  115. package/dist/skills/defaults/kubernetes/manifests.md +330 -0
  116. package/dist/skills/defaults/kubernetes/security.md +337 -0
  117. package/dist/skills/defaults/llm-application/embedding-strategies.md +608 -0
  118. package/dist/skills/defaults/llm-application/hybrid-search-implementation.md +570 -0
  119. package/dist/skills/defaults/llm-application/hybrid-search.md +570 -0
  120. package/dist/skills/defaults/llm-application/langchain-architecture.md +666 -0
  121. package/dist/skills/defaults/llm-application/langchain.md +259 -0
  122. package/dist/skills/defaults/llm-application/llm-evaluation.md +695 -0
  123. package/dist/skills/defaults/llm-application/prompt-engineering-patterns.md +449 -0
  124. package/dist/skills/defaults/llm-application/prompt-engineering.md +219 -0
  125. package/dist/skills/defaults/llm-application/rag-implementation.md +434 -0
  126. package/dist/skills/defaults/llm-application/similarity-search-patterns.md +560 -0
  127. package/dist/skills/defaults/llm-application/similarity-search.md +560 -0
  128. package/dist/skills/defaults/llm-application/vector-index-tuning.md +523 -0
  129. package/dist/skills/defaults/mobile/mobile-android-design.md +440 -0
  130. package/dist/skills/defaults/mobile/mobile-ios-design.md +266 -0
  131. package/dist/skills/defaults/monitoring/distributed-tracing.md +436 -0
  132. package/dist/skills/defaults/monitoring/grafana-dashboards.md +370 -0
  133. package/dist/skills/defaults/monitoring/prometheus-configuration.md +379 -0
  134. package/dist/skills/defaults/monitoring/slo-implementation.md +323 -0
  135. package/dist/skills/defaults/refactoring/code-refactoring.md +349 -0
  136. package/dist/skills/defaults/security/anti-reversing-techniques/SKILL.md +559 -0
  137. package/dist/skills/defaults/security/auditor.md +168 -0
  138. package/dist/skills/defaults/security/binary-analysis-patterns/SKILL.md +438 -0
  139. package/dist/skills/defaults/security/memory-forensics/SKILL.md +483 -0
  140. package/dist/skills/defaults/security/mtls-configuration.md +349 -0
  141. package/dist/skills/defaults/security/protocol-reverse-engineering/SKILL.md +520 -0
  142. package/dist/skills/defaults/security/sast-configuration.md +182 -0
  143. package/dist/skills/defaults/security/security.md +313 -0
  144. package/dist/skills/defaults/security/stride-analysis.md +273 -0
  145. package/dist/skills/defaults/security/threat-mitigation-mapping.md +290 -0
  146. package/dist/skills/defaults/systems/bash-defensive-patterns/SKILL.md +539 -0
  147. package/dist/skills/defaults/systems/bats-testing-patterns/SKILL.md +631 -0
  148. package/dist/skills/defaults/systems/go-concurrency-patterns.md +657 -0
  149. package/dist/skills/defaults/systems/memory-safety-patterns.md +605 -0
  150. package/dist/skills/defaults/systems/rust-async-patterns.md +519 -0
  151. package/dist/skills/defaults/systems/shellcheck-configuration/SKILL.md +456 -0
  152. package/dist/skills/defaults/team-collaboration/multi-reviewer-patterns.md +126 -0
  153. package/dist/skills/defaults/team-collaboration/parallel-feature-development.md +151 -0
  154. package/dist/skills/defaults/testing/javascript-testing-patterns.md +1021 -0
  155. package/dist/skills/defaults/testing/python-testing-patterns.md +351 -0
  156. package/dist/skills/defaults/testing/testing.md +332 -0
  157. package/dist/skills/defaults/workflows/context-driven-development.md +384 -0
  158. package/dist/skills/defaults/workflows/track-management.md +592 -0
  159. package/dist/skills/defaults/workflows/workflow-patterns.md +622 -0
  160. package/dist/skills/index.d.ts +11 -0
  161. package/dist/skills/index.d.ts.map +1 -0
  162. package/dist/skills/index.js +129 -0
  163. package/dist/skills/index.js.map +1 -0
  164. package/dist/utils/character.js +4 -4
  165. package/dist/utils/character.js.map +1 -1
  166. package/dist/utils/inputbar.d.ts.map +1 -1
  167. package/dist/utils/inputbar.js +7 -0
  168. package/dist/utils/inputbar.js.map +1 -1
  169. package/package.json +1 -1
@@ -0,0 +1,608 @@
1
+ ---
2
+ name: embedding-strategies
3
+ description: Select and optimize embedding models for semantic search and RAG applications. Use when choosing embedding models, implementing chunking strategies, or optimizing embedding quality for specific domains.
4
+ ---
5
+
6
+ # Embedding Strategies
7
+
8
+ Guide to selecting and optimizing embedding models for vector search applications.
9
+
10
+ ## When to Use This Skill
11
+
12
+ - Choosing embedding models for RAG
13
+ - Optimizing chunking strategies
14
+ - Fine-tuning embeddings for domains
15
+ - Comparing embedding model performance
16
+ - Reducing embedding dimensions
17
+ - Handling multilingual content
18
+
19
+ ## Core Concepts
20
+
21
+ ### 1. Embedding Model Comparison (2026)
22
+
23
+ | Model | Dimensions | Max Tokens | Best For |
24
+ | -------------------------- | ---------- | ---------- | ----------------------------------- |
25
+ | **voyage-3-large** | 1024 | 32000 | Claude apps (Anthropic recommended) |
26
+ | **voyage-3** | 1024 | 32000 | Claude apps, cost-effective |
27
+ | **voyage-code-3** | 1024 | 32000 | Code search |
28
+ | **voyage-finance-2** | 1024 | 32000 | Financial documents |
29
+ | **voyage-law-2** | 1024 | 16000 | Legal documents |
30
+ | **text-embedding-3-large** | 3072 | 8191 | OpenAI apps, high accuracy |
31
+ | **text-embedding-3-small** | 1536 | 8191 | OpenAI apps, cost-effective |
32
+ | **bge-large-en-v1.5** | 1024 | 512 | Open source, local deployment |
33
+ | **all-MiniLM-L6-v2** | 384 | 256 | Fast, lightweight |
34
+ | **multilingual-e5-large** | 1024 | 512 | Multi-language |
35
+
36
+ ### 2. Embedding Pipeline
37
+
38
+ ```
39
+ Document → Chunking → Preprocessing → Embedding Model → Vector
40
+
41
+ [Overlap, Size] [Clean, Normalize] [API/Local]
42
+ ```
43
+
44
+ ## Templates
45
+
46
+ ### Template 1: Voyage AI Embeddings (Recommended for Claude)
47
+
48
+ ```python
49
+ from langchain_voyageai import VoyageAIEmbeddings
50
+ from typing import List
51
+ import os
52
+
53
+ # Initialize Voyage AI embeddings (recommended by Anthropic for Claude)
54
+ embeddings = VoyageAIEmbeddings(
55
+ model="voyage-3-large",
56
+ voyage_api_key=os.environ.get("VOYAGE_API_KEY")
57
+ )
58
+
59
def get_embeddings(texts: List[str]) -> List[List[float]]:
    """Embed a batch of texts with the module-level Voyage AI client.

    Args:
        texts: Documents to embed.

    Returns:
        Whatever ``embeddings.embed_documents`` returns for ``texts``.
    """
    vectors = embeddings.embed_documents(texts)
    return vectors
62
+
63
def get_query_embedding(query: str) -> List[float]:
    """Embed one search query with the module-level Voyage AI client.

    Args:
        query: The query text.

    Returns:
        Whatever ``embeddings.embed_query`` returns for ``query``.
    """
    vector = embeddings.embed_query(query)
    return vector
66
+
67
+ # Specialized models for domains
68
+ code_embeddings = VoyageAIEmbeddings(model="voyage-code-3")
69
+ finance_embeddings = VoyageAIEmbeddings(model="voyage-finance-2")
70
+ legal_embeddings = VoyageAIEmbeddings(model="voyage-law-2")
71
+ ```
72
+
73
+ ### Template 2: OpenAI Embeddings
74
+
75
+ ```python
76
+ from openai import OpenAI
77
+ from typing import List
78
+ import numpy as np
79
+
80
+ client = OpenAI()
81
+
82
def get_embeddings(
    texts: List[str],
    model: str = "text-embedding-3-small",
    dimensions: int = None,
    batch_size: int = 100,
) -> List[List[float]]:
    """Get embeddings from OpenAI with optional dimension reduction.

    Args:
        texts: Texts to embed; they are sent to the API in slices of
            ``batch_size``.
        model: Embedding model name passed to the API.
        dimensions: When not ``None``, forwarded as the ``dimensions``
            request parameter (Matryoshka dimensionality reduction).
        batch_size: Maximum number of texts per API request.

    Returns:
        One embedding per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Request options that do not change between batches are built once.
    request_options = {"model": model}
    if dimensions is not None:
        request_options["dimensions"] = dimensions

    all_embeddings = []
    for offset in range(0, len(texts), batch_size):
        batch = texts[offset:offset + batch_size]
        response = client.embeddings.create(input=batch, **request_options)
        all_embeddings.extend(item.embedding for item in response.data)

    return all_embeddings
105
+
106
+
107
def get_embedding(text: str, **kwargs) -> List[float]:
    """Embed a single text by delegating to ``get_embeddings``.

    Args:
        text: The text to embed.
        **kwargs: Forwarded unchanged to ``get_embeddings``.

    Returns:
        The first (and only) embedding of the one-element batch.
    """
    batch_result = get_embeddings([text], **kwargs)
    return batch_result[0]
110
+
111
+
112
+ # Dimension reduction with Matryoshka embeddings
113
def get_reduced_embedding(text: str, dimensions: int = 512) -> List[float]:
    """Embed ``text`` with "text-embedding-3-small" at a reduced size.

    Args:
        text: The text to embed.
        dimensions: Target embedding size forwarded to ``get_embedding``.

    Returns:
        The embedding returned by ``get_embedding``.
    """
    options = {"model": "text-embedding-3-small", "dimensions": dimensions}
    return get_embedding(text, **options)
120
+ ```
121
+
122
+ ### Template 3: Local Embeddings with Sentence Transformers
123
+
124
+ ```python
125
+ from sentence_transformers import SentenceTransformer
126
+ from typing import List, Optional
127
+ import numpy as np
128
+
129
class LocalEmbedder:
    """Embed text locally through a sentence-transformers model."""

    def __init__(
        self,
        model_name: str = "BAAI/bge-large-en-v1.5",
        device: str = "cuda"
    ):
        """Load ``model_name`` on ``device`` and remember the model name."""
        # The name is kept so embed_query can detect BGE-family models.
        self.model_name = model_name
        self.model = SentenceTransformer(model_name, device=device)

    def embed(
        self,
        texts: List[str],
        normalize: bool = True,
        show_progress: bool = False
    ) -> np.ndarray:
        """Encode ``texts`` and return the model's NumPy output.

        Args:
            texts: Texts to encode.
            normalize: Forwarded as ``normalize_embeddings``.
            show_progress: Forwarded as ``show_progress_bar``.
        """
        return self.model.encode(
            texts,
            normalize_embeddings=normalize,
            show_progress_bar=show_progress,
            convert_to_numpy=True,
        )

    def embed_query(self, query: str) -> np.ndarray:
        """Embed one query, adding the retrieval instruction for BGE models."""
        is_bge_model = "bge" in self.model_name.lower()
        if is_bge_model:
            # BGE and similar models benefit from a query prefix.
            prompt = f"Represent this sentence for searching relevant passages: {query}"
        else:
            prompt = query
        batch = self.embed([prompt])
        return batch[0]

    def embed_documents(self, documents: List[str]) -> np.ndarray:
        """Embed documents for indexing (no prefix is added)."""
        return self.embed(documents)
165
+
166
+
167
+ # E5 model with instructions
168
class E5Embedder:
    """Embed text with an E5 model, adding the 'query:' / 'passage:' prefixes."""

    def __init__(self, model_name: str = "intfloat/multilingual-e5-large"):
        """Load the sentence-transformers model named ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def embed_query(self, query: str) -> np.ndarray:
        """Encode ``query`` with the 'query: ' prefix."""
        prefixed = f"query: {query}"
        return self.model.encode(prefixed)

    def embed_document(self, document: str) -> np.ndarray:
        """Encode ``document`` with the 'passage: ' prefix."""
        prefixed = f"passage: {document}"
        return self.model.encode(prefixed)
179
+ ```
180
+
181
+ ### Template 4: Chunking Strategies
182
+
183
+ ```python
184
+ from typing import List, Tuple
185
+ import re
186
+
187
def chunk_by_tokens(
    text: str,
    chunk_size: int = 512,
    chunk_overlap: int = 50,
    tokenizer=None
) -> List[str]:
    """Chunk text into overlapping windows of at most ``chunk_size`` tokens.

    Args:
        text: The text to split.
        chunk_size: Maximum number of tokens per chunk.
        chunk_overlap: Number of tokens shared by consecutive chunks.
        tokenizer: Object exposing ``encode(text)`` and ``decode(tokens)``.
            When ``None``, tiktoken's "cl100k_base" encoding is used.

    Returns:
        Decoded chunk strings in document order; empty when ``text``
        encodes to no tokens.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``chunk_overlap``
            is negative or not smaller than ``chunk_size``. The window
            would otherwise never advance.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")

    if tokenizer is None:
        # Imported lazily so callers supplying a tokenizer do not need tiktoken.
        import tiktoken
        tokenizer = tiktoken.get_encoding("cl100k_base")

    tokens = tokenizer.encode(text)
    stride = chunk_size - chunk_overlap
    chunks = []

    start = 0
    while start < len(tokens):
        end = start + chunk_size
        chunks.append(tokenizer.decode(tokens[start:end]))
        if end >= len(tokens):
            # This window reached the end; another step would only repeat
            # tokens already contained in it.
            break
        start += stride

    return chunks
209
+
210
+
211
def chunk_by_sentences(
    text: str,
    max_chunk_size: int = 1000,
    min_chunk_size: int = 100
) -> List[str]:
    """Group NLTK-tokenized sentences into chunks bounded by character count.

    Sentences are accumulated until adding the next one would push the
    summed sentence lengths above ``max_chunk_size``; the accumulated
    sentences are then joined with a single space and emitted. A sentence
    longer than ``max_chunk_size`` still becomes a chunk of its own.

    NOTE(review): ``min_chunk_size`` is accepted but never read by this
    function; confirm the intended behavior before relying on it.

    Args:
        text: The text to split.
        max_chunk_size: Upper bound on summed sentence lengths per chunk
            (joining spaces are not counted).
        min_chunk_size: Currently unused.

    Returns:
        The chunks in document order.
    """
    import nltk

    chunks = []
    pending = []
    pending_chars = 0

    for sentence in nltk.sent_tokenize(text):
        n_chars = len(sentence)

        # Flush before appending so the chunk never grows past the limit
        # once it already holds at least one sentence.
        if pending and pending_chars + n_chars > max_chunk_size:
            chunks.append(" ".join(pending))
            pending, pending_chars = [], 0

        pending.append(sentence)
        pending_chars += n_chars

    if pending:
        chunks.append(" ".join(pending))

    return chunks
239
+
240
+
241
def chunk_by_semantic_sections(
    text: str,
    headers_pattern: str = r'^#{1,3}\s+.+$'
) -> List[Tuple[str, str]]:
    """Split markdown into ``(header, content)`` pairs at header lines.

    Each line matching ``headers_pattern`` starts a new section. Lines
    before the first header are reported under an empty header string.
    A section is emitted only if it collected at least one content line,
    so a header immediately followed by another header is dropped.

    Args:
        text: Markdown text to split.
        headers_pattern: Regular expression identifying header lines.

    Returns:
        ``(header_line, joined_content)`` tuples in document order.
    """
    header_re = re.compile(headers_pattern, re.MULTILINE)
    sections = []
    heading = ""
    body = []

    for line in text.split('\n'):
        if header_re.match(line) is None:
            body.append(line)
            continue

        # A header closes the section collected so far.
        if body:
            sections.append((heading, '\n'.join(body)))
        heading, body = line, []

    if body:
        sections.append((heading, '\n'.join(body)))

    return sections
264
+
265
+
266
def recursive_character_splitter(
    text: str,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    separators: List[str] = None
) -> List[str]:
    """Split text LangChain-style, trying coarse separators before fine ones.

    The text is split on the first separator. Adjacent pieces are merged
    into chunks of at most ``chunk_size`` characters, and consecutive
    chunks share up to ``chunk_overlap`` characters of trailing pieces.
    A piece that is still too large, or text that does not contain the
    current separator at all, is split again with the next separator.
    The empty-string separator means "cut at fixed character offsets".

    Args:
        text: The text to split.
        chunk_size: Maximum chunk length in characters.
        chunk_overlap: Maximum number of characters repeated between
            consecutive chunks.
        separators: Separators ordered from coarsest to finest. Defaults
            to ``["\\n\\n", "\\n", ". ", " ", ""]``.

    Returns:
        The chunks in document order. Separators at chunk boundaries are
        not kept. If the separator list runs out without an empty-string
        entry, an oversized piece is returned unsplit.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``chunk_overlap``
            is negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")

    separators = separators or ["\n\n", "\n", ". ", " ", ""]

    def hard_split(piece: str) -> List[str]:
        """Cut ``piece`` into fixed-size windows that overlap by chunk_overlap."""
        stride = chunk_size - chunk_overlap
        windows = []
        start = 0
        while True:
            windows.append(piece[start:start + chunk_size])
            if start + chunk_size >= len(piece):
                # The window reached the end; stepping again would only
                # repeat characters already emitted.
                return windows
            start += stride

    def split_text(piece: str, seps: List[str]) -> List[str]:
        if not piece:
            return []
        if len(piece) <= chunk_size or not seps:
            return [piece]

        separator, finer = seps[0], seps[1:]
        if separator == "":
            return hard_split(piece)

        parts = piece.split(separator)
        if len(parts) == 1:
            # Separator absent: fall through to the next, finer separator
            # instead of returning the oversized text unchanged.
            return split_text(piece, finer)

        chunks = []
        window = []      # parts of the chunk being built
        window_len = 0   # always len(separator.join(window))

        def flush() -> None:
            joined = separator.join(window)
            if joined:
                chunks.append(joined)

        for part in parts:
            if len(part) > chunk_size:
                # Cannot fit in any chunk at this level; emit what we have
                # and split the part with finer separators.
                flush()
                window, window_len = [], 0
                chunks.extend(split_text(part, finer))
                continue

            if window and window_len + len(separator) + len(part) > chunk_size:
                flush()
                # Keep trailing parts as overlap, dropping from the front
                # until the overlap budget is met and ``part`` fits.
                while window and (
                    window_len > chunk_overlap
                    or window_len + len(separator) + len(part) > chunk_size
                ):
                    dropped = window.pop(0)
                    window_len -= len(dropped) + (len(separator) if window else 0)

            window_len += len(part) + (len(separator) if window else 0)
            window.append(part)

        flush()
        return chunks

    return split_text(text, separators)
324
+ ```
325
+
326
+ ### Template 5: Domain-Specific Embedding Pipeline
327
+
328
+ ```python
329
+ import re
330
+ from typing import List, Optional
331
+ from dataclasses import dataclass
332
+
333
@dataclass
class EmbeddedDocument:
    """One embedded chunk of a source document."""

    # Identifier of this chunk record.
    id: str
    # Identifier of the document the chunk came from.
    document_id: str
    # Position of the chunk within its document.
    chunk_index: int
    # The chunk text that was embedded.
    text: str
    # Embedding vector for ``text``.
    embedding: List[float]
    # Extra key/value data stored alongside the chunk.
    metadata: dict
341
+
342
class DomainEmbeddingPipeline:
    """Pipeline for domain-specific embeddings: preprocess, chunk, embed."""

    def __init__(
        self,
        embedding_model: str = "voyage-3-large",
        chunk_size: int = 512,
        chunk_overlap: int = 50,
        preprocessing_fn=None
    ):
        """Configure the pipeline.

        Args:
            embedding_model: Model name passed to ``VoyageAIEmbeddings``.
            chunk_size: Chunk size forwarded to ``chunk_by_tokens``.
            chunk_overlap: Chunk overlap forwarded to ``chunk_by_tokens``.
            preprocessing_fn: Callable ``str -> str`` applied before
                chunking; defaults to ``_default_preprocess``.
        """
        # Imported here so this snippet does not depend on an import made
        # in a different template.
        from langchain_voyageai import VoyageAIEmbeddings

        self.embeddings = VoyageAIEmbeddings(model=embedding_model)
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.preprocess = preprocessing_fn or self._default_preprocess

    def _default_preprocess(self, text: str) -> str:
        """Collapse whitespace runs and drop characters outside ``\\w\\s.,!?-``."""
        # Remove excessive whitespace
        text = re.sub(r'\s+', ' ', text)
        # Remove special characters (customize for your domain)
        text = re.sub(r'[^\w\s.,!?-]', '', text)
        return text.strip()

    async def process_documents(
        self,
        documents: List[dict],
        id_field: str = "id",
        content_field: str = "content",
        metadata_fields: Optional[List[str]] = None
    ) -> "List[EmbeddedDocument]":
        """Preprocess, chunk and embed documents for vector storage.

        Args:
            documents: Source records; each must contain ``id_field`` and
                ``content_field``.
            id_field: Key holding the document identifier.
            content_field: Key holding the document text.
            metadata_fields: Keys copied from each document into every
                chunk's metadata when present in the document.

        Returns:
            One ``EmbeddedDocument`` per chunk, in document then chunk order.

        Raises:
            KeyError: If a document lacks ``id_field`` or ``content_field``.
        """
        processed = []

        for doc in documents:
            content = doc[content_field]
            doc_id = doc[id_field]

            cleaned = self.preprocess(content)
            chunks = chunk_by_tokens(
                cleaned,
                self.chunk_size,
                self.chunk_overlap
            )
            if not chunks:
                # Nothing to embed; avoid calling the embedding API with
                # an empty batch.
                continue

            embeddings = await self.embeddings.aembed_documents(chunks)

            # Fields copied from the document are the same for all its chunks.
            extra_metadata = {
                field: doc[field]
                for field in (metadata_fields or [])
                if field in doc
            }

            for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
                metadata = {"document_id": doc_id, "chunk_index": i}
                metadata.update(extra_metadata)

                processed.append(EmbeddedDocument(
                    id=f"{doc_id}_chunk_{i}",
                    document_id=doc_id,
                    chunk_index=i,
                    text=chunk,
                    embedding=embedding,
                    metadata=metadata
                ))

        return processed
412
+
413
+
414
+ # Code-specific pipeline
415
class CodeEmbeddingPipeline:
    """Specialized pipeline for code embeddings."""

    def __init__(self):
        """Create the Voyage code-specific embedding client."""
        # Imported here so this snippet does not depend on an import made
        # in a different template.
        from langchain_voyageai import VoyageAIEmbeddings

        # Use Voyage's code-specific model
        self.embeddings = VoyageAIEmbeddings(model="voyage-code-3")

    def chunk_code(self, code: str, language: str) -> List[dict]:
        """Chunk code by functions/classes using tree-sitter.

        Args:
            code: Source code to chunk.
            language: Language name passed to
                ``tree_sitter_languages.get_parser``.

        Returns:
            One dict per extracted definition (see ``_extract_nodes``).
            If an ``ImportError`` is raised (for example when
            ``tree_sitter_languages`` is not installed), a single
            ``{"text": code, "type": "module"}`` entry is returned.
        """
        try:
            import tree_sitter_languages
            parser = tree_sitter_languages.get_parser(language)
            # Encode once: node offsets index into these bytes, not into
            # the str.
            source_bytes = code.encode("utf8")
            tree = parser.parse(source_bytes)

            chunks = []
            # Extract function and class definitions
            self._extract_nodes(tree.root_node, source_bytes, chunks)
            return chunks
        except ImportError:
            # Fallback to simple chunking
            return [{"text": code, "type": "module"}]

    def _extract_nodes(self, node, source_code: "str | bytes", chunks: list):
        """Recursively append function/class definitions to ``chunks``.

        ``node.start_byte`` / ``node.end_byte`` are byte offsets, so the
        source is sliced as UTF-8 bytes and decoded afterwards; slicing
        the ``str`` directly would be wrong for non-ASCII source.

        Args:
            node: Syntax node exposing ``type``, ``start_byte``,
                ``end_byte``, ``start_point``, ``end_point`` and
                ``children``.
            source_code: The parsed source, as ``str`` or UTF-8 ``bytes``.
            chunks: Output list; a dict with keys ``text``, ``type``,
                ``name``, ``start_line`` and ``end_line`` is appended per
                matching node. Nested definitions are appended as well.
        """
        if isinstance(source_code, str):
            # Done once at the top of the recursion; children get bytes.
            source_code = source_code.encode("utf8")

        if node.type in ('function_definition', 'class_definition', 'method_definition'):
            text = source_code[node.start_byte:node.end_byte].decode("utf8")
            chunks.append({
                "text": text,
                "type": node.type,
                "name": self._get_name(node),
                "start_line": node.start_point[0],
                "end_line": node.end_point[0]
            })
        for child in node.children:
            self._extract_nodes(child, source_code, chunks)

    def _get_name(self, node) -> str:
        """Return the text of the first 'identifier'/'name' child, else "unknown"."""
        for child in node.children:
            if child.type in ('identifier', 'name'):
                return child.text.decode('utf8')
        return "unknown"

    async def embed_with_context(
        self,
        chunk: str,
        context: str = ""
    ) -> List[float]:
        """Embed code, optionally prefixed with surrounding context.

        Args:
            chunk: The code to embed.
            context: Optional description; when non-empty the embedded
                text is ``"Context: {context}\\n\\nCode:\\n{chunk}"``.

        Returns:
            The result of ``self.embeddings.aembed_query`` for the
            combined text.
        """
        combined = f"Context: {context}\n\nCode:\n{chunk}" if context else chunk
        return await self.embeddings.aembed_query(combined)
469
+ ```
470
+
471
+ ### Template 6: Embedding Quality Evaluation
472
+
473
+ ```python
474
+ import numpy as np
475
+ from typing import List, Dict
476
+
477
def evaluate_retrieval_quality(
    queries: List[str],
    relevant_docs: List[List[str]],  # List of relevant doc IDs per query
    retrieved_docs: List[List[str]],  # List of retrieved doc IDs per query
    k: int = 10
) -> Dict[str, float]:
    """Evaluate retrieval quality with precision@k, recall@k, MRR and nDCG@k.

    Args:
        queries: The evaluated queries. Not read by the computation; the
            two ID lists are paired positionally.
        relevant_docs: Ground-truth relevant doc IDs, one list per query.
        retrieved_docs: Ranked retrieved doc IDs, one list per query.
        k: Rank cutoff for precision, recall and nDCG.

    Returns:
        Mapping from metric name (``"precision@{k}"``, ``"recall@{k}"``,
        ``"mrr"``, ``"ndcg@{k}"``) to its mean over all queries, or 0.0
        for every metric when there are no queries.
    """

    def precision_at_k(relevant: set, retrieved: List[str], k: int) -> float:
        hits = len(set(retrieved[:k]) & relevant)
        return hits / k if k > 0 else 0

    def recall_at_k(relevant: set, retrieved: List[str], k: int) -> float:
        hits = len(set(retrieved[:k]) & relevant)
        return hits / len(relevant) if relevant else 0

    def mrr(relevant: set, retrieved: List[str]) -> float:
        # Reciprocal rank of the first relevant result; not cut off at k.
        for rank, doc in enumerate(retrieved, start=1):
            if doc in relevant:
                return 1 / rank
        return 0

    def ndcg_at_k(relevant: set, retrieved: List[str], k: int) -> float:
        # Binary relevance: each hit at 0-based rank i contributes 1/log2(i+2).
        dcg = sum(
            1 / np.log2(i + 2) if doc in relevant else 0
            for i, doc in enumerate(retrieved[:k])
        )
        ideal_dcg = sum(1 / np.log2(i + 2) for i in range(min(len(relevant), k)))
        return dcg / ideal_dcg if ideal_dcg > 0 else 0

    metrics = {
        f"precision@{k}": [],
        f"recall@{k}": [],
        "mrr": [],
        f"ndcg@{k}": []
    }

    # Pair each query's ground truth with what was actually retrieved for it.
    for relevant, retrieved in zip(relevant_docs, retrieved_docs):
        relevant_set = set(relevant)
        metrics[f"precision@{k}"].append(precision_at_k(relevant_set, retrieved, k))
        metrics[f"recall@{k}"].append(recall_at_k(relevant_set, retrieved, k))
        metrics["mrr"].append(mrr(relevant_set, retrieved))
        metrics[f"ndcg@{k}"].append(ndcg_at_k(relevant_set, retrieved, k))

    # np.mean of an empty list is NaN (with a warning); report 0.0 instead.
    return {
        name: float(np.mean(values)) if values else 0.0
        for name, values in metrics.items()
    }
524
+
525
+
526
def compute_embedding_similarity(
    embeddings1: np.ndarray,
    embeddings2: np.ndarray,
    metric: str = "cosine"
) -> np.ndarray:
    """Compute the similarity matrix between two sets of embeddings.

    Args:
        embeddings1: 2-D array with one embedding per row.
        embeddings2: 2-D array with one embedding per row and the same
            number of columns as ``embeddings1``.
        metric: ``"cosine"``, ``"euclidean"`` (negated distance, so larger
            still means more similar) or ``"dot"``.

    Returns:
        Array of shape ``(len(embeddings1), len(embeddings2))``.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.
    """
    if metric == "cosine":
        norms1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
        norms2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
        # An all-zero row has norm 0; dividing by 1 instead leaves it zero,
        # giving similarity 0 rather than NaN.
        unit1 = embeddings1 / np.where(norms1 == 0, 1.0, norms1)
        unit2 = embeddings2 / np.where(norms2 == 0, 1.0, norms2)
        return unit1 @ unit2.T
    elif metric == "euclidean":
        from scipy.spatial.distance import cdist
        return -cdist(embeddings1, embeddings2, metric='euclidean')
    elif metric == "dot":
        return embeddings1 @ embeddings2.T
    else:
        raise ValueError(f"Unknown metric: {metric}")
544
+
545
+
546
def compare_embedding_models(
    texts: List[str],
    models: Dict[str, callable],
    queries: List[str],
    relevant_indices: List[List[int]],
    k: int = 5
) -> Dict[str, Dict[str, float]]:
    """Compare multiple embedding models on retrieval quality.

    For each model, ``texts`` and ``queries`` are embedded, documents are
    ranked per query by cosine similarity, and the top ``k`` are scored
    with ``evaluate_retrieval_quality``. Document IDs are the string forms
    of positions in ``texts``.

    Args:
        texts: The document corpus.
        models: Mapping from model name to an embedding function that
            takes a list of strings and returns one vector per string.
        queries: Queries to evaluate.
        relevant_indices: For each query, the positions in ``texts`` of
            its relevant documents.
        k: Number of documents retrieved per query.

    Returns:
        Mapping from model name to its metric dictionary.
    """
    # Ground truth does not depend on the model, so build it once.
    relevant_docs = [[str(i) for i in indices] for indices in relevant_indices]

    results = {}

    for model_name, embed_fn in models.items():
        doc_embeddings = np.array(embed_fn(texts))

        retrieved_per_query = []
        if queries:
            # Embed all queries in one call rather than one call per query.
            query_embeddings = np.array(embed_fn(queries))
            similarities = compute_embedding_similarity(
                query_embeddings,
                doc_embeddings,
                metric="cosine"
            )
            for row in similarities:
                # Indices of the k most similar documents, best first.
                top_k_indices = np.argsort(row)[::-1][:k]
                retrieved_per_query.append([str(i) for i in top_k_indices])

        results[model_name] = evaluate_retrieval_quality(
            queries, relevant_docs, retrieved_per_query, k
        )

    return results
581
+ ```
582
+
583
+ ## Best Practices
584
+
585
+ ### Do's
586
+
587
+ - **Match model to use case**: Code vs prose vs multilingual
588
+ - **Chunk thoughtfully**: Preserve semantic boundaries
589
+ - **Normalize embeddings**: For cosine similarity search
590
+ - **Batch requests**: More efficient than one-by-one
591
+ - **Cache embeddings**: Avoid recomputing for static content
592
+ - **Use Voyage AI for Claude apps**: Recommended by Anthropic
593
+
594
+ ### Don'ts
595
+
596
+ - **Don't ignore token limits**: Truncation loses information
597
+ - **Don't mix embedding models**: Incompatible vector spaces
598
+ - **Don't skip preprocessing**: Garbage in, garbage out
599
+ - **Don't over-chunk**: Chunks that are too small lose important context
600
+ - **Don't forget metadata**: Essential for filtering and debugging
601
+
602
+ ## Resources
603
+
604
+ - [Voyage AI Documentation](https://docs.voyageai.com/)
605
+ - [OpenAI Embeddings Guide](https://platform.openai.com/docs/guides/embeddings)
606
+ - [Sentence Transformers](https://www.sbert.net/)
607
+ - [MTEB Benchmark](https://huggingface.co/spaces/mteb/leaderboard)
608
+ - [LangChain Embedding Models](https://python.langchain.com/docs/integrations/text_embedding/)