qmdr 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +29 -0
- package/.env.example +85 -0
- package/.gitattributes +3 -0
- package/.github/workflows/release.yml +77 -0
- package/AI-SETUP.md +466 -0
- package/LICENSE +22 -0
- package/README.md +78 -0
- package/bun.lock +637 -0
- package/docs/README-zh.md +78 -0
- package/docs/refactor-checklist.md +54 -0
- package/docs/setup-openclaw.md +139 -0
- package/example-index.yml +33 -0
- package/finetune/BALANCED_DISTRIBUTION.md +157 -0
- package/finetune/DATA_IMPROVEMENTS.md +218 -0
- package/finetune/Justfile +43 -0
- package/finetune/Modelfile +16 -0
- package/finetune/README.md +299 -0
- package/finetune/SCORING.md +286 -0
- package/finetune/configs/accelerate_multi_gpu.yaml +17 -0
- package/finetune/configs/grpo.yaml +49 -0
- package/finetune/configs/sft.yaml +42 -0
- package/finetune/configs/sft_local.yaml +40 -0
- package/finetune/convert_gguf.py +221 -0
- package/finetune/data/best_glm_prompt.txt +17 -0
- package/finetune/data/gepa_generated.prompts.json +32 -0
- package/finetune/data/qmd_expansion_balanced_deduped.jsonl +413 -0
- package/finetune/data/qmd_expansion_diverse_addon.jsonl +386 -0
- package/finetune/data/qmd_expansion_handcrafted.jsonl +65 -0
- package/finetune/data/qmd_expansion_handcrafted_only.jsonl +336 -0
- package/finetune/data/qmd_expansion_locations.jsonl +64 -0
- package/finetune/data/qmd_expansion_people.jsonl +46 -0
- package/finetune/data/qmd_expansion_short_nontech.jsonl +200 -0
- package/finetune/data/qmd_expansion_v2.jsonl +1498 -0
- package/finetune/data/qmd_only_sampled.jsonl +399 -0
- package/finetune/dataset/analyze_data.py +369 -0
- package/finetune/dataset/clean_data.py +906 -0
- package/finetune/dataset/generate_balanced.py +823 -0
- package/finetune/dataset/generate_data.py +714 -0
- package/finetune/dataset/generate_data_offline.py +206 -0
- package/finetune/dataset/generate_diverse.py +441 -0
- package/finetune/dataset/generate_ollama.py +326 -0
- package/finetune/dataset/prepare_data.py +197 -0
- package/finetune/dataset/schema.py +73 -0
- package/finetune/dataset/score_data.py +115 -0
- package/finetune/dataset/validate_schema.py +104 -0
- package/finetune/eval.py +196 -0
- package/finetune/evals/queries.txt +56 -0
- package/finetune/gepa/__init__.py +1 -0
- package/finetune/gepa/best_prompt.txt +31 -0
- package/finetune/gepa/best_prompt_glm.txt +1 -0
- package/finetune/gepa/dspy_gepa.py +204 -0
- package/finetune/gepa/example.py +117 -0
- package/finetune/gepa/generate.py +129 -0
- package/finetune/gepa/gepa_outputs.jsonl +10 -0
- package/finetune/gepa/gepa_outputs_glm.jsonl +20 -0
- package/finetune/gepa/model.json +19 -0
- package/finetune/gepa/optimizer.py +70 -0
- package/finetune/gepa/score.py +84 -0
- package/finetune/jobs/eval.py +490 -0
- package/finetune/jobs/eval_common.py +354 -0
- package/finetune/jobs/eval_verbose.py +113 -0
- package/finetune/jobs/grpo.py +141 -0
- package/finetune/jobs/quantize.py +244 -0
- package/finetune/jobs/sft.py +121 -0
- package/finetune/pyproject.toml +23 -0
- package/finetune/reward.py +610 -0
- package/finetune/train.py +611 -0
- package/finetune/uv.lock +4070 -0
- package/flake.lock +61 -0
- package/flake.nix +83 -0
- package/migrate-schema.ts +162 -0
- package/package.json +56 -0
- package/skills/qmdr/SKILL.md +172 -0
- package/skills/qmdr/references/mcp-setup.md +88 -0
- package/src/app/commands/collection.ts +55 -0
- package/src/app/commands/context.ts +82 -0
- package/src/app/commands/document.ts +46 -0
- package/src/app/commands/maintenance.ts +60 -0
- package/src/app/commands/search.ts +45 -0
- package/src/app/ports/llm.ts +13 -0
- package/src/app/services/llm-service.ts +145 -0
- package/src/cli.test.ts +963 -0
- package/src/collections.ts +390 -0
- package/src/eval.test.ts +412 -0
- package/src/formatter.ts +427 -0
- package/src/llm.test.ts +559 -0
- package/src/llm.ts +1990 -0
- package/src/mcp.test.ts +889 -0
- package/src/mcp.ts +626 -0
- package/src/qmd.ts +3330 -0
- package/src/store/collections.ts +7 -0
- package/src/store/context.ts +10 -0
- package/src/store/db.ts +5 -0
- package/src/store/documents.ts +26 -0
- package/src/store/maintenance.ts +15 -0
- package/src/store/path.ts +13 -0
- package/src/store/search.ts +10 -0
- package/src/store-paths.test.ts +395 -0
- package/src/store.test.ts +2483 -0
- package/src/store.ts +2813 -0
- package/test/eval-harness.ts +223 -0
- package/tsconfig.json +29 -0
|
@@ -0,0 +1,714 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Generate synthetic training data for QMD query expansion using Claude API."""
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
import random
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from dataset.schema import normalize_output_items, parse_output_text
|
|
11
|
+
|
|
12
|
+
try:
|
|
13
|
+
import anthropic
|
|
14
|
+
except ImportError:
|
|
15
|
+
print("Install anthropic: pip install anthropic")
|
|
16
|
+
exit(1)
|
|
17
|
+
|
|
18
|
+
# Sample query templates for diverse training data, grouped by category.
# How often each category is sampled is controlled by CATEGORY_WEIGHTS, not by
# the number of templates listed in a group.
_TEMPLATES_BY_CATEGORY = {
    # === Technical documentation ===
    "technical": [
        "how to {action} {technology}",
        "{technology} {concept} example",
        "configure {technology} for {use_case}",
        "{error_type} error in {technology}",
        "best practices for {concept}",
        "{technology} vs {technology2}",
        "{action} {technology} {use_case}",
        "setup {technology} {use_case}",
        "{technology} tutorial for beginners",
        "{technology} documentation",
        "{technology} {error_type} troubleshooting",
        "{concept} in {technology}",
        "migrate from {technology} to {technology2}",
        "{action} {concept} {technology}",
    ],
    # === Personal notes / journals ===
    "personal": [
        "meeting notes {topic}",
        "ideas for {project}",
        "{date} journal entry",
        "thoughts on {topic}",
        "{project} {topic} notes",
        "{topic} meeting {date}",
        "reflect on {topic}",
        "brainstorm {project}",
    ],
    # === Research / learning ===
    "research": [
        "what is {concept}",
        "difference between {thing1} and {thing2}",
        "{topic} tutorial",
        "learn {skill}",
        "understand {concept}",
        "explain {concept}",
        "{topic} fundamentals",
        "intro to {skill}",
        "{thing1} or {thing2}",
        "when to use {concept}",
    ],
    # === Short / keyword queries ===
    "short": [
        "{keyword}",
        "{keyword} {modifier}",
        "{keyword} {action}",
        "{keyword} {use_case}",
        "{technology} {keyword}",
        "{concept} {keyword}",
    ],
    # === Temporal / recency queries ===
    "temporal": [
        "latest {topic}",
        "recent {concept} changes",
        "new {technology} features",
        "{topic} update {date}",
        "what changed in {technology}",
        "{technology} changelog {date}",
        "{topic} news {date}",
    ],
    # === Named entities / specific topics ===
    "entities": [
        "{named_entity} {topic}",
        "{person} {concept}",
        "{organization} {use_case}",
        "{product} {action}",
    ],
}

# Flat list of every template in category order; same content and order as the
# original hand-written list, so index-based lookups keep working.
QUERY_TEMPLATES = [
    template for group in _TEMPLATES_BY_CATEGORY.values() for template in group
]

# Category name -> indices into QUERY_TEMPLATES.
# Derived from the groups above instead of hard-coded ranges: the previous
# literal ranges had drifted from the list (research lost its last template,
# short/temporal/entities were shifted, and the last three entity templates
# were unreachable).
TEMPLATE_CATEGORIES = {}
_next_index = 0
for _category, _group in _TEMPLATES_BY_CATEGORY.items():
    TEMPLATE_CATEGORIES[_category] = list(
        range(_next_index, _next_index + len(_group))
    )
    _next_index += len(_group)
del _next_index, _category, _group
|
|
86
|
+
|
|
87
|
+
# Verbs substituted for the "{action}" placeholder in query templates
# (picked with random.choice in generate_random_query).
ACTIONS = [
    "install",
    "configure",
    "setup",
    "debug",
    "deploy",
    "test",
    "optimize",
    "migrate",
    "build",
    "run",
    "lint",
    "format",
    "backup",
    "restore",
    "update",
    "rollback",
    "monitor",
    "scale",
    "secure",
    "integrate",
    "automate",
    "refactor",
    "initialize",
]
|
|
112
|
+
|
|
113
|
+
# Technology names substituted for the "{technology}" and "{skill}"
# placeholders in query templates (see generate_random_query).
TECHNOLOGIES = [
    # Languages
    "python",
    "typescript",
    "javascript",
    "rust",
    "golang",
    "java",
    "kotlin",
    "swift",
    "ruby",
    "php",
    "cpp",
    "c",
    "elixir",
    "scala",
    "clojure",
    "dart",
    # Frameworks/Frontend
    "react",
    "vue",
    "angular",
    "svelte",
    "solid",
    "htmx",
    "alpine",
    "nextjs",
    "nuxt",
    # Backend
    "django",
    "flask",
    "fastapi",
    "express",
    "rails",
    "spring",
    "laravel",
    # Infrastructure
    "docker",
    "kubernetes",
    "terraform",
    "ansible",
    "jenkins",
    "github-actions",
    # Databases
    "postgres",
    "mysql",
    "mongodb",
    "redis",
    "elasticsearch",
    "sqlite",
    "dynamodb",
    "cassandra",
    "cockroachdb",
    "supabase",
    "firebase",
    # Tools
    "git",
    "nginx",
    "apache",
    "linux",
    "aws",
    "gcp",
    "azure",
    "vercel",
    "netlify",
    # Data/ML
    "pandas",
    "numpy",
    "tensorflow",
    "pytorch",
    "scikit-learn",
    "jupyter",
    "spark",
    "kafka",
    "airflow",
    "dbt",
]
|
|
190
|
+
|
|
191
|
+
# Second technology pool, substituted for the "{technology2}" placeholder
# (used by the "X vs Y" and "migrate from X to Y" templates).
# NOTE(review): it is drawn independently of "{technology}", and most entries
# also appear in TECHNOLOGIES, so both placeholders can resolve to the same name.
TECHNOLOGIES_2 = [
    "docker",
    "kubernetes",
    "postgres",
    "mysql",
    "redis",
    "mongodb",
    "aws",
    "gcp",
    "react",
    "vue",
    "angular",
    "python",
    "javascript",
    "typescript",
    "github-actions",
    "gitlab-ci",
    "jenkins",
    "terraform",
    "ansible",
]
|
|
212
|
+
|
|
213
|
+
# Engineering concepts substituted for the "{concept}" placeholder.
# generate_random_query also slices this list: the first 10 entries feed
# "{thing1}" and the remaining entries feed "{thing2}", so the ordering here
# determines which concepts can be compared with which.
CONCEPTS = [
    "authentication",
    "caching",
    "logging",
    "testing",
    "deployment",
    "API",
    "database",
    "security",
    "monitoring",
    "performance",
    "scalability",
    "reliability",
    "observability",
    "microservices",
    "serverless",
    "virtualization",
    "containerization",
    "orchestration",
    "CI/CD",
    "version control",
    "dependency injection",
    "event sourcing",
    "CQRS",
    "load balancing",
    "rate limiting",
    "circuit breaker",
    "retry logic",
    "idempotency",
]
|
|
243
|
+
|
|
244
|
+
# Environments / scenarios substituted for the "{use_case}" placeholder.
USE_CASES = [
    "production",
    "development",
    "CI/CD",
    "local",
    "cloud",
    "staging",
    "testing",
    "microservices",
    "serverless",
    "hybrid",
    "multi-tenant",
    "high-availability",
    "real-time",
    "batch processing",
    "stream processing",
    "data pipeline",
]
|
|
262
|
+
|
|
263
|
+
# Error categories substituted for the "{error_type}" placeholder.
ERROR_TYPES = [
    "connection",
    "timeout",
    "permission",
    "memory",
    "syntax",
    "runtime",
    "configuration",
    "dependency",
    "network",
    "authentication",
    "authorization",
    "validation",
    "concurrency",
    "deadlock",
    "resource",
    "quota",
]
|
|
281
|
+
|
|
282
|
+
# General subjects substituted for the "{topic}" placeholder.
TOPICS = [
    "productivity",
    "workflow",
    "architecture",
    "design",
    "performance",
    "security",
    "scalability",
    "reliability",
    "observability",
    "maintainability",
    "testing",
    "documentation",
    "refactoring",
    "debugging",
    "optimization",
    "best practices",
    "patterns",
    "anti-patterns",
    "trade-offs",
    "decision making",
]
|
|
304
|
+
|
|
305
|
+
# Short terms substituted for the "{keyword}" placeholder in the
# short/keyword-style templates.
KEYWORDS = [
    "auth",
    "config",
    "setup",
    "api",
    "cache",
    "log",
    "test",
    "debug",
    "env",
    "vars",
    "secrets",
    "tokens",
    "headers",
    "params",
    "query",
    "body",
    "route",
    "middleware",
    "handler",
    "controller",
    "model",
    "view",
    "template",
    "migration",
    "seed",
    "fixture",
    "mock",
    "stub",
    "spy",
    "fake",
    "build",
    "bundle",
    "compile",
    "transpile",
    "minify",
    "optimize",
    "deploy",
    "release",
    "rollback",
    "promote",
    "freeze",
    "thaw",
    "pull",
    "push",
    "commit",
    "merge",
    "rebase",
    "cherry-pick",
    "stash",
    "up",
    "down",
    "scale",
    "restart",
    "reload",
    "refresh",
    "flush",
    "cron",
    "queue",
    "job",
    "worker",
    "scheduler",
    "trigger",
    "webhook",
    "alert",
    "metric",
    "trace",
    "span",
    "event",
    "incident",
    "oncall",
]
|
|
377
|
+
|
|
378
|
+
# Adjectives substituted for the "{modifier}" placeholder
# (only the "{keyword} {modifier}" template uses it).
MODIFIERS = [
    "best",
    "fast",
    "simple",
    "advanced",
    "secure",
    "quick",
    "easy",
    "proper",
    "correct",
    "safe",
    "efficient",
    "reliable",
    "robust",
    "latest",
    "recent",
    "new",
    "old",
    "legacy",
    "modern",
    "local",
    "remote",
    "global",
    "shared",
    "private",
    "public",
]
|
|
405
|
+
|
|
406
|
+
# Properly-cased product/vendor names substituted for the "{named_entity}"
# placeholder.
NAMED_ENTITIES = [
    "React",
    "Vue",
    "Angular",
    "Docker",
    "Kubernetes",
    "AWS",
    "GCP",
    "GitHub",
    "GitLab",
    "Vercel",
    "Netlify",
    "Supabase",
    "Firebase",
    "Stripe",
    "Twilio",
    "SendGrid",
    "Datadog",
    "PagerDuty",
    "Sentry",
    "Terraform",
    "Ansible",
    "Jenkins",
    "CircleCI",
    "TravisCI",
]
|
|
432
|
+
|
|
433
|
+
# People's names substituted for the "{person}" placeholder.
PERSONS = [
    "Kent Beck",
    "Martin Fowler",
    "Robert Martin",
    "Dave Thomas",
    "Guido van Rossum",
    "Brendan Eich",
    "Ryan Dahl",
    "Anders Hejlsberg",
    "Linus Torvalds",
    "DHH",
    "Yukihiro Matsumoto",
    "Rich Hickey",
]
|
|
447
|
+
|
|
448
|
+
# Company names substituted for the "{organization}" placeholder.
ORGANIZATIONS = [
    "Google",
    "Microsoft",
    "Amazon",
    "Meta",
    "Apple",
    "Netflix",
    "Spotify",
    "Stripe",
    "Shopify",
    "Airbnb",
    "Uber",
    "Lyft",
    "Slack",
    "Discord",
]
|
|
464
|
+
|
|
465
|
+
# Tool/product names substituted for the "{product}" placeholder.
PRODUCTS = [
    "VS Code",
    "IntelliJ",
    "PyCharm",
    "WebStorm",
    "DataGrip",
    "Postman",
    "Insomnia",
    "TablePlus",
    "Docker Desktop",
    "Lens",
    "Figma",
    "Sketch",
    "Notion",
    "Linear",
    "Jira",
    "Trello",
]
|
|
483
|
+
|
|
484
|
+
SYSTEM_PROMPT = """You are a search query optimization expert for a markdown document search system called QMD.
|
|
485
|
+
|
|
486
|
+
Your task is to transform user queries into retrieval-optimized outputs with THREE distinct types:
|
|
487
|
+
|
|
488
|
+
1. **lex** lines: Keyword variations optimized for BM25 full-text search
|
|
489
|
+
- Short, keyword-focused
|
|
490
|
+
- Good for exact term matching
|
|
491
|
+
- 1-3 lines
|
|
492
|
+
|
|
493
|
+
2. **vec** lines: Semantic reformulations for vector/embedding search
|
|
494
|
+
- Complete phrases or questions
|
|
495
|
+
- Capture semantic meaning
|
|
496
|
+
- 1-3 lines
|
|
497
|
+
|
|
498
|
+
3. **hyde** line: A hypothetical document passage (HyDE technique)
|
|
499
|
+
- A realistic passage that would answer the query
|
|
500
|
+
- Contains domain-specific terminology
|
|
501
|
+
- Written as if it's FROM a document, not ABOUT the query
|
|
502
|
+
- MAX 1 line
|
|
503
|
+
|
|
504
|
+
Output format (STRICT - follow exactly):
|
|
505
|
+
```
|
|
506
|
+
hyde: A passage that would appear in a document answering this query.
|
|
507
|
+
lex: keyword1
|
|
508
|
+
lex: keyword2
|
|
509
|
+
vec: semantic query reformulation
|
|
510
|
+
```
|
|
511
|
+
|
|
512
|
+
Rules:
|
|
513
|
+
- Each line must start with "lex:", "vec:", or "hyde:"
|
|
514
|
+
- No blank lines
|
|
515
|
+
- No repetition between lines
|
|
516
|
+
- hyde should be a realistic document excerpt, not a question
|
|
517
|
+
- Stay focused on the original query intent"""
|
|
518
|
+
|
|
519
|
+
USER_PROMPT_TEMPLATE = """Generate query expansion outputs for this search query:
|
|
520
|
+
|
|
521
|
+
Query: {query}
|
|
522
|
+
|
|
523
|
+
Respond with ONLY the lex/vec/hyde lines, nothing else."""
|
|
524
|
+
|
|
525
|
+
|
|
526
|
+
# Category weights - BALANCED approach
# Tech at 15% (reasonable for QMD's technical document use case)
# Keys are category names; values are relative sampling weights handed to
# random.choices in generate_random_query (they sum to 1.0 here).
# NOTE(review): "health", "finance", "lifestyle" and "education" have no entry
# in TEMPLATE_CATEGORIES in this file, so there are no templates to draw for
# them -- confirm whether templates for these categories are still to be added.
CATEGORY_WEIGHTS = {
    "technical": 0.15,  # 15% - Technical documentation
    "personal": 0.10,  # 10% - Personal notes, journals
    "research": 0.10,  # 10% - Research and learning
    "short": 0.15,  # 15% - Short keyword queries
    "temporal": 0.10,  # 10% - Temporal/recency queries (2025/2026)
    "entities": 0.05,  # 5% - Named entity queries
    "health": 0.10,  # 10% - Health & wellness
    "finance": 0.10,  # 10% - Finance & business
    "lifestyle": 0.10,  # 10% - Home, food, hobbies, travel
    "education": 0.05,  # 5% - Education & arts
}
|
|
540
|
+
|
|
541
|
+
|
|
542
|
+
def generate_random_query() -> str:
    """Generate a random query from templates with category-weighted sampling.

    A category is drawn according to CATEGORY_WEIGHTS, a template is drawn
    uniformly from that category's entries in TEMPLATE_CATEGORIES, and every
    ``{placeholder}`` in the template is replaced with a random value from the
    matching vocabulary list.

    Returns:
        The filled-in query string.

    Raises:
        ValueError: If no weighted category has any template to draw from.
    """
    # CATEGORY_WEIGHTS may name categories that have no templates; indexing
    # TEMPLATE_CATEGORIES with one of those used to raise KeyError. Sample only
    # from categories that can actually produce a query (random.choices
    # renormalizes the remaining weights).
    candidates = [
        (name, weight)
        for name, weight in CATEGORY_WEIGHTS.items()
        if weight > 0 and TEMPLATE_CATEGORIES.get(name)
    ]
    if not candidates:
        raise ValueError(
            "No category in CATEGORY_WEIGHTS has templates in TEMPLATE_CATEGORIES"
        )
    categories, weights = zip(*candidates)
    selected_category = random.choices(categories, weights=weights, k=1)[0]

    # Select template from that category
    template_idx = random.choice(TEMPLATE_CATEGORIES[selected_category])
    template = QUERY_TEMPLATES[template_idx]

    # Keep the two technology slots distinct so comparison/migration templates
    # never produce degenerate queries such as "python vs python".
    technology = random.choice(TECHNOLOGIES)
    other_technologies = [
        name for name in TECHNOLOGIES_2 if name != technology
    ] or TECHNOLOGIES_2

    # One value per placeholder; placeholders absent from the template are
    # simply never substituted.
    replacements = {
        "{action}": random.choice(ACTIONS),
        "{technology}": technology,
        "{technology2}": random.choice(other_technologies),
        "{concept}": random.choice(CONCEPTS),
        "{use_case}": random.choice(USE_CASES),
        "{error_type}": random.choice(ERROR_TYPES),
        "{topic}": random.choice(TOPICS),
        "{project}": random.choice(
            ["website", "app", "CLI tool", "API", "library", "service", "platform"]
        ),
        "{date}": random.choice(
            # Emphasize 2025/2026 for recency queries (current era);
            # repeated entries are deliberate weighting.
            [
                "2026",
                "2026",
                "2025",
                "2025",
                "January 2026",
                "February 2026",
                "March 2026",
                "last month",
                "this week",
                "yesterday",
                "today",
                "recently",
                "latest",
            ]
        ),
        # thing1/thing2 come from disjoint slices so a comparison never pairs
        # a concept with itself (unless the list is too short to split).
        "{thing1}": random.choice(CONCEPTS[:10]),
        "{thing2}": random.choice(CONCEPTS[10:] if len(CONCEPTS) > 10 else CONCEPTS),
        "{skill}": random.choice(TECHNOLOGIES),
        "{keyword}": random.choice(KEYWORDS),
        "{modifier}": random.choice(MODIFIERS),
        "{named_entity}": random.choice(NAMED_ENTITIES),
        "{person}": random.choice(PERSONS),
        "{organization}": random.choice(ORGANIZATIONS),
        "{product}": random.choice(PRODUCTS),
    }

    query = template
    for key, value in replacements.items():
        query = query.replace(key, value)

    return query
|
|
599
|
+
|
|
600
|
+
|
|
601
|
+
def generate_expansion(
    client: "anthropic.Anthropic",
    query: str,
    model: str = "claude-sonnet-4-20250514",
    max_tokens: int = 300,
) -> str | None:
    """Generate expansion using Claude API.

    Args:
        client: Anthropic client used to issue the request.
        query: The search query to expand.
        model: Model identifier to request; defaults to the model this script
            has always used.
        max_tokens: Completion token budget for the response.

    Returns:
        The stripped response text, or ``None`` when the request fails, the
        response contains no text, or the response was cut off at
        ``max_tokens``. Failures are reported on stdout rather than raised so
        the caller can skip the query and carry on.
    """
    try:
        response = client.messages.create(
            model=model,
            max_tokens=max_tokens,
            system=SYSTEM_PROMPT,
            messages=[
                {"role": "user", "content": USER_PROMPT_TEMPLATE.format(query=query)}
            ],
        )
        # Join every text block instead of assuming content[0] exists and is
        # a text block.
        text = "".join(
            block.text
            for block in response.content
            if getattr(block, "type", None) == "text"
        ).strip()
        truncated = getattr(response, "stop_reason", None) == "max_tokens"
    except Exception as e:
        # Deliberate best-effort boundary: one failed request must not abort
        # the whole generation run.
        print(f"Error generating expansion for '{query}': {e}")
        return None

    if truncated:
        # A response cut off mid-line would put a truncated lex/vec/hyde entry
        # into the training data.
        print(f"Error generating expansion for '{query}': response truncated at max_tokens")
        return None

    return text or None
|
|
616
|
+
|
|
617
|
+
|
|
618
|
+
def validate_output(output: str) -> bool:
    """Validate that output follows the expected format.

    Every non-blank line must start with ``lex:``, ``vec:`` or ``hyde:`` and
    carry non-empty content after the prefix. At least one ``lex:`` line and
    one ``vec:`` line are required; ``hyde:`` lines are optional.

    Args:
        output: Raw model response text.

    Returns:
        ``True`` if the output is well-formed, ``False`` otherwise (including
        for empty input).
    """
    found = {"lex": False, "vec": False, "hyde": False}

    # str.split always yields at least one element, so empty input is handled
    # by the loop (no lines match) rather than by a separate emptiness check.
    for raw_line in output.strip().split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        prefix, separator, payload = line.partition(":")
        if not separator or prefix not in found:
            return False  # Invalid line type
        if not payload.strip():
            return False  # Prefix with nothing after it is not usable data

        found[prefix] = True

    return found["lex"] and found["vec"]
|
|
641
|
+
|
|
642
|
+
|
|
643
|
+
def main():
    """Command-line entry point: generate examples and write them as JSONL.

    Reads ANTHROPIC_API_KEY from the environment, takes queries from the
    optional ``--queries`` file first and from generate_random_query after
    that, expands each unique query via the API, and writes the validated
    examples to ``--output`` (one JSON object per line).

    Generation stops early, keeping whatever was collected, after too many
    consecutive attempts that yield no example (duplicates, API errors or
    invalid output), so a failing API or an exhausted query space cannot hang
    the script.

    Raises:
        SystemExit: With status 1 if ANTHROPIC_API_KEY is not set.
    """
    parser = argparse.ArgumentParser(
        description="Generate QMD query expansion training data"
    )
    parser.add_argument(
        "--count", type=int, default=100, help="Number of examples to generate"
    )
    parser.add_argument(
        "--output",
        type=str,
        default="data/qmd_expansion.jsonl",
        help="Output file path",
    )
    parser.add_argument(
        "--queries", type=str, help="Optional file with custom queries (one per line)"
    )
    args = parser.parse_args()

    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        print("Error: ANTHROPIC_API_KEY environment variable not set")
        # `exit` is injected by the site module and is not guaranteed to
        # exist; SystemExit always is.
        raise SystemExit(1)

    client = anthropic.Anthropic(api_key=api_key)
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Load custom queries if provided
    custom_queries = []
    if args.queries:
        queries_path = Path(args.queries)
        if queries_path.exists():
            # Drop blank lines so an empty file (or stray newlines) never
            # sends an empty query to the API.
            custom_queries = [
                line.strip()
                for line in queries_path.read_text(encoding="utf-8").split("\n")
                if line.strip()
            ]
            print(f"Loaded {len(custom_queries)} custom queries")
        else:
            # Keep the best-effort behaviour (fall back to generated queries)
            # but say so instead of ignoring the flag silently.
            print(f"Warning: queries file not found: {queries_path}")

    examples = []
    seen_queries = set()

    print(f"Generating {args.count} examples...")

    # Abort after this many attempts in a row that add nothing.
    max_consecutive_misses = 50
    consecutive_misses = 0
    pending_custom = iter(custom_queries)

    while len(examples) < args.count:
        if consecutive_misses >= max_consecutive_misses:
            print(
                f"Stopping early: {consecutive_misses} consecutive attempts "
                "produced no new example"
            )
            break

        # Use custom query or generate random one
        query = next(pending_custom, None)
        if query is None:
            query = generate_random_query()

        # Skip duplicates
        if query in seen_queries:
            consecutive_misses += 1
            continue
        seen_queries.add(query)

        # Generate expansion
        output = generate_expansion(client, query)
        if output and validate_output(output):
            output_items = normalize_output_items(parse_output_text(output))
            examples.append({"query": query, "output": output_items})
            consecutive_misses = 0
            print(f"[{len(examples)}/{args.count}] {query[:50]}...")
        else:
            consecutive_misses += 1
            print(f"  Skipped invalid output for: {query[:50]}...")

    # Write output
    with open(output_path, "w", encoding="utf-8") as f:
        for example in examples:
            f.write(json.dumps(example) + "\n")

    print(f"\nGenerated {len(examples)} examples to {output_path}")
|
|
711
|
+
|
|
712
|
+
|
|
713
|
+
if __name__ == "__main__":
|
|
714
|
+
main()
|