qmdr 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. package/.claude-plugin/marketplace.json +29 -0
  2. package/.env.example +85 -0
  3. package/.gitattributes +3 -0
  4. package/.github/workflows/release.yml +77 -0
  5. package/AI-SETUP.md +466 -0
  6. package/LICENSE +22 -0
  7. package/README.md +78 -0
  8. package/bun.lock +637 -0
  9. package/docs/README-zh.md +78 -0
  10. package/docs/refactor-checklist.md +54 -0
  11. package/docs/setup-openclaw.md +139 -0
  12. package/example-index.yml +33 -0
  13. package/finetune/BALANCED_DISTRIBUTION.md +157 -0
  14. package/finetune/DATA_IMPROVEMENTS.md +218 -0
  15. package/finetune/Justfile +43 -0
  16. package/finetune/Modelfile +16 -0
  17. package/finetune/README.md +299 -0
  18. package/finetune/SCORING.md +286 -0
  19. package/finetune/configs/accelerate_multi_gpu.yaml +17 -0
  20. package/finetune/configs/grpo.yaml +49 -0
  21. package/finetune/configs/sft.yaml +42 -0
  22. package/finetune/configs/sft_local.yaml +40 -0
  23. package/finetune/convert_gguf.py +221 -0
  24. package/finetune/data/best_glm_prompt.txt +17 -0
  25. package/finetune/data/gepa_generated.prompts.json +32 -0
  26. package/finetune/data/qmd_expansion_balanced_deduped.jsonl +413 -0
  27. package/finetune/data/qmd_expansion_diverse_addon.jsonl +386 -0
  28. package/finetune/data/qmd_expansion_handcrafted.jsonl +65 -0
  29. package/finetune/data/qmd_expansion_handcrafted_only.jsonl +336 -0
  30. package/finetune/data/qmd_expansion_locations.jsonl +64 -0
  31. package/finetune/data/qmd_expansion_people.jsonl +46 -0
  32. package/finetune/data/qmd_expansion_short_nontech.jsonl +200 -0
  33. package/finetune/data/qmd_expansion_v2.jsonl +1498 -0
  34. package/finetune/data/qmd_only_sampled.jsonl +399 -0
  35. package/finetune/dataset/analyze_data.py +369 -0
  36. package/finetune/dataset/clean_data.py +906 -0
  37. package/finetune/dataset/generate_balanced.py +823 -0
  38. package/finetune/dataset/generate_data.py +714 -0
  39. package/finetune/dataset/generate_data_offline.py +206 -0
  40. package/finetune/dataset/generate_diverse.py +441 -0
  41. package/finetune/dataset/generate_ollama.py +326 -0
  42. package/finetune/dataset/prepare_data.py +197 -0
  43. package/finetune/dataset/schema.py +73 -0
  44. package/finetune/dataset/score_data.py +115 -0
  45. package/finetune/dataset/validate_schema.py +104 -0
  46. package/finetune/eval.py +196 -0
  47. package/finetune/evals/queries.txt +56 -0
  48. package/finetune/gepa/__init__.py +1 -0
  49. package/finetune/gepa/best_prompt.txt +31 -0
  50. package/finetune/gepa/best_prompt_glm.txt +1 -0
  51. package/finetune/gepa/dspy_gepa.py +204 -0
  52. package/finetune/gepa/example.py +117 -0
  53. package/finetune/gepa/generate.py +129 -0
  54. package/finetune/gepa/gepa_outputs.jsonl +10 -0
  55. package/finetune/gepa/gepa_outputs_glm.jsonl +20 -0
  56. package/finetune/gepa/model.json +19 -0
  57. package/finetune/gepa/optimizer.py +70 -0
  58. package/finetune/gepa/score.py +84 -0
  59. package/finetune/jobs/eval.py +490 -0
  60. package/finetune/jobs/eval_common.py +354 -0
  61. package/finetune/jobs/eval_verbose.py +113 -0
  62. package/finetune/jobs/grpo.py +141 -0
  63. package/finetune/jobs/quantize.py +244 -0
  64. package/finetune/jobs/sft.py +121 -0
  65. package/finetune/pyproject.toml +23 -0
  66. package/finetune/reward.py +610 -0
  67. package/finetune/train.py +611 -0
  68. package/finetune/uv.lock +4070 -0
  69. package/flake.lock +61 -0
  70. package/flake.nix +83 -0
  71. package/migrate-schema.ts +162 -0
  72. package/package.json +56 -0
  73. package/skills/qmdr/SKILL.md +172 -0
  74. package/skills/qmdr/references/mcp-setup.md +88 -0
  75. package/src/app/commands/collection.ts +55 -0
  76. package/src/app/commands/context.ts +82 -0
  77. package/src/app/commands/document.ts +46 -0
  78. package/src/app/commands/maintenance.ts +60 -0
  79. package/src/app/commands/search.ts +45 -0
  80. package/src/app/ports/llm.ts +13 -0
  81. package/src/app/services/llm-service.ts +145 -0
  82. package/src/cli.test.ts +963 -0
  83. package/src/collections.ts +390 -0
  84. package/src/eval.test.ts +412 -0
  85. package/src/formatter.ts +427 -0
  86. package/src/llm.test.ts +559 -0
  87. package/src/llm.ts +1990 -0
  88. package/src/mcp.test.ts +889 -0
  89. package/src/mcp.ts +626 -0
  90. package/src/qmd.ts +3330 -0
  91. package/src/store/collections.ts +7 -0
  92. package/src/store/context.ts +10 -0
  93. package/src/store/db.ts +5 -0
  94. package/src/store/documents.ts +26 -0
  95. package/src/store/maintenance.ts +15 -0
  96. package/src/store/path.ts +13 -0
  97. package/src/store/search.ts +10 -0
  98. package/src/store-paths.test.ts +395 -0
  99. package/src/store.test.ts +2483 -0
  100. package/src/store.ts +2813 -0
  101. package/test/eval-harness.ts +223 -0
  102. package/tsconfig.json +29 -0
@@ -0,0 +1,714 @@
1
+ #!/usr/bin/env python3
2
+ """Generate synthetic training data for QMD query expansion using Claude API."""
3
+
4
+ import argparse
5
+ import json
6
+ import os
7
+ import random
8
+ from pathlib import Path
9
+
10
+ from dataset.schema import normalize_output_items, parse_output_text
11
+
12
+ try:
13
+ import anthropic
14
+ except ImportError:
15
+ print("Install anthropic: pip install anthropic")
16
+ exit(1)
17
+
18
# Sample query templates for diverse training data - organized by category.
# List positions matter: TEMPLATE_CATEGORIES below refers to templates by
# their index in this list, and per-category sampling frequency is set by
# CATEGORY_WEIGHTS, not by these section headers.
QUERY_TEMPLATES = [
    # === Technical documentation (indices 0-13) ===
    "how to {action} {technology}",
    "{technology} {concept} example",
    "configure {technology} for {use_case}",
    "{error_type} error in {technology}",
    "best practices for {concept}",
    "{technology} vs {technology2}",
    "{action} {technology} {use_case}",
    "setup {technology} {use_case}",
    "{technology} tutorial for beginners",
    "{technology} documentation",
    "{technology} {error_type} troubleshooting",
    "{concept} in {technology}",
    "migrate from {technology} to {technology2}",
    "{action} {concept} {technology}",
    # === Personal notes / journals (indices 14-21) ===
    "meeting notes {topic}",
    "ideas for {project}",
    "{date} journal entry",
    "thoughts on {topic}",
    "{project} {topic} notes",
    "{topic} meeting {date}",
    "reflect on {topic}",
    "brainstorm {project}",
    # === Research / learning (indices 22-31) ===
    "what is {concept}",
    "difference between {thing1} and {thing2}",
    "{topic} tutorial",
    "learn {skill}",
    "understand {concept}",
    "explain {concept}",
    "{topic} fundamentals",
    "intro to {skill}",
    "{thing1} or {thing2}",
    "when to use {concept}",
    # === Short / keyword queries (indices 32-37) ===
    "{keyword}",
    "{keyword} {modifier}",
    "{keyword} {action}",
    "{keyword} {use_case}",
    "{technology} {keyword}",
    "{concept} {keyword}",
    # === Temporal / recency queries (indices 38-44) ===
    "latest {topic}",
    "recent {concept} changes",
    "new {technology} features",
    "{topic} update {date}",
    "what changed in {technology}",
    "{technology} changelog {date}",
    "{topic} news {date}",
    # === Named entities / specific topics (indices 45-48) ===
    "{named_entity} {topic}",
    "{person} {concept}",
    "{organization} {use_case}",
    "{product} {action}",
]
76
+
77
# Maps each sampling category to the indices of its templates inside
# QUERY_TEMPLATES.  The ranges are contiguous and together must cover every
# template exactly once.
#
# Fix: the previous ranges had drifted out of sync with QUERY_TEMPLATES
# (research actually has 10 templates, short 6, temporal 7).  That shifted
# every later category — e.g. "when to use {concept}" was sampled as a
# "short" query — and left the final three entity templates unreachable.
TEMPLATE_CATEGORIES = {
    "technical": list(range(0, 14)),  # 14 technical-documentation templates
    "personal": list(range(14, 22)),  # 8 personal notes / journal templates
    "research": list(range(22, 32)),  # 10 research / learning templates
    "short": list(range(32, 38)),  # 6 short keyword templates
    "temporal": list(range(38, 45)),  # 7 temporal / recency templates
    "entities": list(range(45, 49)),  # 4 named-entity templates
}
86
+
87
# Vocabulary pools used to fill the {placeholder} slots in QUERY_TEMPLATES.

# Fills {action}.
ACTIONS = [
    "install", "configure", "setup", "debug", "deploy", "test", "optimize",
    "migrate", "build", "run", "lint", "format", "backup", "restore",
    "update", "rollback", "monitor", "scale", "secure", "integrate",
    "automate", "refactor", "initialize",
]

# Fills {technology} and {skill}.
TECHNOLOGIES = [
    # Languages
    "python", "typescript", "javascript", "rust", "golang", "java", "kotlin",
    "swift", "ruby", "php", "cpp", "c", "elixir", "scala", "clojure", "dart",
    # Frameworks/Frontend
    "react", "vue", "angular", "svelte", "solid", "htmx", "alpine", "nextjs",
    "nuxt",
    # Backend
    "django", "flask", "fastapi", "express", "rails", "spring", "laravel",
    # Infrastructure
    "docker", "kubernetes", "terraform", "ansible", "jenkins",
    "github-actions",
    # Databases
    "postgres", "mysql", "mongodb", "redis", "elasticsearch", "sqlite",
    "dynamodb", "cassandra", "cockroachdb", "supabase", "firebase",
    # Tools
    "git", "nginx", "apache", "linux", "aws", "gcp", "azure", "vercel",
    "netlify",
    # Data/ML
    "pandas", "numpy", "tensorflow", "pytorch", "scikit-learn", "jupyter",
    "spark", "kafka", "airflow", "dbt",
]

# Fills {technology2} (the "other side" of vs/migration templates).
TECHNOLOGIES_2 = [
    "docker", "kubernetes", "postgres", "mysql", "redis", "mongodb", "aws",
    "gcp", "react", "vue", "angular", "python", "javascript", "typescript",
    "github-actions", "gitlab-ci", "jenkins", "terraform", "ansible",
]

# Fills {concept}; the first 10 entries also feed {thing1} and the
# remainder feeds {thing2} (see generate_random_query).
CONCEPTS = [
    "authentication", "caching", "logging", "testing", "deployment", "API",
    "database", "security", "monitoring", "performance", "scalability",
    "reliability", "observability", "microservices", "serverless",
    "virtualization", "containerization", "orchestration", "CI/CD",
    "version control", "dependency injection", "event sourcing", "CQRS",
    "load balancing", "rate limiting", "circuit breaker", "retry logic",
    "idempotency",
]

# Fills {use_case}.
USE_CASES = [
    "production", "development", "CI/CD", "local", "cloud", "staging",
    "testing", "microservices", "serverless", "hybrid", "multi-tenant",
    "high-availability", "real-time", "batch processing",
    "stream processing", "data pipeline",
]

# Fills {error_type}.
ERROR_TYPES = [
    "connection", "timeout", "permission", "memory", "syntax", "runtime",
    "configuration", "dependency", "network", "authentication",
    "authorization", "validation", "concurrency", "deadlock", "resource",
    "quota",
]

# Fills {topic}.
TOPICS = [
    "productivity", "workflow", "architecture", "design", "performance",
    "security", "scalability", "reliability", "observability",
    "maintainability", "testing", "documentation", "refactoring",
    "debugging", "optimization", "best practices", "patterns",
    "anti-patterns", "trade-offs", "decision making",
]

# Fills {keyword} (deliberately terse, search-box style terms).
KEYWORDS = [
    "auth", "config", "setup", "api", "cache", "log", "test", "debug",
    "env", "vars", "secrets", "tokens", "headers", "params", "query",
    "body", "route", "middleware", "handler", "controller", "model",
    "view", "template", "migration", "seed", "fixture", "mock", "stub",
    "spy", "fake", "build", "bundle", "compile", "transpile", "minify",
    "optimize", "deploy", "release", "rollback", "promote", "freeze",
    "thaw", "pull", "push", "commit", "merge", "rebase", "cherry-pick",
    "stash", "up", "down", "scale", "restart", "reload", "refresh",
    "flush", "cron", "queue", "job", "worker", "scheduler", "trigger",
    "webhook", "alert", "metric", "trace", "span", "event", "incident",
    "oncall",
]

# Fills {modifier}.
MODIFIERS = [
    "best", "fast", "simple", "advanced", "secure", "quick", "easy",
    "proper", "correct", "safe", "efficient", "reliable", "robust",
    "latest", "recent", "new", "old", "legacy", "modern", "local",
    "remote", "global", "shared", "private", "public",
]

# Fills {named_entity}.
NAMED_ENTITIES = [
    "React", "Vue", "Angular", "Docker", "Kubernetes", "AWS", "GCP",
    "GitHub", "GitLab", "Vercel", "Netlify", "Supabase", "Firebase",
    "Stripe", "Twilio", "SendGrid", "Datadog", "PagerDuty", "Sentry",
    "Terraform", "Ansible", "Jenkins", "CircleCI", "TravisCI",
]

# Fills {person}.
PERSONS = [
    "Kent Beck", "Martin Fowler", "Robert Martin", "Dave Thomas",
    "Guido van Rossum", "Brendan Eich", "Ryan Dahl", "Anders Hejlsberg",
    "Linus Torvalds", "DHH", "Yukihiro Matsumoto", "Rich Hickey",
]

# Fills {organization}.
ORGANIZATIONS = [
    "Google", "Microsoft", "Amazon", "Meta", "Apple", "Netflix", "Spotify",
    "Stripe", "Shopify", "Airbnb", "Uber", "Lyft", "Slack", "Discord",
]

# Fills {product}.
PRODUCTS = [
    "VS Code", "IntelliJ", "PyCharm", "WebStorm", "DataGrip", "Postman",
    "Insomnia", "TablePlus", "Docker Desktop", "Lens", "Figma", "Sketch",
    "Notion", "Linear", "Jira", "Trello",
]
483
+
484
# System prompt for the Claude API call in generate_expansion.  It defines
# the strict "hyde:/lex:/vec:" line protocol that validate_output and
# dataset.schema.parse_output_text rely on — keep the three prefixes in
# sync with those consumers if this ever changes.
SYSTEM_PROMPT = """You are a search query optimization expert for a markdown document search system called QMD.

Your task is to transform user queries into retrieval-optimized outputs with THREE distinct types:

1. **lex** lines: Keyword variations optimized for BM25 full-text search
- Short, keyword-focused
- Good for exact term matching
- 1-3 lines

2. **vec** lines: Semantic reformulations for vector/embedding search
- Complete phrases or questions
- Capture semantic meaning
- 1-3 lines

3. **hyde** line: A hypothetical document passage (HyDE technique)
- A realistic passage that would answer the query
- Contains domain-specific terminology
- Written as if it's FROM a document, not ABOUT the query
- MAX 1 line

Output format (STRICT - follow exactly):
```
hyde: A passage that would appear in a document answering this query.
lex: keyword1
lex: keyword2
vec: semantic query reformulation
```

Rules:
- Each line must start with "lex:", "vec:", or "hyde:"
- No blank lines
- No repetition between lines
- hyde should be a realistic document excerpt, not a question
- Stay focused on the original query intent"""

# Per-request user message; {query} is filled via str.format in
# generate_expansion.
USER_PROMPT_TEMPLATE = """Generate query expansion outputs for this search query:

Query: {query}

Respond with ONLY the lex/vec/hyde lines, nothing else."""
524
+
525
+
526
# Category weights - BALANCED approach
# Tech at 15% (reasonable for QMD's technical document use case)
#
# NOTE(review): "health", "finance", "lifestyle" and "education" have no
# entries in TEMPLATE_CATEGORIES in this module, so template-based sampling
# can only serve the first six categories — confirm whether the extra
# categories are meant to be covered by templates defined elsewhere.
CATEGORY_WEIGHTS = {
    "technical": 0.15,  # 15% - Technical documentation
    "personal": 0.10,  # 10% - Personal notes, journals
    "research": 0.10,  # 10% - Research and learning
    "short": 0.15,  # 15% - Short keyword queries
    "temporal": 0.10,  # 10% - Temporal/recency queries (2025/2026)
    "entities": 0.05,  # 5% - Named entity queries
    "health": 0.10,  # 10% - Health & wellness
    "finance": 0.10,  # 10% - Finance & business
    "lifestyle": 0.10,  # 10% - Home, food, hobbies, travel
    "education": 0.05,  # 5% - Education & arts
}
540
+
541
+
542
def generate_random_query() -> str:
    """Generate a random query from templates with category-weighted sampling.

    Returns:
        A query string with every ``{placeholder}`` substituted.

    Fix: CATEGORY_WEIGHTS also lists categories ("health", "finance",
    "lifestyle", "education") that have no entries in TEMPLATE_CATEGORIES;
    drawing one of those previously raised KeyError on the template lookup.
    The draw is now restricted to categories that actually have templates,
    which redistributes the orphaned weight proportionally
    (random.choices normalizes whatever weights it is given).
    """
    # Only categories that can be resolved to templates take part in the draw.
    categories = [c for c in CATEGORY_WEIGHTS if c in TEMPLATE_CATEGORIES]
    weights = [CATEGORY_WEIGHTS[c] for c in categories]
    selected_category = random.choices(categories, weights=weights, k=1)[0]

    # Select template from that category
    template_idx = random.choice(TEMPLATE_CATEGORIES[selected_category])
    template = QUERY_TEMPLATES[template_idx]

    # Build replacements for every known placeholder; each template only
    # uses a subset of these keys, the rest are harmless no-op replaces.
    replacements = {
        "{action}": random.choice(ACTIONS),
        "{technology}": random.choice(TECHNOLOGIES),
        "{technology2}": random.choice(TECHNOLOGIES_2),
        "{concept}": random.choice(CONCEPTS),
        "{use_case}": random.choice(USE_CASES),
        "{error_type}": random.choice(ERROR_TYPES),
        "{topic}": random.choice(TOPICS),
        "{project}": random.choice(
            ["website", "app", "CLI tool", "API", "library", "service", "platform"]
        ),
        "{date}": random.choice(
            # Emphasize 2025/2026 for recency queries (current era);
            # duplicated entries intentionally raise their probability.
            [
                "2026",
                "2026",
                "2025",
                "2025",
                "January 2026",
                "February 2026",
                "March 2026",
                "last month",
                "this week",
                "yesterday",
                "today",
                "recently",
                "latest",
            ]
        ),
        "{thing1}": random.choice(CONCEPTS[:10]),
        "{thing2}": random.choice(CONCEPTS[10:] if len(CONCEPTS) > 10 else CONCEPTS),
        "{skill}": random.choice(TECHNOLOGIES),
        "{keyword}": random.choice(KEYWORDS),
        "{modifier}": random.choice(MODIFIERS),
        "{named_entity}": random.choice(NAMED_ENTITIES),
        "{person}": random.choice(PERSONS),
        "{organization}": random.choice(ORGANIZATIONS),
        "{product}": random.choice(PRODUCTS),
    }

    query = template
    for key, value in replacements.items():
        query = query.replace(key, value)

    return query
599
+
600
+
601
def generate_expansion(client: anthropic.Anthropic, query: str) -> str | None:
    """Ask Claude to expand *query*; return the stripped reply text.

    Any API or response-shape failure is reported to stdout and turned into
    a ``None`` return so the caller can simply skip the query.
    """
    user_message = {
        "role": "user",
        "content": USER_PROMPT_TEMPLATE.format(query=query),
    }
    try:
        reply = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=300,
            system=SYSTEM_PROMPT,
            messages=[user_message],
        )
        return reply.content[0].text.strip()
    except Exception as e:  # best-effort: log and let the caller move on
        print(f"Error generating expansion for '{query}': {e}")
        return None
616
+
617
+
618
def validate_output(output: str) -> bool:
    """Validate that *output* follows the expected expansion format.

    Every non-blank line must start with ``lex:``, ``vec:`` or ``hyde:``,
    and at least one ``lex:`` line and one ``vec:`` line must be present
    (``hyde:`` is optional).  Returns False on any unrecognized line.
    """
    seen_prefixes: set[str] = set()

    for raw_line in output.strip().split("\n"):
        candidate = raw_line.strip()
        if not candidate:
            continue
        for prefix in ("lex:", "vec:", "hyde:"):
            if candidate.startswith(prefix):
                seen_prefixes.add(prefix)
                break
        else:
            # Line with an unknown prefix -> reject the whole output.
            return False

    return "lex:" in seen_prefixes and "vec:" in seen_prefixes
641
+
642
+
643
def main() -> None:
    """CLI entry point.

    Generates ``--count`` query-expansion examples via the Anthropic API and
    writes them as JSONL objects ({"query": ..., "output": ...}) to
    ``--output``.  Queries come from ``--queries`` (one per line) first, then
    from random template sampling.  Requires ANTHROPIC_API_KEY in the
    environment; exits with status 1 if it is missing.
    """
    parser = argparse.ArgumentParser(
        description="Generate QMD query expansion training data"
    )
    parser.add_argument(
        "--count", type=int, default=100, help="Number of examples to generate"
    )
    parser.add_argument(
        "--output",
        type=str,
        default="data/qmd_expansion.jsonl",
        help="Output file path",
    )
    parser.add_argument(
        "--queries", type=str, help="Optional file with custom queries (one per line)"
    )
    args = parser.parse_args()

    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        print("Error: ANTHROPIC_API_KEY environment variable not set")
        exit(1)

    client = anthropic.Anthropic(api_key=api_key)
    output_path = Path(args.output)
    # Create parent directories so the final write cannot fail on a
    # missing path after all the API spend.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Load custom queries if provided (silently ignored if the file is missing)
    custom_queries = []
    if args.queries and Path(args.queries).exists():
        custom_queries = Path(args.queries).read_text().strip().split("\n")
        print(f"Loaded {len(custom_queries)} custom queries")

    examples = []
    seen_queries = set()  # dedupe across both custom and random sources

    print(f"Generating {args.count} examples...")

    # i indexes into custom_queries; once exhausted, fall back to random
    # generation.  The loop keeps drawing until --count VALID examples
    # exist, so invalid API outputs do not shrink the dataset.
    i = 0
    while len(examples) < args.count:
        # Use custom query or generate random one
        if custom_queries and i < len(custom_queries):
            query = custom_queries[i].strip()
        else:
            query = generate_random_query()

        i += 1

        # Skip duplicates
        if query in seen_queries:
            continue
        seen_queries.add(query)

        # Generate expansion; None (API failure) or malformed text is skipped.
        output = generate_expansion(client, query)
        if output and validate_output(output):
            # Normalize into the structured item schema shared with the
            # other dataset generators (dataset.schema).
            output_items = normalize_output_items(parse_output_text(output))
            examples.append({"query": query, "output": output_items})
            print(f"[{len(examples)}/{args.count}] {query[:50]}...")
        else:
            print(f" Skipped invalid output for: {query[:50]}...")

    # Write output: one JSON object per line (JSONL)
    with open(output_path, "w") as f:
        for example in examples:
            f.write(json.dumps(example) + "\n")

    print(f"\nGenerated {len(examples)} examples to {output_path}")


if __name__ == "__main__":
    main()