qmdr 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +29 -0
- package/.env.example +85 -0
- package/.gitattributes +3 -0
- package/.github/workflows/release.yml +77 -0
- package/AI-SETUP.md +466 -0
- package/LICENSE +22 -0
- package/README.md +78 -0
- package/bun.lock +637 -0
- package/docs/README-zh.md +78 -0
- package/docs/refactor-checklist.md +54 -0
- package/docs/setup-openclaw.md +139 -0
- package/example-index.yml +33 -0
- package/finetune/BALANCED_DISTRIBUTION.md +157 -0
- package/finetune/DATA_IMPROVEMENTS.md +218 -0
- package/finetune/Justfile +43 -0
- package/finetune/Modelfile +16 -0
- package/finetune/README.md +299 -0
- package/finetune/SCORING.md +286 -0
- package/finetune/configs/accelerate_multi_gpu.yaml +17 -0
- package/finetune/configs/grpo.yaml +49 -0
- package/finetune/configs/sft.yaml +42 -0
- package/finetune/configs/sft_local.yaml +40 -0
- package/finetune/convert_gguf.py +221 -0
- package/finetune/data/best_glm_prompt.txt +17 -0
- package/finetune/data/gepa_generated.prompts.json +32 -0
- package/finetune/data/qmd_expansion_balanced_deduped.jsonl +413 -0
- package/finetune/data/qmd_expansion_diverse_addon.jsonl +386 -0
- package/finetune/data/qmd_expansion_handcrafted.jsonl +65 -0
- package/finetune/data/qmd_expansion_handcrafted_only.jsonl +336 -0
- package/finetune/data/qmd_expansion_locations.jsonl +64 -0
- package/finetune/data/qmd_expansion_people.jsonl +46 -0
- package/finetune/data/qmd_expansion_short_nontech.jsonl +200 -0
- package/finetune/data/qmd_expansion_v2.jsonl +1498 -0
- package/finetune/data/qmd_only_sampled.jsonl +399 -0
- package/finetune/dataset/analyze_data.py +369 -0
- package/finetune/dataset/clean_data.py +906 -0
- package/finetune/dataset/generate_balanced.py +823 -0
- package/finetune/dataset/generate_data.py +714 -0
- package/finetune/dataset/generate_data_offline.py +206 -0
- package/finetune/dataset/generate_diverse.py +441 -0
- package/finetune/dataset/generate_ollama.py +326 -0
- package/finetune/dataset/prepare_data.py +197 -0
- package/finetune/dataset/schema.py +73 -0
- package/finetune/dataset/score_data.py +115 -0
- package/finetune/dataset/validate_schema.py +104 -0
- package/finetune/eval.py +196 -0
- package/finetune/evals/queries.txt +56 -0
- package/finetune/gepa/__init__.py +1 -0
- package/finetune/gepa/best_prompt.txt +31 -0
- package/finetune/gepa/best_prompt_glm.txt +1 -0
- package/finetune/gepa/dspy_gepa.py +204 -0
- package/finetune/gepa/example.py +117 -0
- package/finetune/gepa/generate.py +129 -0
- package/finetune/gepa/gepa_outputs.jsonl +10 -0
- package/finetune/gepa/gepa_outputs_glm.jsonl +20 -0
- package/finetune/gepa/model.json +19 -0
- package/finetune/gepa/optimizer.py +70 -0
- package/finetune/gepa/score.py +84 -0
- package/finetune/jobs/eval.py +490 -0
- package/finetune/jobs/eval_common.py +354 -0
- package/finetune/jobs/eval_verbose.py +113 -0
- package/finetune/jobs/grpo.py +141 -0
- package/finetune/jobs/quantize.py +244 -0
- package/finetune/jobs/sft.py +121 -0
- package/finetune/pyproject.toml +23 -0
- package/finetune/reward.py +610 -0
- package/finetune/train.py +611 -0
- package/finetune/uv.lock +4070 -0
- package/flake.lock +61 -0
- package/flake.nix +83 -0
- package/migrate-schema.ts +162 -0
- package/package.json +56 -0
- package/skills/qmdr/SKILL.md +172 -0
- package/skills/qmdr/references/mcp-setup.md +88 -0
- package/src/app/commands/collection.ts +55 -0
- package/src/app/commands/context.ts +82 -0
- package/src/app/commands/document.ts +46 -0
- package/src/app/commands/maintenance.ts +60 -0
- package/src/app/commands/search.ts +45 -0
- package/src/app/ports/llm.ts +13 -0
- package/src/app/services/llm-service.ts +145 -0
- package/src/cli.test.ts +963 -0
- package/src/collections.ts +390 -0
- package/src/eval.test.ts +412 -0
- package/src/formatter.ts +427 -0
- package/src/llm.test.ts +559 -0
- package/src/llm.ts +1990 -0
- package/src/mcp.test.ts +889 -0
- package/src/mcp.ts +626 -0
- package/src/qmd.ts +3330 -0
- package/src/store/collections.ts +7 -0
- package/src/store/context.ts +10 -0
- package/src/store/db.ts +5 -0
- package/src/store/documents.ts +26 -0
- package/src/store/maintenance.ts +15 -0
- package/src/store/path.ts +13 -0
- package/src/store/search.ts +10 -0
- package/src/store-paths.test.ts +395 -0
- package/src/store.test.ts +2483 -0
- package/src/store.ts +2813 -0
- package/test/eval-harness.ts +223 -0
- package/tsconfig.json +29 -0
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# /// script
|
|
3
|
+
# requires-python = ">=3.10"
|
|
4
|
+
# dependencies = [
|
|
5
|
+
# "datasets",
|
|
6
|
+
# ]
|
|
7
|
+
# ///
|
|
8
|
+
"""
|
|
9
|
+
Generate QMD training data by transforming s-emanuilov/query-expansion dataset
|
|
10
|
+
and adding synthetic hyde passages. No API calls needed.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import json
|
|
14
|
+
import random
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
from dataset.schema import normalize_output_items, parse_output_text
|
|
18
|
+
|
|
19
|
+
# HyDE passage templates for different query types
# Keyed by query category (the strings classify_query() returns); each value
# is a list of fill-in-the-blank passage skeletons with {placeholder} slots.
# NOTE(review): generate_hyde() below builds its own inline f-string
# templates and never reads this table — it appears to be dead data in this
# module; confirm nothing imports it before removing.
HYDE_TEMPLATES = {
    "how_to": [
        "To {action}, you need to {steps}. This can be done by {method}.",
        "The recommended way to {action} is to first {step1}, then {step2}.",
        "{Topic} can be achieved by {method}. Make sure to {consideration}.",
    ],
    "what_is": [
        "{Topic} is a {category} that {description}. It is commonly used for {use_case}.",
        "{Topic} refers to {definition}. Key features include {features}.",
    ],
    "config": [
        "To configure {topic}, set the {setting} option to {value}. You can also customize {other}.",
        "Configuration for {topic} is done in the {file} file. Key settings include {settings}.",
    ],
    "error": [
        "The {error} error occurs when {cause}. To fix this, {solution}.",
        "If you encounter {error}, check that {check}. Common solutions include {solutions}.",
    ],
    "general": [
        "{Topic} provides {benefit} for {use_case}. It works by {mechanism}.",
        "When working with {topic}, consider {considerations}. Best practices include {practices}.",
    ],
}
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def classify_query(query: str) -> str:
    """Bucket *query* into a category used to pick a hyde template.

    Rules are checked in priority order, so e.g. "how to fix error"
    classifies as "how_to" rather than "error". Matching is a simple
    case-insensitive substring test; "general" is the fallback.
    """
    lowered = query.lower()
    # (category, marker substrings) in priority order.
    rules = (
        ("how_to", ("how to", "how do", "setup", "install", "configure", "create")),
        ("what_is", ("what is", "what are", "definition", "meaning")),
        ("config", ("config", "setting", "option")),
        ("error", ("error", "issue", "problem", "fix", "debug")),
    )
    for category, markers in rules:
        if any(marker in lowered for marker in markers):
            return category
    return "general"
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def extract_topic(query: str) -> str:
    """Strip a leading question/instruction phrase from *query*, if any.

    Matching is case-insensitive, but the returned remainder keeps the
    original casing. Only the first matching prefix is removed; queries
    without a recognized prefix are returned unchanged.
    """
    leaders = (
        "how to ",
        "how do i ",
        "what is ",
        "what are ",
        "configure ",
        "setup ",
    )
    lowered = query.lower()
    for leader in leaders:
        if lowered.startswith(leader):
            return query[len(leader):].strip()
    return query
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def generate_hyde(query: str, expansions: list[str]) -> str:
    """Compose a synthetic "hypothetical document" passage for *query*.

    The longest expansion is treated as the most descriptive and is woven
    into one of three passage skeletons chosen by the query's category.
    Draws from the module-level `random` state (no internal seeding), so
    callers control reproducibility by seeding beforehand.
    """
    topic = extract_topic(query)
    query_type = classify_query(query)

    # Longest expansion first; fall back to the topic itself when the
    # caller supplied no expansions at all.
    by_length = sorted(expansions, key=len, reverse=True)
    main_exp = by_length[0] if by_length else topic

    # Three candidate passages per category, same wording for each branch.
    passages_by_type = {
        "how_to": [
            f"To {topic}, start by reviewing the requirements and dependencies. {main_exp.capitalize()} is the recommended approach. Make sure all prerequisites are met before proceeding.",
            f"The process of {topic} involves several steps. First, {main_exp}. Follow the official documentation for detailed instructions.",
            f"When you need to {topic}, the most effective method is to {main_exp}. This ensures compatibility and follows best practices.",
        ],
        "what_is": [
            f"{topic.capitalize()} refers to {main_exp}. It is widely used in various applications and provides significant benefits.",
            f"The concept of {topic} encompasses {main_exp}. Understanding this is essential for effective implementation.",
            f"{topic.capitalize()} is defined as {main_exp}. This plays a crucial role in modern development practices.",
        ],
        "config": [
            f"Configuration for {topic} requires setting the appropriate parameters. {main_exp.capitalize()} should be adjusted based on your specific requirements.",
            f"To configure {topic}, modify the settings in your configuration file. Key options include those related to {main_exp}.",
            f"The {topic} configuration can be customized by {main_exp}. Default values work for most use cases.",
        ],
        "error": [
            f"The {topic} issue typically occurs when dependencies are misconfigured. To resolve this, {main_exp}. Check your environment settings.",
            f"If you encounter problems with {topic}, verify that {main_exp}. Common solutions include updating dependencies and checking permissions.",
            f"Debugging {topic} requires understanding the root cause. Often, {main_exp} resolves the issue. Review logs for details.",
        ],
        "general": [
            f"{topic.capitalize()} is an important concept that relates to {main_exp}. It provides functionality for various use cases in software development.",
            f"Understanding {topic} is essential for modern development. Key aspects include {main_exp}. This knowledge helps in building robust applications.",
            f"The topic of {topic} covers {main_exp}. Proper implementation follows established patterns and best practices.",
        ],
    }

    return random.choice(passages_by_type[query_type])
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def transform_to_qmd_format(query: str, expansions: list[str]) -> str:
    """Render *query*'s expansions as QMD-style `hyde:`/`lex:`/`vec:` lines.

    Output order: one hyde line, then up to two lex lines (short,
    keyword-style), then up to two vec lines (longer, semantic phrases).
    Candidates that merely repeat the query (case-insensitively) are
    dropped from the lex/vec sections.
    """
    rendered = [f"hyde: {generate_hyde(query, expansions)}"]
    query_lc = query.lower()

    # lex: clip long expansions down to their first three words.
    shortened = [
        exp if len(exp.split()) <= 4 else " ".join(exp.split()[:3])
        for exp in expansions
    ]
    rendered.extend(
        f"lex: {cand}" for cand in shortened[:2] if cand.lower() != query_lc
    )

    # vec: prefer expansions long enough to carry semantic content;
    # fall back to everything when none qualify.
    semantic = [exp for exp in expansions if len(exp.split()) >= 3] or expansions
    rendered.extend(
        f"vec: {cand}" for cand in semantic[:2] if cand.lower() != query_lc
    )

    return "\n".join(rendered)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def main():
    """Build data/qmd_expansion.jsonl from the s-emanuilov/query-expansion set.

    Loads the dataset (installing `datasets` on the fly if missing),
    rewrites every example into QMD lex/vec/hyde form, shuffles the result
    deterministically, and writes one JSON object per line.
    """
    try:
        from datasets import load_dataset
    except ImportError:
        # Best-effort bootstrap so the script works in a fresh environment.
        print("Installing datasets...")
        import subprocess

        subprocess.run(["uv", "pip", "install", "datasets"], check=True)
        from datasets import load_dataset

    print("Loading s-emanuilov/query-expansion dataset...")
    dataset = load_dataset("s-emanuilov/query-expansion", split="train")

    print(f"Loaded {len(dataset)} examples")

    output_path = Path("data/qmd_expansion.jsonl")
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Seed BEFORE transforming: generate_hyde() draws from the module RNG,
    # so seeding here makes both the hyde template choices and the later
    # shuffle reproducible across runs (previously only the shuffle was
    # seeded, leaving the generated passages nondeterministic).
    random.seed(42)

    # Transform each example into {"query": ..., "output": [...]}.
    examples = []
    for item in dataset:
        query = item["query"]
        expansions = item["expansions"]

        output = transform_to_qmd_format(query, expansions)
        output_items = normalize_output_items(parse_output_text(output))
        examples.append({"query": query, "output": output_items})

    random.shuffle(examples)

    # Write output: one JSON object per line (JSONL).
    with open(output_path, "w", encoding="utf-8") as f:
        for ex in examples:
            f.write(json.dumps(ex) + "\n")

    print(f"Generated {len(examples)} examples to {output_path}")

    # Guard against an empty dataset before indexing examples[0].
    if not examples:
        return

    # Show sample
    print("\nSample output:")
    print("-" * 50)
    sample = examples[0]
    print(f"Input: {sample['query']}")
    print(f"Output: {sample['output']}")
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
# Run only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,441 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Generate diverse QMD training examples for underrepresented categories.
|
|
4
|
+
|
|
5
|
+
This script creates additional training examples focused on:
|
|
6
|
+
- Trivia, Geography, Philosophy, History (as requested)
|
|
7
|
+
- Temporal/Recency queries (important for evals)
|
|
8
|
+
- Named entity queries (critical for entity preservation scoring)
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import random
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from datetime import datetime, timedelta
|
|
15
|
+
|
|
16
|
+
from dataset.schema import normalize_output_items, parse_output_text
|
|
17
|
+
|
|
18
|
+
# Additional diverse query categories
# Each list below is a hand-curated batch of search-style queries for one
# topical category; main() concatenates them all into the training set.

# General-knowledge / trivia style queries.
TRIVIA_QUERIES = [
    "world capitals quiz",
    "trivia facts about space",
    "did you know history",
    "random science facts",
    "famous inventions timeline",
    "world records list",
    "fun geography facts",
    "historical trivia questions",
    "animal trivia facts",
    "sports trivia records",
]

# Physical and political geography.
GEOGRAPHY_QUERIES = [
    "largest countries by area",
    "rivers that cross multiple countries",
    "highest mountain peaks",
    "desert climate zones",
    "island nations list",
    "capital cities europe",
    "population by continent",
    "time zones map",
    "latitude longitude coordinates",
    "borders between countries",
    "ocean currents patterns",
    "tectonic plate boundaries",
    "climate zones earth",
]

# Philosophers, schools of thought, and core concepts.
PHILOSOPHY_QUERIES = [
    "stoicism daily practice",
    "existentialism meaning life",
    "utilitarianism ethics explained",
    "kant categorical imperative",
    "free will determinism debate",
    "nietzsche will to power",
    "socrates method questioning",
    "plato theory forms",
    "aristotle virtue ethics",
    "descartes cogito ergo sum",
    "logic propositional calculus",
    "epistemology knowledge theory",
    "metaphysics existence reality",
]

# Historical eras and events.
HISTORY_QUERIES = [
    "ancient civilizations timeline",
    "roman empire fall reasons",
    "medieval period events",
    "renaissance art movement",
    "industrial revolution inventions",
    "world war i causes",
    "cold war key events",
    "french revolution timeline",
    "american civil war battles",
    "egyptian pharaohs dynasty",
    "bronze age collapse",
    "byzantine empire history",
    "vietnam war timeline",
]

# Natural-science topics.
SCIENCE_QUERIES = [
    "quantum mechanics basics",
    "theory of relativity explained",
    "dna structure discovery",
    "photosynthesis process steps",
    "black holes physics",
    "plate tectonics theory",
    "evolution natural selection",
    "periodic table elements",
    "cell biology fundamentals",
    "climate change evidence",
]

# Visual arts, music, literature, film.
ARTS_CULTURE_QUERIES = [
    "impressionist painters list",
    "shakespeare plays summary",
    "classical music composers",
    "modern art movements",
    "film noir characteristics",
    "jazz history origins",
    "renaissance sculpture techniques",
    "photography composition rules",
    "poetry forms haiku",
    "baroque art characteristics",
    "street art graffiti history",
]

# Health, wellness, and medicine.
HEALTH_MEDICINE_QUERIES = [
    "symptoms of vitamin deficiency",
    "how vaccines work immune system",
    "blood pressure normal range",
    "sleep hygiene tips",
    "intermittent fasting benefits",
    "anxiety coping strategies",
    "stretching exercises back pain",
    "heart disease prevention",
    "diabetes type 2 management",
    "meditation mental health",
    "nutrition macros explained",
    "first aid basics",
]

# Personal finance and business.
BUSINESS_FINANCE_QUERIES = [
    "compound interest calculator",
    "stock market basics beginners",
    "startup funding stages",
    "tax deductions small business",
    "budgeting methods 50 30 20",
    "cryptocurrency explained simply",
    "inflation effects on savings",
    "retirement planning strategies",
    "passive income ideas",
    "venture capital vs angel investors",
    "balance sheet basics",
    "supply chain management",
]

# Sports technique and training.
SPORTS_QUERIES = [
    "marathon training schedule",
    "weightlifting proper form",
    "swimming stroke techniques",
    "tennis serve mechanics",
    "basketball dribbling drills",
    "soccer formations tactics",
    "golf swing fundamentals",
    "yoga poses beginners",
    "running injury prevention",
    "cycling gear ratios",
    "rock climbing grades",
    "surfing wave types",
]

# Travel planning and logistics.
TRAVEL_QUERIES = [
    "best time visit japan",
    "travel packing checklist",
    "budget backpacking europe",
    "visa requirements usa",
    "jet lag remedies",
    "road trip planning tips",
    "solo travel safety",
    "airport security rules",
    "travel insurance coverage",
    "language apps learning",
    "hostel vs hotel comparison",
    "travel photography tips",
]

# Cooking techniques and food knowledge.
FOOD_COOKING_QUERIES = [
    "bread baking techniques",
    "knife skills basics",
    "fermentation at home",
    "meal prep weekly",
    "spice combinations guide",
    "pasta making fresh",
    "coffee brewing methods",
    "wine pairing basics",
    "vegetarian protein sources",
    "food storage guidelines",
    "sourdough starter maintenance",
    "grilling temperature chart",
]

# Psychology concepts and self-improvement.
PSYCHOLOGY_QUERIES = [
    "cognitive biases list",
    "attachment theory styles",
    "maslow hierarchy needs",
    "growth mindset vs fixed",
    "emotional intelligence components",
    "memory techniques mnemonics",
    "habit formation science",
    "stress response fight flight",
    "personality types myers briggs",
    "motivation intrinsic extrinsic",
    "decision making psychology",
    "procrastination causes solutions",
]

# Environment, sustainability, and nature.
ENVIRONMENT_NATURE_QUERIES = [
    "renewable energy types",
    "carbon footprint reduction",
    "composting basics home",
    "endangered species list",
    "recycling symbols meaning",
    "ocean plastic pollution",
    "deforestation effects",
    "sustainable living tips",
    "wildlife conservation efforts",
    "solar panel installation",
    "water conservation methods",
    "biodiversity importance",
]

# Mathematics topics.
MATH_QUERIES = [
    "calculus derivatives explained",
    "probability basics statistics",
    "linear algebra matrices",
    "geometry proofs theorems",
    "logarithms rules properties",
    "trigonometry identities",
    "set theory basics",
    "prime numbers properties",
    "fractions decimals conversion",
    "algebra equations solving",
    "graph theory fundamentals",
    "combinatorics permutations",
]

# Language learning and linguistics.
LANGUAGE_QUERIES = [
    "spanish verb conjugation",
    "japanese hiragana katakana",
    "french pronunciation rules",
    "german cases grammar",
    "mandarin tones guide",
    "latin phrases common",
    "arabic alphabet basics",
    "english idioms meanings",
    "sign language basics",
    "etymology word origins",
    "grammar punctuation rules",
    "writing style guides",
]

# Hands-on crafts and home projects.
DIY_CRAFTS_QUERIES = [
    "woodworking joints types",
    "knitting patterns beginners",
    "home repair basics",
    "sewing machine threading",
    "painting techniques acrylic",
    "pottery wheel basics",
    "electronics soldering guide",
    "gardening soil preparation",
    "candle making supplies",
    "leather crafting tools",
    "origami folding instructions",
    "furniture restoration tips",
]

# Temporal/Recency queries (matches evals/queries.txt requirements)
# Templates with {topic}/{year}/{month} slots, instantiated by
# generate_temporal_queries() below.
TEMPORAL_TEMPLATES = [
    "latest {topic} updates",
    "recent {topic} changes {year}",
    "what changed in {topic} {year}",
    "{topic} changelog {year}",
    "{topic} new features {year}",
    "{topic} latest version release",
    "{topic} recent news {month}",
]

# Subjects substituted into the {topic} slot of TEMPORAL_TEMPLATES.
TEMPORAL_TOPICS = [
    "Shopify",
    "React",
    "Kubernetes",
    "Docker",
    "TypeScript",
    "Python",
    "AWS",
    "GitHub",
    "Next.js",
    "Vue",
    "AI",
    "machine learning",
    "climate tech",
    "space exploration",
]

# Named entity queries (critical for entity preservation testing)
# Queries containing proper nouns that an expansion model must not drop.
NAMED_ENTITY_QUERIES = [
    "who is TDS motorsports",
    "React hooks tutorial",
    "Docker container networking",
    "Kubernetes pod deployment",
    "AWS Lambda functions setup",
    "Stripe payment integration",
    "GitHub Actions workflow",
    "Vercel deployment guide",
    "Supabase auth configuration",
    "Twilio SMS API",
    "Datadog monitoring setup",
    "Sentry error tracking",
    "Terraform AWS provider",
    "Ansible playbook examples",
]
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
# Generate temporal queries with recent dates
def generate_temporal_queries():
    """Expand TEMPORAL_TEMPLATES x TEMPORAL_TOPICS into concrete queries.

    Templates with a {year} slot are instantiated for the current and
    previous calendar year; {month} templates for the last three month
    names of the list. Returns de-duplicated queries in stable generation
    order, so the output is identical across runs/processes.
    """
    queries = []
    current_year = datetime.now().year
    months = [
        "January",
        "February",
        "March",
        "April",
        "May",
        "June",
        "July",
        "August",
        "September",
        "October",
        "November",
        "December",
    ]

    for template in TEMPORAL_TEMPLATES:
        for topic in TEMPORAL_TOPICS:
            if "{year}" in template:
                # Use current year and previous year
                for year in [current_year, current_year - 1]:
                    queries.append(template.format(topic=topic, year=year))
            elif "{month}" in template:
                # NOTE(review): months[-3:] is always October-December, not
                # the three months preceding today — confirm whether
                # calendar-relative months were intended here.
                for month in months[-3:]:
                    queries.append(template.format(topic=topic, month=month))
            else:
                queries.append(template.format(topic=topic))

    # De-duplicate while preserving insertion order. The previous
    # list(set(queries)) ordering depended on string hash randomization
    # (PYTHONHASHSEED), making the generated dataset order vary between
    # runs; dict.fromkeys() is deterministic.
    return list(dict.fromkeys(queries))
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
def generate_expansion(query: str) -> str:
    """Produce a template-based lex/vec/hyde expansion for *query*.

    Randomly picks three keyword-style (lex) and two semantic (vec)
    variants, then appends a canned hyde passage. Purely template driven;
    a production pipeline would call an LLM (e.g. the Claude API) instead.
    """
    keyword_forms = [
        f"{query} guide",
        f"{query} documentation",
        f"{query} tutorial",
        f"{query} examples",
        f"{query} best practices",
    ]

    semantic_forms = [
        f"how to {query}",
        f"guide for {query}",
        f"learn about {query}",
        f"understanding {query}",
        f"complete {query} reference",
    ]

    # Sample without replacement from each pool (same RNG call order as
    # always: lex first, then vec).
    chosen_lex = random.sample(keyword_forms, min(3, len(keyword_forms)))
    chosen_vec = random.sample(semantic_forms, min(2, len(semantic_forms)))

    hyde_passage = f"This comprehensive guide covers everything you need to know about {query}. It includes practical examples, best practices, and troubleshooting tips for beginners and advanced users alike."

    rendered = [f"lex: {form}" for form in chosen_lex]
    rendered += [f"vec: {form}" for form in chosen_vec]
    rendered.append(f"hyde: {hyde_passage}")
    return "\n".join(rendered)
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
def main():
    """Generate diverse examples and append to training data.

    Concatenates every hand-curated category list plus the generated
    temporal queries, expands each query with generate_expansion(), and
    writes the results as JSONL tagged with category "diverse_addon".
    """
    output_file = Path("data/qmd_expansion_diverse_addon.jsonl")

    # Compute once: the original code called generate_temporal_queries()
    # twice (once for the data, once again for the summary line).
    temporal_queries = generate_temporal_queries()

    # (label, queries) pairs drive both the concatenation and the summary
    # printout, in the original concatenation order.
    categories = [
        ("Trivia", TRIVIA_QUERIES),
        ("Geography", GEOGRAPHY_QUERIES),
        ("Philosophy", PHILOSOPHY_QUERIES),
        ("History", HISTORY_QUERIES),
        ("Science", SCIENCE_QUERIES),
        ("Arts/Culture", ARTS_CULTURE_QUERIES),
        ("Health/Medicine", HEALTH_MEDICINE_QUERIES),
        ("Business/Finance", BUSINESS_FINANCE_QUERIES),
        ("Sports", SPORTS_QUERIES),
        ("Travel", TRAVEL_QUERIES),
        ("Food/Cooking", FOOD_COOKING_QUERIES),
        ("Psychology", PSYCHOLOGY_QUERIES),
        ("Environment", ENVIRONMENT_NATURE_QUERIES),
        ("Math", MATH_QUERIES),
        ("Language", LANGUAGE_QUERIES),
        ("DIY/Crafts", DIY_CRAFTS_QUERIES),
        ("Temporal", temporal_queries),
        ("Named Entities", NAMED_ENTITY_QUERIES),
    ]

    all_queries = [q for _, batch in categories for q in batch]

    print(f"Generating {len(all_queries)} diverse training examples...")
    for label, batch in categories:
        print(f" - {label}: {len(batch)}")

    # Seed so the randomly sampled lex/vec variants are reproducible,
    # matching the seeded generation in the offline data script.
    random.seed(42)

    examples = []
    for query in all_queries:
        expansion = generate_expansion(query)
        output_items = normalize_output_items(parse_output_text(expansion))
        examples.append(
            {"query": query, "output": output_items, "category": "diverse_addon"}
        )

    # Write to file (one JSON object per line).
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        for ex in examples:
            f.write(json.dumps(ex) + "\n")

    print(f"\nSaved {len(examples)} diverse examples to {output_file}")
    print("\nTo use these examples:")
    print(f" cat {output_file} >> data/qmd_expansion_v2.jsonl")
    print(" uv run dataset/prepare_data.py --add-short 2")
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
# Run only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|