qmdr 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. package/.claude-plugin/marketplace.json +29 -0
  2. package/.env.example +85 -0
  3. package/.gitattributes +3 -0
  4. package/.github/workflows/release.yml +77 -0
  5. package/AI-SETUP.md +466 -0
  6. package/LICENSE +22 -0
  7. package/README.md +78 -0
  8. package/bun.lock +637 -0
  9. package/docs/README-zh.md +78 -0
  10. package/docs/refactor-checklist.md +54 -0
  11. package/docs/setup-openclaw.md +139 -0
  12. package/example-index.yml +33 -0
  13. package/finetune/BALANCED_DISTRIBUTION.md +157 -0
  14. package/finetune/DATA_IMPROVEMENTS.md +218 -0
  15. package/finetune/Justfile +43 -0
  16. package/finetune/Modelfile +16 -0
  17. package/finetune/README.md +299 -0
  18. package/finetune/SCORING.md +286 -0
  19. package/finetune/configs/accelerate_multi_gpu.yaml +17 -0
  20. package/finetune/configs/grpo.yaml +49 -0
  21. package/finetune/configs/sft.yaml +42 -0
  22. package/finetune/configs/sft_local.yaml +40 -0
  23. package/finetune/convert_gguf.py +221 -0
  24. package/finetune/data/best_glm_prompt.txt +17 -0
  25. package/finetune/data/gepa_generated.prompts.json +32 -0
  26. package/finetune/data/qmd_expansion_balanced_deduped.jsonl +413 -0
  27. package/finetune/data/qmd_expansion_diverse_addon.jsonl +386 -0
  28. package/finetune/data/qmd_expansion_handcrafted.jsonl +65 -0
  29. package/finetune/data/qmd_expansion_handcrafted_only.jsonl +336 -0
  30. package/finetune/data/qmd_expansion_locations.jsonl +64 -0
  31. package/finetune/data/qmd_expansion_people.jsonl +46 -0
  32. package/finetune/data/qmd_expansion_short_nontech.jsonl +200 -0
  33. package/finetune/data/qmd_expansion_v2.jsonl +1498 -0
  34. package/finetune/data/qmd_only_sampled.jsonl +399 -0
  35. package/finetune/dataset/analyze_data.py +369 -0
  36. package/finetune/dataset/clean_data.py +906 -0
  37. package/finetune/dataset/generate_balanced.py +823 -0
  38. package/finetune/dataset/generate_data.py +714 -0
  39. package/finetune/dataset/generate_data_offline.py +206 -0
  40. package/finetune/dataset/generate_diverse.py +441 -0
  41. package/finetune/dataset/generate_ollama.py +326 -0
  42. package/finetune/dataset/prepare_data.py +197 -0
  43. package/finetune/dataset/schema.py +73 -0
  44. package/finetune/dataset/score_data.py +115 -0
  45. package/finetune/dataset/validate_schema.py +104 -0
  46. package/finetune/eval.py +196 -0
  47. package/finetune/evals/queries.txt +56 -0
  48. package/finetune/gepa/__init__.py +1 -0
  49. package/finetune/gepa/best_prompt.txt +31 -0
  50. package/finetune/gepa/best_prompt_glm.txt +1 -0
  51. package/finetune/gepa/dspy_gepa.py +204 -0
  52. package/finetune/gepa/example.py +117 -0
  53. package/finetune/gepa/generate.py +129 -0
  54. package/finetune/gepa/gepa_outputs.jsonl +10 -0
  55. package/finetune/gepa/gepa_outputs_glm.jsonl +20 -0
  56. package/finetune/gepa/model.json +19 -0
  57. package/finetune/gepa/optimizer.py +70 -0
  58. package/finetune/gepa/score.py +84 -0
  59. package/finetune/jobs/eval.py +490 -0
  60. package/finetune/jobs/eval_common.py +354 -0
  61. package/finetune/jobs/eval_verbose.py +113 -0
  62. package/finetune/jobs/grpo.py +141 -0
  63. package/finetune/jobs/quantize.py +244 -0
  64. package/finetune/jobs/sft.py +121 -0
  65. package/finetune/pyproject.toml +23 -0
  66. package/finetune/reward.py +610 -0
  67. package/finetune/train.py +611 -0
  68. package/finetune/uv.lock +4070 -0
  69. package/flake.lock +61 -0
  70. package/flake.nix +83 -0
  71. package/migrate-schema.ts +162 -0
  72. package/package.json +56 -0
  73. package/skills/qmdr/SKILL.md +172 -0
  74. package/skills/qmdr/references/mcp-setup.md +88 -0
  75. package/src/app/commands/collection.ts +55 -0
  76. package/src/app/commands/context.ts +82 -0
  77. package/src/app/commands/document.ts +46 -0
  78. package/src/app/commands/maintenance.ts +60 -0
  79. package/src/app/commands/search.ts +45 -0
  80. package/src/app/ports/llm.ts +13 -0
  81. package/src/app/services/llm-service.ts +145 -0
  82. package/src/cli.test.ts +963 -0
  83. package/src/collections.ts +390 -0
  84. package/src/eval.test.ts +412 -0
  85. package/src/formatter.ts +427 -0
  86. package/src/llm.test.ts +559 -0
  87. package/src/llm.ts +1990 -0
  88. package/src/mcp.test.ts +889 -0
  89. package/src/mcp.ts +626 -0
  90. package/src/qmd.ts +3330 -0
  91. package/src/store/collections.ts +7 -0
  92. package/src/store/context.ts +10 -0
  93. package/src/store/db.ts +5 -0
  94. package/src/store/documents.ts +26 -0
  95. package/src/store/maintenance.ts +15 -0
  96. package/src/store/path.ts +13 -0
  97. package/src/store/search.ts +10 -0
  98. package/src/store-paths.test.ts +395 -0
  99. package/src/store.test.ts +2483 -0
  100. package/src/store.ts +2813 -0
  101. package/test/eval-harness.ts +223 -0
  102. package/tsconfig.json +29 -0
@@ -0,0 +1,43 @@
1
+ set shell := ["bash", "-uc"]
2
+
3
+ validate:
4
+ uv run dataset/validate_schema.py
5
+ uv run dataset/score_data.py
6
+ for f in data/*.jsonl; do \
7
+ uv run dataset/analyze_data.py --input "$f" --show-examples 0; \
8
+ done
9
+
10
+ score:
11
+ uv run dataset/score_data.py
12
+
13
+ schema:
14
+ uv run dataset/validate_schema.py
15
+
16
+ analyze:
17
+ for f in data/*.jsonl; do \
18
+ uv run dataset/analyze_data.py --input "$f" --show-examples 0; \
19
+ done
20
+
21
+ prepare:
22
+ QMD_BASE_MODEL=Qwen/Qwen3-1.7B uv run dataset/prepare_data.py --seed 42
23
+
24
+ train-local:
25
+ just prepare
26
+ HF_TOKEN=${HF_TOKEN} uv run torchrun --standalone --nproc_per_node auto \
27
+ train.py sft --config configs/sft_local.yaml |& tee /tmp/qmd-sft-train.log
28
+
29
+ grpo-local:
30
+ CUDA_VISIBLE_DEVICES=1,2,3 HF_TOKEN=${HF_TOKEN} uv run torchrun --standalone --nproc_per_node 3 \
31
+ train.py grpo --config configs/grpo.yaml |& tee /tmp/qmd-grpo-train.log
32
+
33
+ gepa-local:
34
+ UV_CACHE_DIR=/tmp/uv-cache LITELLM_CACHE_DIR=/tmp/litellm-cache OLLAMA_API_BASE=http://localhost:11434 \
35
+ uv run python gepa/dspy_gepa.py \
36
+ --input data/qmd_expansion_v2.jsonl \
37
+ --model ollama/glm-4.7-flash:Q8_0 \
38
+ --reflection-model ollama/glm-4.7-flash:Q8_0 \
39
+ --max-metric-calls 100 --limit 20 \
40
+ --valset data/qmd_expansion_handcrafted.jsonl --val-limit 20 \
41
+ --max-tokens 512 --reflection-max-tokens 512 \
42
+ --emit gepa/gepa_outputs_glm.jsonl \
43
+ --save-prompt gepa/best_prompt_glm.txt
@@ -0,0 +1,16 @@
1
+ FROM /home/tobi/src/github.com/tobi/qmd/finetune/outputs/sft/gguf/sft-q4_k_m.gguf
2
+
3
+ PARAMETER temperature 0.0
4
+ PARAMETER top_p 1.0
5
+ PARAMETER top_k 0
6
+ PARAMETER repeat_penalty 1.1
7
+ PARAMETER num_ctx 4096
8
+
9
+ TEMPLATE """<|im_start|>system
10
+ You are a helpful assistant.
11
+ <|im_end|>
12
+ <|im_start|>user
13
+ /no_think Expand this search query: {{ .Prompt }}
14
+ <|im_end|>
15
+ <|im_start|>assistant
16
+ """
@@ -0,0 +1,299 @@
1
+ ---
2
+ license: mit
3
+ language:
4
+ - en
5
+ base_model: Qwen/Qwen3-1.7B
6
+ tags:
7
+ - query-expansion
8
+ - search
9
+ - gguf
10
+ - qwen3
11
+ pipeline_tag: text-generation
12
+ ---
13
+
14
+ # QMD Query Expansion Fine-Tuning
15
+
16
+ Train small language models to expand search queries for [QMD](https://github.com/tobi/qmd)'s hybrid retrieval pipeline.
17
+
18
+ ## What This Does
19
+
20
+ Given a raw search query like `"auth config"`, the trained model produces structured expansions:
21
+
22
+ ```
23
+ hyde: Authentication can be configured by setting the AUTH_SECRET environment variable.
24
+ lex: authentication configuration
25
+ lex: auth settings setup
26
+ vec: how to configure authentication settings
27
+ vec: authentication configuration options
28
+ ```
29
+
30
+ These feed into QMD's three search backends:
31
+ - **`lex:`** lines go to BM25 full-text search (short, keyword-focused)
32
+ - **`vec:`** lines go to vector similarity search (natural language phrases)
33
+ - **`hyde:`** is a hypothetical document passage for embedding-based retrieval ([HyDE](https://arxiv.org/abs/2212.10496) technique)
34
+
35
+ ## Quick Start
36
+
37
+ ### Cloud training via HuggingFace Jobs (no GPU needed)
38
+
39
+ ```bash
40
+ # 1. SFT: teach the model the output format (~45 min on A10G, ~$1.50)
41
+ hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 2h jobs/sft.py
42
+
43
+ # 2. GRPO: RL refinement on top of SFT (~20 min on A10G, ~$0.50)
44
+ hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 4h jobs/grpo.py
45
+
46
+ # 3. Evaluate against test queries (needs local GPU or use eval job)
47
+ uv run eval.py --model tobil/qmd-query-expansion-1.7B-grpo \
48
+ --sft-model tobil/qmd-query-expansion-1.7B-sft
49
+
50
+ # 4. Convert to GGUF for local deployment (Ollama, llama.cpp)
51
+ uv run convert_gguf.py --size 1.7B
52
+ ```
53
+
54
+ ### Local training (if you have a GPU)
55
+
56
+ ```bash
57
+ uv run train.py sft --config configs/sft.yaml
58
+ uv run train.py grpo --config configs/grpo.yaml
59
+ ```
60
+
61
+ ### Monitoring HF Jobs
62
+
63
+ ```bash
64
+ hf jobs ps # list running jobs
65
+ hf jobs inspect <job-id> # check status
66
+ hf jobs logs <job-id> # stream logs
67
+ hf jobs cancel <job-id> # cancel a job
68
+ ```
69
+
70
+ ## Prompt Format
71
+
72
+ All tools use the same prompt — **Qwen3 chat template with `/no_think`**:
73
+
74
+ ```
75
+ <|im_start|>user
76
+ /no_think Expand this search query: {query}<|im_end|>
77
+ <|im_start|>assistant
78
+ ```
79
+
80
+ The `/no_think` directive suppresses Qwen3's chain-of-thought mode, producing
81
+ direct `lex:/vec:/hyde:` output without `<think>` blocks.
82
+
83
+ ## File Structure
84
+
85
+ ```
86
+ finetune/
87
+ ├── reward.py # Scoring/reward function (single source of truth)
88
+ ├── train.py # Unified SFT + GRPO training (two subcommands)
89
+ ├── eval.py # Generate expansions and score them
90
+ ├── convert_gguf.py # GGUF conversion for Ollama/llama.cpp
91
+ ├── jobs/
92
+ │ ├── sft.py # Self-contained SFT for HuggingFace Jobs
93
+ │ ├── grpo.py # Self-contained GRPO for HuggingFace Jobs
94
+ │ ├── eval.py # Self-contained eval for HuggingFace Jobs
95
+ │ ├── eval_common.py # Shared eval utilities
96
+ │ └── quantize.py # GGUF quantization for HuggingFace Jobs
97
+ ├── configs/
98
+ │ ├── sft.yaml # SFT hyperparameters for Qwen3-1.7B
99
+ │ └── grpo.yaml # GRPO hyperparameters for Qwen3-1.7B
100
+ ├── evals/
101
+ │ └── queries.txt # 31 test queries across 8 categories
102
+ ├── data/
103
+ │ └── qmd_expansion_v2.jsonl # Source training data (1,000 high-quality examples)
104
+ ├── dataset/
105
+ │ ├── generate_data.py # Generate data via Claude API
106
+ │ ├── generate_data_offline.py # Generate from existing HF dataset
107
+ │ ├── prepare_data.py # Format for Qwen3 chat template
108
+ │ └── clean_data.py # Detect technical term misinterpretations
109
+ ├── SCORING.md # Detailed scoring rubric reference
110
+ └── README.md # This file
111
+ ```
112
+
113
+ ## Training Pipeline
114
+
115
+ ### Stage 1: SFT (Supervised Fine-Tuning)
116
+
117
+ Teaches the model the `lex:/vec:/hyde:` output format from labeled examples.
118
+
119
+ | Parameter | Value |
120
+ |-----------|-------|
121
+ | Base model | `Qwen/Qwen3-1.7B` |
122
+ | Method | LoRA (rank 16, alpha 32) |
123
+ | Target modules | All projection layers (q/k/v/o/gate/up/down) |
124
+ | Dataset | ~2,290 examples (train split) |
125
+ | Effective batch size | 16 (4 × 4 gradient accumulation) |
126
+ | Epochs | 5 |
127
+ | Learning rate | 2e-4 (cosine schedule) |
128
+
129
+ ```bash
130
+ uv run train.py sft --config configs/sft.yaml
131
+ uv run train.py sft --config configs/sft.yaml --dry-run # preview config
132
+ ```
133
+
134
+ ### Stage 2: GRPO (Group Relative Policy Optimization)
135
+
136
+ Reinforcement learning on top of the merged SFT weights. The model generates
137
+ multiple expansions per query, they are scored by the reward function, and the
138
+ model is updated to prefer higher-scoring outputs.
139
+
140
+ | Parameter | Value |
141
+ |-----------|-------|
142
+ | Base | Merged SFT checkpoint |
143
+ | Method | LoRA (rank 4, alpha 8) — smaller for RL stability |
144
+ | Target modules | q_proj, v_proj only |
145
+ | Reward | `reward.py` (rule-based, 5 dimensions) |
146
+ | KL beta | 0.04 — prevents drift from SFT checkpoint |
147
+ | Generations per prompt | 4 |
148
+ | Max steps | 200 |
149
+ | Learning rate | 5e-7 |
150
+
151
+ **Important:** `beta > 0` is critical. With `beta=0` the model experiences
152
+ catastrophic drift and scores drop to 0%.
153
+
154
+ ```bash
155
+ uv run train.py grpo --config configs/grpo.yaml
156
+ uv run train.py grpo --config configs/grpo.yaml --dry-run # test reward function
157
+ ```
158
+
159
+ ## Evaluation
160
+
161
+ `eval.py` generates expansions from a model and scores them against test queries:
162
+
163
+ ```bash
164
+ # Evaluate an SFT model
165
+ uv run eval.py --model tobil/qmd-query-expansion-1.7B-sft
166
+
167
+ # Evaluate a GRPO model (needs SFT adapter merged first)
168
+ uv run eval.py --model tobil/qmd-query-expansion-1.7B-grpo \
169
+ --sft-model tobil/qmd-query-expansion-1.7B-sft
170
+
171
+ # Verbose output with deduction details
172
+ uv run eval.py --model tobil/qmd-query-expansion-1.7B-sft -v
173
+
174
+ # Save detailed scores to JSON
175
+ uv run eval.py --model tobil/qmd-query-expansion-1.7B-sft -o scores.json
176
+
177
+ # Score an existing JSONL file (backwards compat with old run.py output)
178
+ uv run eval.py --score-only evals/results_old.jsonl
179
+ ```
180
+
181
+ ## Reward Function
182
+
183
+ `reward.py` is the single source of truth for scoring. It is used both as the
184
+ GRPO reward signal during training and for evaluation.
185
+
186
+ Five scoring dimensions plus a no-`<think>` bonus (max 120 without hyde, 140 with):
187
+
188
+ | Dimension | Points | What It Measures |
189
+ |-----------|--------|------------------|
190
+ | **Format** | 0-30 | Has lex/vec lines, no invalid lines |
191
+ | **Diversity** | 0-30 | Multiple expansion types, diverse content, no query echoes |
192
+ | **HyDE** | 0-20 | Present, 50-200 chars, single line, not repetitive |
193
+ | **Quality** | 0-20 | Lex shorter than vec, natural language, preserves key terms |
194
+ | **Entity** | -45 to +20 | Named entities preserved in lex and vec lines |
195
+ | **Think bonus** | 0-20 | Reward for NOT using `<think>` mode |
196
+
197
+ **Hard failures** (instant 0.0):
198
+ - Chat template leakage (`<|im_start|>`, `<|im_end|>`, etc.)
199
+ - Any line without a valid `lex:`, `vec:`, or `hyde:` prefix
200
+
201
+ ```bash
202
+ # Self-test the reward function
203
+ uv run reward.py
204
+ ```
205
+
206
+ ## GGUF Conversion
207
+
208
+ Merges base + SFT + GRPO adapters into a single model and produces
209
+ quantized GGUF files for deployment:
210
+
211
+ ```bash
212
+ # Use preset for 1.7B
213
+ uv run convert_gguf.py --size 1.7B
214
+
215
+ # Use preset for 4B
216
+ uv run convert_gguf.py --size 4B
217
+
218
+ # Custom models
219
+ uv run convert_gguf.py --base Qwen/Qwen3-1.7B \
220
+ --sft tobil/qmd-query-expansion-1.7B-sft \
221
+ --grpo tobil/qmd-query-expansion-1.7B-grpo \
222
+ --output tobil/qmd-query-expansion-1.7B-gguf
223
+ ```
224
+
225
+ ### Using with Ollama
226
+
227
+ ```bash
228
+ huggingface-cli download tobil/qmd-query-expansion-1.7B-gguf \
229
+ qmd-query-expansion-1.7B-q4_k_m.gguf --local-dir .
230
+
231
+ echo 'FROM ./qmd-query-expansion-1.7B-q4_k_m.gguf' > Modelfile
232
+ ollama create qmd-expand -f Modelfile
233
+ ollama run qmd-expand
234
+ ```
235
+
236
+ ## Data Pipeline
237
+
238
+ The training data (1,000 examples in `data/qmd_expansion_v2.jsonl`) was generated
239
+ from two sources and cleaned for quality. To regenerate:
240
+
241
+ ```bash
242
+ # Generate from existing HuggingFace dataset (bulk, no API needed)
243
+ uv run dataset/generate_data_offline.py
244
+
245
+ # Generate via Claude API (higher quality, needs ANTHROPIC_API_KEY)
246
+ uv run dataset/generate_data.py --count 100
247
+
248
+ # Detect and fix technical term misinterpretations
249
+ uv run dataset/clean_data.py
250
+
251
+ # Format for Qwen3 chat template, add short-query augmentation, split train/val
252
+ uv run dataset/prepare_data.py
253
+ ```
254
+
255
+ ## Architecture Notes
256
+
257
+ The two-stage training approach (SFT → GRPO) is standard for structured-output models:
258
+
259
+ 1. **SFT** establishes format compliance and basic query understanding. It uses
260
+ a large LoRA (rank 16, all projection layers) because it needs to learn a
261
+ new output format from scratch.
262
+
263
+ 2. **GRPO** refines quality within the learned format. It uses a small LoRA
264
+ (rank 4, q/v only) and KL regularization to make incremental improvements
265
+ without losing what SFT taught.
266
+
267
+ The reward function is entirely rule-based (no LLM judge) which makes it fast,
268
+ deterministic, and suitable as an RL signal. See `SCORING.md` for the full rubric.
269
+
270
+ ## Training Results (Qwen3-1.7B, v2)
271
+
272
+ ### SFT
273
+
274
+ | Metric | Value |
275
+ |--------|-------|
276
+ | Final train loss | 0.472 |
277
+ | Final eval loss | 0.304 |
278
+ | Token accuracy (train) | 97.4% |
279
+ | Token accuracy (eval) | 93.8% |
280
+ | Epochs | 5 |
281
+ | Hardware | A10G (24 GB VRAM) |
282
+
283
+ ### GRPO
284
+
285
+ | Metric | Value |
286
+ |--------|-------|
287
+ | Mean reward | 0.757 |
288
+ | Final loss | 0.0005 |
289
+ | KL divergence | 0.00048 |
290
+ | Mean completion length | ~58 tokens |
291
+ | Training time | ~19 min (200 steps) |
292
+ | Hardware | A10G (24 GB VRAM) |
293
+
294
+ ### Evaluation Scores
295
+
296
+ | Model | Average Score | Excellent (30) |
297
+ |-------|--------------|-----------------|
298
+ | SFT | 92.0% | 30/30 |
299
+ | GRPO | 91.7% | 30/30 |
@@ -0,0 +1,286 @@
1
+ # QMD Query Expansion Scoring
2
+
3
+ ## Goal
4
+
5
+ Transform a random typed query into a great set of retrieval-optimized expansions.
6
+
7
+ **Input:** `"auth config"`
8
+ **Output:**
9
+ ```
10
+ hyde: Authentication can be configured by setting the AUTH_SECRET environment variable and enabling the auth middleware in your application's config file.
11
+ lex: authentication configuration
12
+ lex: auth settings setup
13
+ vec: how to configure authentication settings
14
+ vec: authentication configuration options
15
+ ```
16
+
17
+ ## Output Format
18
+
19
+ | Prefix | Purpose | Required | Count |
20
+ |--------|---------|----------|-------|
21
+ | `lex:` | BM25 keyword variations (shorter, keyword-focused) | Yes | 1-3 |
22
+ | `vec:` | Semantic reformulations (natural language) | Yes | 1-3 |
23
+ | `hyde:` | Hypothetical document passage | Optional | 0-1 |
24
+
25
+ ## Scoring Criteria
26
+
27
+ ### 1. Format Compliance (0-30 points)
28
+
29
+ | Criterion | Points | Deduction |
30
+ |-----------|--------|-----------|
31
+ | Has at least one `lex:` line | +10 | -10 if missing |
32
+ | Has at least one `vec:` line | +10 | -10 if missing |
33
+ | All lines have valid prefix (`lex:`, `vec:`, `hyde:`) | +10 | -5 per invalid line |
34
+ | No garbage/prose outside of prefixed lines | - | -10 if present |
35
+
36
+ ### 2. Diversity & Coverage (0-30 points)
37
+
38
+ | Criterion | Points | Deduction |
39
+ |-----------|--------|-----------|
40
+ | 2+ different types present (lex + vec) | +10 | -10 if only one type |
41
+ | 2+ total expansions | +5 | -5 if only one |
42
+ | Multiple lex: lines are diverse (edit distance > 3) | +5 | -2 per duplicate pair |
43
+ | Multiple vec: lines are diverse (edit distance > 5) | +5 | -2 per duplicate pair |
44
+ | lex/vec not identical to original query | +5 | -5 per line that equals query |
45
+
46
+ ### 3. Hyde Quality (0-20 points, optional bonus)
47
+
48
+ | Criterion | Points | Deduction |
49
+ |-----------|--------|-----------|
50
+ | Hyde present and well-formed | +5 | - |
51
+ | Hyde is concise (50-200 chars) | +5 | -3 if too short, -5 if too long |
52
+ | Hyde has no newlines | +5 | -5 if contains newlines |
53
+ | Hyde has no excessive repetition | +5 | -3 if word repeats 3+ times |
54
+
55
+ ### 4. Content Quality (0-20 points)
56
+
57
+ | Criterion | Points | Deduction |
58
+ |-----------|--------|-----------|
59
+ | Base relevance | +5 | Subjective |
60
+ | Lex lines preserve key terms from query | +5 | -5 if lex is generic |
61
+ | Lex lines are keyword-focused (shorter) | +5 | -2 if lex is longer than vec |
62
+ | Vec lines are natural language (complete phrases) | +5 | -2 if vec is just keywords |
63
+
64
+ ### 5. Named Entity Preservation (up to +20 points, can go negative; CRITICAL)
65
+
66
+ Named entities are proper nouns, brand names, technical terms, and acronyms that MUST appear in lex queries. This prevents generic expansions that lose the specific topic.
67
+
68
+ | Criterion | Points | Deduction |
69
+ |-----------|--------|-----------|
70
+ | All lex lines contain at least one entity | +15 | - |
71
+ | Some lex lines contain entities | +5 | - |
72
+ | NO lex lines contain entities | - | **-30 HEAVY PENALTY** |
73
+ | Generic filler phrases in lex | - | -15 per phrase |
74
+ | Entities also in vec lines | +5 | - |
75
+
76
+ **Named Entity Detection:**
77
+ - All-caps acronyms: `TDS`, `API`, `GPU`, `AWS`
78
+ - Capitalized proper nouns: `React`, `Docker`, `Kubernetes`
79
+ - Technical terms: `node.js`, `C++`, `.NET`
80
+ - CamelCase: `JavaScript`, `TypeScript`
81
+ - Compound names: `TDS motorsports` → both words are entities
82
+
83
+ **Generic Filler Phrases (BANNED in lex):**
84
+ - "find information about"
85
+ - "search for", "look up"
86
+ - "get information", "learn about"
87
+ - "details about", "guide to"
88
+
89
+ **Examples:**
90
+
91
+ | Query | Bad Lex (Score: 0.30) | Good Lex (Score: 1.00) |
92
+ |-------|----------------------|------------------------|
93
+ | `who is TDS motorsports` | `lex: find information about` | `lex: TDS motorsports history` |
94
+ | | `lex: company details` | `lex: TDS motorsports founders` |
95
+ | `how to use React hooks` | `lex: programming tutorial` | `lex: React hooks tutorial` |
96
+ | | `lex: how to code` | `lex: useEffect useState hooks` |
97
+
98
+ **Key Rule**: If a query mentions a specific entity (brand, product, technology), EVERY lex line should include that entity or a direct variation of it.
99
+
100
+ ## Score Calculation
101
+
102
+ ```
103
+ Total Score = Format + Diversity + Hyde + Quality + Entity
104
+ Max Score = 120 (100 without hyde)
105
+ Normalized = Total / Max (0.0 - 1.0)
106
+ ```
107
+
108
+ **Rating:**
109
+ - 0.80-1.00: Excellent
110
+ - 0.60-0.79: Good
111
+ - 0.40-0.59: Acceptable
112
+ - 0.20-0.39: Poor
113
+ - 0.00-0.19: Failed
114
+
115
+ **Note:** Entity score can go negative, heavily penalizing outputs that miss named entities.
116
+
117
+ ## Examples
118
+
119
+ ### Good Example (Score: 85/100)
120
+
121
+ **Query:** `"react hooks"`
122
+
123
+ **Output:**
124
+ ```
125
+ hyde: React Hooks allow you to use state and lifecycle features in functional components without writing a class.
126
+ lex: react hooks tutorial
127
+ lex: usestate useeffect
128
+ vec: how to use react hooks in functional components
129
+ vec: react hooks best practices guide
130
+ ```
131
+
132
+ **Scoring:**
133
+ - Format: 30/30 (all prefixes valid)
134
+ - Diversity: 28/30 (2 types, 4 expansions, good diversity)
135
+ - Hyde: 17/20 (present, good length, no repetition)
136
+ - Quality: 10/20 (relevant, good structure)
137
+
138
+ ### Bad Example (Score: 15/100)
139
+
140
+ **Query:** `"auth"`
141
+
142
+ **Output:**
143
+ ```
144
+ auth is an important concept that relates to authentication.
145
+ The answer should be in Chinese.
146
+ The answer should be in Chinese.
147
+ ```
148
+
149
+ **Scoring:**
150
+ - Format: 0/30 (no valid prefixes)
151
+ - Diversity: 0/30 (no valid expansions)
152
+ - Hyde: 0/20 (N/A)
153
+ - Quality: 15/20 (some relevance but wrong format)
154
+
155
+ ### Medium Example (Score: 55/100)
156
+
157
+ **Query:** `"docker networking"`
158
+
159
+ **Output:**
160
+ ```
161
+ hyde: Docker networking is an important concept. Docker networking is used for container communication. Docker networking configuration is essential.
162
+ lex: docker networking
163
+ vec: docker networking
164
+ ```
165
+
166
+ **Scoring:**
167
+ - Format: 30/30 (valid prefixes)
168
+ - Diversity: 10/30 (lex=vec=query, no diversity)
169
+ - Hyde: 5/20 (too repetitive - "docker networking" 3x)
170
+ - Quality: 10/20 (relevant but low effort)
171
+
172
+ ## Heuristics
173
+
174
+ ### Repetition Detection
175
+
176
+ ```python
177
def word_repetition_score(text):
    """Score a passage for excessive word repetition.

    Starts from 5 points and deducts 2 for every distinct non-stopword that
    occurs three or more times. The result is clamped so it never drops
    below 0.

    Args:
        text: The passage to inspect (e.g. a hyde line).

    Returns:
        An int in the range 0..5; 5 means no repeated content words.
    """
    # Local import keeps this snippet runnable on its own.
    from collections import Counter

    stopwords = {'the', 'a', 'an', 'is', 'are', 'to', 'for', 'of', 'in', 'and', 'or'}
    edge_punct = '.,!?:;()[]"\''

    # Strip sentence punctuation so "networking." and "networking" count as
    # the same word; drop tokens that were punctuation only.
    tokens = (raw.strip(edge_punct) for raw in text.lower().split())
    counts = Counter(tok for tok in tokens if tok)

    # Deduct for words appearing 3+ times (excluding stopwords).
    repeated = sum(1 for w, c in counts.items() if c >= 3 and w not in stopwords)
    return max(0, 5 - repeated * 2)
184
+ ```
185
+
186
+ ### Diversity Check (Simple)
187
+
188
+ ```python
189
def is_diverse(a, b, min_distance=3):
    """Check if two strings are sufficiently different.

    Comparison is case-insensitive and ignores differences in whitespace.
    Two strings are NOT diverse when they are equal, when either is empty,
    or when one appears inside the other as a whole-word phrase. Otherwise
    they are diverse when at least ``min_distance`` words belong to exactly
    one of the two strings.

    Args:
        a: First expansion text.
        b: Second expansion text.
        min_distance: Minimum size of the word-level symmetric difference.

    Returns:
        True if the strings are considered diverse, False otherwise.
    """
    a_words = a.lower().split()
    b_words = b.lower().split()

    # An empty string is trivially contained in anything, so it is never diverse.
    if not a_words or not b_words or a_words == b_words:
        return False

    # Phrase containment on word boundaries: padding with spaces prevents
    # partial-word hits such as "api" inside "rapid".
    a_padded = " " + " ".join(a_words) + " "
    b_padded = " " + " ".join(b_words) + " "
    if a_padded in b_padded or b_padded in a_padded:
        return False

    # Word-set symmetric difference (a cheap stand-in for edit distance).
    return len(set(a_words) ^ set(b_words)) >= min_distance
199
+ ```
200
+
201
+ ### Query Echo Detection
202
+
203
+ ```python
204
def echoes_query(expansion, query):
    """Check if expansion is just echoing the query.

    Comparison is case-insensitive and ignores differences in whitespace.
    An expansion echoes the query when the two are equal, when either one
    is empty, or when one contains the other as a whole-word phrase.

    Args:
        expansion: The text of a lex/vec expansion (without its prefix).
        query: The original user query.

    Returns:
        True if the expansion is an echo of the query, False otherwise.
    """
    exp = " ".join(expansion.lower().split())
    q = " ".join(query.lower().split())

    # Equal strings, or an empty side (contained in anything), count as an echo.
    if exp == q or not exp or not q:
        return True

    # Match on word boundaries so "auth" is not treated as an echo of
    # "authentication configuration".
    exp_padded = " " + exp + " "
    q_padded = " " + q + " "
    return exp_padded in q_padded or q_padded in exp_padded
209
+ ```
210
+
211
+ ### Named Entity Extraction
212
+
213
+ ```python
214
# Common question/function words that are never treated as entities by the
# "capitalized word" and "word following an entity" rules below.
KEY_TERM_STOPWORDS = {'what', 'is', 'how', 'to', 'the', 'a', 'an', 'in', 'on', 'for', 'of',
                      'and', 'or', 'with', 'my', 'your', 'do', 'does', 'can', 'i', 'me', 'we',
                      'who', 'where', 'when', 'why', 'which', 'find', 'get', 'show', 'tell'}


def extract_named_entities(query: str) -> set:
    """Extract named entities using simple heuristics.

    A whitespace-separated word is an entity when any of these holds:
      * it is all caps and at least 2 characters long (TDS, API, GPU);
      * it is capitalized, is not the first word, and is not a stopword;
      * it contains one of ``. + - # @`` and is at least 2 characters long
        (node.js, C++, .NET);
      * it is CamelCase with an uppercase first letter (JavaScript);
      * it directly follows an entity and is not a stopword
        (compound names such as "TDS motorsports").

    Args:
        query: The raw user query.

    Returns:
        A set of lower-cased entity strings (empty if none are found).
    """
    edge_punct = '.,!?:;()[]"\''
    entities = set()
    prev_was_entity = False

    for i, word in enumerate(query.split()):
        clean = word.strip(edge_punct)
        if not clean:
            prev_was_entity = False
            continue

        # A single dot directly before the word is part of the term
        # (.NET, .env), not sentence punctuation, so put it back.
        # Runs of dots ("...word") are still discarded.
        lead = len(word) - len(word.lstrip(edge_punct))
        if lead and word[lead - 1] == '.' and (lead == 1 or word[lead - 2] != '.'):
            clean = '.' + clean

        lowered = clean.lower()
        not_stopword = lowered not in KEY_TERM_STOPWORDS

        is_entity = (
            # All-caps acronyms: TDS, API, GPU
            (clean.isupper() and len(clean) >= 2)
            # Capitalized proper nouns (not first word)
            or (i > 0 and clean[0].isupper() and not_stopword)
            # Technical terms: node.js, C++
            or (any(c in clean for c in '.+-#@') and len(clean) >= 2)
            # CamelCase: JavaScript
            or (len(clean) > 1 and clean[0].isupper()
                and any(c.isupper() for c in clean[1:]))
            # Word following an entity (compound names: TDS motorsports)
            or (prev_was_entity and not_stopword)
        )

        if is_entity:
            entities.add(lowered)
        prev_was_entity = is_entity

    return entities
256
+ ```
257
+
258
+ ### Generic Phrase Detection
259
+
260
+ ```python
261
# Filler phrases that carry no topic information in a lex (keyword) line.
GENERIC_LEX_PHRASES = {
    'find information about', 'search for', 'look up', 'get information',
    'learn about', 'information on', 'details about', 'find out about',
    'what is', 'how to', 'guide to', 'help with'
}


def lex_is_generic(lex_line: str) -> bool:
    """Check if lex line is a useless generic filler.

    A line is generic when it contains at least one phrase from
    ``GENERIC_LEX_PHRASES`` and fewer than 3 characters remain once every
    such phrase has been removed. Phrases are matched case-insensitively
    and only on word boundaries.

    Args:
        lex_line: The text of a lex expansion (without its prefix).

    Returns:
        True if nothing specific is left besides filler, False otherwise
        (including when the line contains no filler phrase at all).
    """
    import re

    lex_lower = lex_line.lower().strip()

    # Longest phrase first so overlapping fillers resolve deterministically.
    ordered = sorted(GENERIC_LEX_PHRASES, key=lambda p: (-len(p), p))
    pattern = r'\b(?:' + '|'.join(re.escape(p) for p in ordered) + r')\b'

    # Remove ALL filler phrases at once, so stacked fillers such as
    # "how to find information about" leave nothing behind.
    remaining, hits = re.subn(pattern, ' ', lex_lower)
    if hits == 0:
        return False

    # Check if there's specific content beyond the generic phrases.
    return len(' '.join(remaining.split())) < 3
279
+ ```
280
+
281
+ ## Training Data Requirements
282
+
283
+ 1. **EOM tokens**: Ensure training examples end with proper end-of-message tokens
284
+ 2. **Diverse examples**: Include varied query types (short, long, technical, casual)
285
+ 3. **Quality hyde**: Hyde passages should be informative, not template-y
286
+ 4. **No repetition**: Avoid "This is important. This is very important." patterns
@@ -0,0 +1,17 @@
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: MULTI_GPU
4
+ downcast_bf16: 'no'
5
+ enable_cpu_affinity: false
6
+ gpu_ids: all
7
+ machine_rank: 0
8
+ main_training_function: main
9
+ mixed_precision: bf16
10
+ num_machines: 1
11
+ num_processes: 4
12
+ rdzv_backend: static
13
+ same_network: true
14
+ tpu_env: []
15
+ tpu_use_cluster: false
16
+ tpu_use_sudo: false
17
+ use_cpu: false