qmdr 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. package/.claude-plugin/marketplace.json +29 -0
  2. package/.env.example +85 -0
  3. package/.gitattributes +3 -0
  4. package/.github/workflows/release.yml +77 -0
  5. package/AI-SETUP.md +466 -0
  6. package/LICENSE +22 -0
  7. package/README.md +78 -0
  8. package/bun.lock +637 -0
  9. package/docs/README-zh.md +78 -0
  10. package/docs/refactor-checklist.md +54 -0
  11. package/docs/setup-openclaw.md +139 -0
  12. package/example-index.yml +33 -0
  13. package/finetune/BALANCED_DISTRIBUTION.md +157 -0
  14. package/finetune/DATA_IMPROVEMENTS.md +218 -0
  15. package/finetune/Justfile +43 -0
  16. package/finetune/Modelfile +16 -0
  17. package/finetune/README.md +299 -0
  18. package/finetune/SCORING.md +286 -0
  19. package/finetune/configs/accelerate_multi_gpu.yaml +17 -0
  20. package/finetune/configs/grpo.yaml +49 -0
  21. package/finetune/configs/sft.yaml +42 -0
  22. package/finetune/configs/sft_local.yaml +40 -0
  23. package/finetune/convert_gguf.py +221 -0
  24. package/finetune/data/best_glm_prompt.txt +17 -0
  25. package/finetune/data/gepa_generated.prompts.json +32 -0
  26. package/finetune/data/qmd_expansion_balanced_deduped.jsonl +413 -0
  27. package/finetune/data/qmd_expansion_diverse_addon.jsonl +386 -0
  28. package/finetune/data/qmd_expansion_handcrafted.jsonl +65 -0
  29. package/finetune/data/qmd_expansion_handcrafted_only.jsonl +336 -0
  30. package/finetune/data/qmd_expansion_locations.jsonl +64 -0
  31. package/finetune/data/qmd_expansion_people.jsonl +46 -0
  32. package/finetune/data/qmd_expansion_short_nontech.jsonl +200 -0
  33. package/finetune/data/qmd_expansion_v2.jsonl +1498 -0
  34. package/finetune/data/qmd_only_sampled.jsonl +399 -0
  35. package/finetune/dataset/analyze_data.py +369 -0
  36. package/finetune/dataset/clean_data.py +906 -0
  37. package/finetune/dataset/generate_balanced.py +823 -0
  38. package/finetune/dataset/generate_data.py +714 -0
  39. package/finetune/dataset/generate_data_offline.py +206 -0
  40. package/finetune/dataset/generate_diverse.py +441 -0
  41. package/finetune/dataset/generate_ollama.py +326 -0
  42. package/finetune/dataset/prepare_data.py +197 -0
  43. package/finetune/dataset/schema.py +73 -0
  44. package/finetune/dataset/score_data.py +115 -0
  45. package/finetune/dataset/validate_schema.py +104 -0
  46. package/finetune/eval.py +196 -0
  47. package/finetune/evals/queries.txt +56 -0
  48. package/finetune/gepa/__init__.py +1 -0
  49. package/finetune/gepa/best_prompt.txt +31 -0
  50. package/finetune/gepa/best_prompt_glm.txt +1 -0
  51. package/finetune/gepa/dspy_gepa.py +204 -0
  52. package/finetune/gepa/example.py +117 -0
  53. package/finetune/gepa/generate.py +129 -0
  54. package/finetune/gepa/gepa_outputs.jsonl +10 -0
  55. package/finetune/gepa/gepa_outputs_glm.jsonl +20 -0
  56. package/finetune/gepa/model.json +19 -0
  57. package/finetune/gepa/optimizer.py +70 -0
  58. package/finetune/gepa/score.py +84 -0
  59. package/finetune/jobs/eval.py +490 -0
  60. package/finetune/jobs/eval_common.py +354 -0
  61. package/finetune/jobs/eval_verbose.py +113 -0
  62. package/finetune/jobs/grpo.py +141 -0
  63. package/finetune/jobs/quantize.py +244 -0
  64. package/finetune/jobs/sft.py +121 -0
  65. package/finetune/pyproject.toml +23 -0
  66. package/finetune/reward.py +610 -0
  67. package/finetune/train.py +611 -0
  68. package/finetune/uv.lock +4070 -0
  69. package/flake.lock +61 -0
  70. package/flake.nix +83 -0
  71. package/migrate-schema.ts +162 -0
  72. package/package.json +56 -0
  73. package/skills/qmdr/SKILL.md +172 -0
  74. package/skills/qmdr/references/mcp-setup.md +88 -0
  75. package/src/app/commands/collection.ts +55 -0
  76. package/src/app/commands/context.ts +82 -0
  77. package/src/app/commands/document.ts +46 -0
  78. package/src/app/commands/maintenance.ts +60 -0
  79. package/src/app/commands/search.ts +45 -0
  80. package/src/app/ports/llm.ts +13 -0
  81. package/src/app/services/llm-service.ts +145 -0
  82. package/src/cli.test.ts +963 -0
  83. package/src/collections.ts +390 -0
  84. package/src/eval.test.ts +412 -0
  85. package/src/formatter.ts +427 -0
  86. package/src/llm.test.ts +559 -0
  87. package/src/llm.ts +1990 -0
  88. package/src/mcp.test.ts +889 -0
  89. package/src/mcp.ts +626 -0
  90. package/src/qmd.ts +3330 -0
  91. package/src/store/collections.ts +7 -0
  92. package/src/store/context.ts +10 -0
  93. package/src/store/db.ts +5 -0
  94. package/src/store/documents.ts +26 -0
  95. package/src/store/maintenance.ts +15 -0
  96. package/src/store/path.ts +13 -0
  97. package/src/store/search.ts +10 -0
  98. package/src/store-paths.test.ts +395 -0
  99. package/src/store.test.ts +2483 -0
  100. package/src/store.ts +2813 -0
  101. package/test/eval-harness.ts +223 -0
  102. package/tsconfig.json +29 -0
@@ -0,0 +1,906 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Data Quality Reviewer for Query Expansion Training Dataset
4
+
5
+ This script identifies and flags/fixes semantic errors where technical terms
6
+ are misunderstood. For example:
7
+ - "gem find" expanded as "mineral hunt" instead of "ruby gem search"
8
+ - "yarn spin" expanded as "wool twist" instead of "yarn package manager"
9
+
10
+ The script uses contextual analysis to detect when technical terms
11
+ are likely being used in a programming context vs. their everyday meaning.
12
+ """
13
+
14
+ import json
15
+ import re
16
+ from pathlib import Path
17
+ from dataclasses import dataclass, field
18
+ from typing import Optional
19
+ from collections import defaultdict
20
+
21
+ from dataset.schema import (
22
+ normalize_output_items,
23
+ output_items_to_text,
24
+ parse_output_text,
25
+ )
26
+
27
+
28
@dataclass
class TechnicalTerm:
    """Definition of a technical term that might be misunderstood.

    One entry per ambiguous word (e.g. "gem", "yarn"): the evidence used to
    decide whether a query means the technical sense, plus the replacement
    expansions emitted when a training example got the sense wrong.
    """

    term: str  # The ambiguous term (e.g., "liquid", "gem", "yarn")
    context_indicators: list[str]  # Words that suggest tech context
    wrong_expansions: list[str]  # Patterns that indicate wrong interpretation
    correct_domain: str  # What domain this belongs to when technical
    correct_lex: list[str]  # Correct lex expansions
    correct_vec: list[str]  # Correct vec expansions
39
+
40
# Known technical terms that are commonly misunderstood.
# Consulted by analyze_example/analyze_dataset: `term` is matched against the
# query, `wrong_expansions` against the model output, `context_indicators`
# decide tech vs. everyday sense, and `correct_*` feed the suggested fix.
KNOWN_TECHNICAL_TERMS: list[TechnicalTerm] = [
    TechnicalTerm(
        term="liquid",
        context_indicators=["shopify", "template", "filter", "tag", "theme", "jekyll"],
        wrong_expansions=["fluid", "water", "pour", "drink", "beverage", "h2o", "wet"],
        correct_domain="Shopify/Jekyll templating language",
        correct_lex=["shopify template syntax", "liquid template filter"],
        correct_vec=[
            "shopify liquid templating language",
            "liquid template engine filters",
        ],
    ),
    TechnicalTerm(
        term="gem",
        context_indicators=[
            "ruby",
            "bundler",
            "install",
            "gemfile",
            "rails",
            "require",
        ],
        wrong_expansions=[
            "mineral",
            "crystal",
            "jewel",
            "stone",
            "diamond",
            "jewelry",
            "precious",
        ],
        correct_domain="Ruby package manager",
        correct_lex=["ruby gem package", "gem install command"],
        correct_vec=["ruby gem package manager", "rubygems library installation"],
    ),
    TechnicalTerm(
        term="yarn",
        context_indicators=[
            "npm",
            "package",
            "install",
            "node",
            "javascript",
            "react",
            "webpack",
        ],
        wrong_expansions=[
            "thread",
            "wool",
            "knit",
            "spin",
            "textile",
            "fabric",
            "sew",
            "twist",
        ],
        correct_domain="JavaScript package manager",
        correct_lex=["yarn package manager", "yarn install dependencies"],
        correct_vec=["yarn javascript package manager", "yarn npm alternative"],
    ),
    TechnicalTerm(
        term="hook",
        context_indicators=[
            "react",
            "use",
            "state",
            "effect",
            "component",
            "callback",
            "git",
        ],
        wrong_expansions=["fish", "fishing", "bait", "catch", "hang", "pirate"],
        correct_domain="React hooks or Git hooks",
        correct_lex=["react hooks api", "usestate useeffect"],
        correct_vec=[
            "react hooks state management",
            "react functional component hooks",
        ],
    ),
    TechnicalTerm(
        term="container",
        context_indicators=[
            "docker",
            "kubernetes",
            "k8s",
            "image",
            "orchestration",
            "pod",
        ],
        wrong_expansions=[
            "box",
            "storage",
            "shipping",
            "cargo",
            "tupperware",
            "jar",
            "vessel",
        ],
        correct_domain="Docker/Kubernetes containers",
        correct_lex=["docker container", "container image"],
        correct_vec=[
            "docker container virtualization",
            "container orchestration platform",
        ],
    ),
    TechnicalTerm(
        term="branch",
        context_indicators=[
            "git",
            "merge",
            "checkout",
            "commit",
            "main",
            "master",
            "repo",
        ],
        wrong_expansions=["tree", "limb", "wood", "leaf", "twig", "forest"],
        correct_domain="Git version control",
        correct_lex=["git branch", "git checkout branch"],
        correct_vec=["git branch version control", "git branching workflow"],
    ),
    TechnicalTerm(
        term="decorator",
        context_indicators=["python", "@", "function", "wrapper", "class", "def"],
        wrong_expansions=[
            "interior",
            "design",
            "paint",
            "furniture",
            "decor",
            "ornament",
        ],
        correct_domain="Python decorators",
        correct_lex=["python decorator function", "@decorator syntax"],
        correct_vec=["python function decorators", "python decorator pattern"],
    ),
    TechnicalTerm(
        term="bean",
        context_indicators=[
            "java",
            "spring",
            "injection",
            "dependency",
            "servlet",
            "ejb",
        ],
        wrong_expansions=["coffee", "food", "vegetable", "legume", "plant", "soy"],
        correct_domain="Java Beans / Spring Beans",
        correct_lex=["java bean class", "spring bean injection"],
        correct_vec=["java enterprise beans", "spring dependency injection beans"],
    ),
    TechnicalTerm(
        term="shell",
        context_indicators=[
            "bash",
            "script",
            "terminal",
            "command",
            "linux",
            "unix",
            "zsh",
        ],
        wrong_expansions=["seashell", "ocean", "beach", "clam", "oyster", "egg"],
        correct_domain="Unix/Linux shell scripting",
        correct_lex=["bash shell script", "shell command"],
        correct_vec=["unix shell scripting", "bash command line shell"],
    ),
    TechnicalTerm(
        term="rust",
        context_indicators=[
            "cargo",
            "crate",
            "ownership",
            "borrow",
            "lifetime",
            "unsafe",
        ],
        wrong_expansions=["oxidation", "metal", "corrosion", "decay", "iron", "orange"],
        correct_domain="Rust programming language",
        correct_lex=["rust programming language", "rust cargo package"],
        correct_vec=["rust systems programming", "rust memory safety"],
    ),
    TechnicalTerm(
        term="go",
        context_indicators=[
            "golang",
            "goroutine",
            "channel",
            "defer",
            "gofmt",
            "module",
        ],
        wrong_expansions=[
            "travel",
            "move",
            "walk",
            "game",
            "board game",
            "leave",
            "depart",
        ],
        correct_domain="Go programming language",
        correct_lex=["golang programming", "go language syntax"],
        correct_vec=["go programming language", "golang concurrent programming"],
    ),
    TechnicalTerm(
        term="swift",
        context_indicators=["ios", "xcode", "apple", "uikit", "swiftui", "cocoa"],
        wrong_expansions=["fast", "quick", "bird", "speed", "rapid", "taylor"],
        correct_domain="Swift programming language",
        correct_lex=["swift ios development", "swift programming language"],
        correct_vec=["swift apple programming language", "swift ios app development"],
    ),
    TechnicalTerm(
        term="pod",
        context_indicators=[
            "kubernetes",
            "k8s",
            "deployment",
            "service",
            "cluster",
            "node",
        ],
        wrong_expansions=["pea", "seed", "plant", "vegetable", "legume", "whale"],
        correct_domain="Kubernetes pods",
        correct_lex=["kubernetes pod", "k8s pod deployment"],
        correct_vec=["kubernetes pod container group", "k8s pod orchestration"],
    ),
    TechnicalTerm(
        term="redis",
        context_indicators=[
            "cache",
            "database",
            "key-value",
            "memory",
            "pub/sub",
            "queue",
        ],
        wrong_expansions=[],  # "redis" doesn't have common wrong meanings
        correct_domain="Redis in-memory database",
        correct_lex=["redis cache", "redis database"],
        correct_vec=["redis in-memory data store", "redis caching solution"],
    ),
    TechnicalTerm(
        term="kafka",
        context_indicators=[
            "message",
            "stream",
            "queue",
            "broker",
            "topic",
            "producer",
            "consumer",
        ],
        wrong_expansions=[
            "franz",
            "author",
            "writer",
            "novel",
            "metamorphosis",
            "literature",
        ],
        correct_domain="Apache Kafka message queue",
        correct_lex=["apache kafka", "kafka message broker"],
        correct_vec=["apache kafka streaming platform", "kafka message queue"],
    ),
    TechnicalTerm(
        term="elastic",
        context_indicators=[
            "elasticsearch",
            "search",
            "index",
            "kibana",
            "logstash",
            "query",
        ],
        wrong_expansions=["stretch", "rubber", "flexible", "band", "bouncy"],
        correct_domain="Elasticsearch",
        correct_lex=["elasticsearch", "elastic search index"],
        correct_vec=["elasticsearch full-text search", "elastic stack"],
    ),
    TechnicalTerm(
        term="spark",
        context_indicators=["apache", "hadoop", "data", "rdd", "dataframe", "pyspark"],
        wrong_expansions=["fire", "ignite", "flame", "plug", "electricity"],
        correct_domain="Apache Spark",
        correct_lex=["apache spark", "spark data processing"],
        correct_vec=["apache spark big data processing", "spark cluster computing"],
    ),
    TechnicalTerm(
        term="flask",
        context_indicators=["python", "web", "route", "api", "jinja", "werkzeug"],
        wrong_expansions=[
            "bottle",
            "container",
            "lab",
            "chemistry",
            "drink",
            "thermos",
        ],
        correct_domain="Flask web framework",
        correct_lex=["flask python web framework", "flask api"],
        correct_vec=["flask python web development", "flask microframework"],
    ),
    TechnicalTerm(
        term="django",
        context_indicators=["python", "web", "orm", "model", "view", "template"],
        wrong_expansions=["jazz", "music", "reinhardt", "guitar", "movie", "western"],
        correct_domain="Django web framework",
        correct_lex=["django python framework", "django web development"],
        correct_vec=["django python web framework", "django orm models"],
    ),
    TechnicalTerm(
        term="rails",
        context_indicators=[
            "ruby",
            "gem",
            "activerecord",
            "model",
            "controller",
            "migration",
        ],
        wrong_expansions=["train", "track", "railroad", "railway", "metal"],
        correct_domain="Ruby on Rails",
        correct_lex=["ruby on rails", "rails web framework"],
        correct_vec=["ruby on rails framework", "rails mvc architecture"],
    ),
    TechnicalTerm(
        term="node",
        context_indicators=[
            "javascript",
            "npm",
            "express",
            "async",
            "require",
            "module",
        ],
        wrong_expansions=["lump", "knot", "bump", "growth", "junction"],
        correct_domain="Node.js",
        correct_lex=["node.js javascript", "nodejs runtime"],
        correct_vec=["node.js javascript runtime", "nodejs server-side javascript"],
    ),
    TechnicalTerm(
        term="maven",
        context_indicators=[
            "java",
            "pom",
            "dependency",
            "build",
            "artifact",
            "repository",
        ],
        wrong_expansions=["expert", "specialist", "connoisseur"],
        correct_domain="Apache Maven",
        correct_lex=["apache maven", "maven build tool"],
        correct_vec=["apache maven java build", "maven dependency management"],
    ),
    TechnicalTerm(
        term="gradle",
        context_indicators=["java", "kotlin", "android", "build", "groovy", "task"],
        wrong_expansions=["grade", "slope", "hill", "incline"],
        correct_domain="Gradle build tool",
        correct_lex=["gradle build tool", "gradle android"],
        correct_vec=["gradle java build automation", "gradle kotlin dsl"],
    ),
    TechnicalTerm(
        term="ant",
        context_indicators=["java", "build", "xml", "target", "task"],
        wrong_expansions=["insect", "bug", "colony", "hill", "picnic"],
        correct_domain="Apache Ant build tool",
        correct_lex=["apache ant", "ant build xml"],
        correct_vec=["apache ant java build", "ant build automation"],
    ),
]
415
+
416
+
417
@dataclass
class Issue:
    """Represents an issue found in a dataset example."""

    line_number: int  # 1-based line in the source JSONL file
    input_text: str  # The query/input field of the example
    output_text: str  # Expansion output; truncated to 200 chars + "..." by the analyzer
    issue_type: str  # "wrong_tech_expansion" or "ambiguous_term"
    technical_term: str  # The TechnicalTerm.term that triggered the issue
    wrong_expansion_found: str  # The offending wrong-expansion string from the output
    suggested_fix: Optional[str] = None  # Replacement output text, only for definite tech errors
428
+
429
+
430
@dataclass
class AnalysisResult:
    """Results of analyzing the dataset."""

    # Number of successfully parsed examples.
    total_examples: int = 0
    # All issues detected by analyze_example, across the whole file.
    issues_found: list[Issue] = field(default_factory=list)
    # NOTE(review): never populated by the code visible in this file — confirm
    # whether any caller still relies on it before removing.
    examples_with_correct_tech_terms: list[tuple[int, str]] = field(
        default_factory=list
    )
    # Count of inputs mentioning each known technical term (term -> count).
    term_statistics: dict = field(default_factory=lambda: defaultdict(int))
440
+
441
+
442
def check_for_wrong_expansion(output_text: str, term: TechnicalTerm) -> Optional[str]:
    """Return the first known-wrong expansion present in *output_text*.

    Matching is case-insensitive and anchored on word boundaries: the
    previous plain-substring test produced false positives such as "bug"
    firing inside "debug" or "stone" inside "gemstone". Multi-word
    expansions (e.g. "board game") are escaped and matched literally.

    Args:
        output_text: The model's expansion output to inspect.
        term: The technical term whose wrong expansions to look for.

    Returns:
        The offending wrong-expansion string, or None if none matched.
    """
    output_lower = output_text.lower()
    for wrong in term.wrong_expansions:
        # \b ... \b: require the expansion to appear as a whole word/phrase.
        if re.search(rf"\b{re.escape(wrong.lower())}\b", output_lower):
            return wrong
    return None
449
+
450
+
451
def has_tech_context(input_text: str, term: TechnicalTerm) -> bool:
    """Return True when the input mentions any of the term's tech-context indicators.

    Matching is case-insensitive substring containment (deliberately loose:
    indicators such as "@" or "gemfile" may appear inside larger tokens).
    """
    lowered = input_text.lower()
    return any(indicator.lower() in lowered for indicator in term.context_indicators)
458
+
459
+
460
def is_likely_tech_query(input_text: str) -> bool:
    """
    Heuristic to determine if a short query is likely tech-related.
    Short queries like "gem find" or "yarn spin" are ambiguous.
    """
    lowered = input_text.lower()
    # Word-boundary-anchored vocabularies of common developer terms.
    signal_patterns = (
        r"\b(install|config|setup|build|run|debug|test|deploy|compile)\b",
        r"\b(api|cli|sdk|lib|pkg|npm|pip|cargo)\b",
        r"\b(func|class|method|var|const|let|def)\b",
        r"\b(http|https|url|port|host|server|client)\b",
        r"\b(json|xml|yaml|csv|sql|html|css|js)\b",
    )
    return any(re.search(pattern, lowered) for pattern in signal_patterns)
477
+
478
+
479
# Non-technical context indicators for each ambiguous term. Hoisted to module
# level so the table is built once instead of on every has_non_tech_context call.
_NON_TECH_CONTEXTS: dict[str, list[str]] = {
    "rust": [
        "car",
        "metal",
        "iron",
        "steel",
        "corrosion",
        "prevention",
        "remove",
        "body",
    ],
    "gem": [
        "gemstone",
        "jewelry",
        "jewel",
        "diamond",
        "precious",
        "stone",
        "cut",
        "shop",
        "buy",
        "wear",
    ],
    "yarn": [
        "knit",
        "crochet",
        "spin",
        "wool",
        "thread",
        "textile",
        "fabric",
        "sew",
        "weave",
    ],
    "hook": ["fishing", "crochet", "hang", "coat", "wall", "ceiling"],
    "container": [
        "storage",
        "plastic",
        "food",
        "shipping",
        "cargo",
        "kitchen",
        "box",
    ],
    "branch": ["tree", "bank", "library", "store", "office", "organization"],
    "decorator": [
        "interior",
        "home",
        "room",
        "house",
        "design",
        "party",
        "cake",
        "wedding",
    ],
    "bean": [
        "coffee",
        "soy",
        "kidney",
        "black",
        "green",
        "garden",
        "cooking",
        "food",
        "plant",
        "grow",
    ],
    "shell": [
        "sea",
        "beach",
        "egg",
        "nut",
        "turtle",
        "snail",
        "crab",
        "clam",
        "oyster",
    ],
    "spark": ["plug", "fire", "ignite", "car", "engine", "electric", "romance"],
    "go": ["travel", "vacation", "trip", "walk", "run", "leave", "visit", "tour"],
    "swift": ["taylor", "concert", "music", "singer", "speed", "fast", "bird"],
    "pod": ["pea", "whale", "orca", "dolphin", "vegetable", "seed", "plant"],
    "ant": ["insect", "colony", "fire", "carpenter", "pest", "bug", "picnic"],
    "node": ["lymph", "medical", "body", "tree", "network point"],
    "rails": ["train", "railroad", "railway", "track", "transit", "fence"],
    "flask": ["lab", "chemistry", "drink", "hip", "thermos", "bottle", "water"],
    "django": [
        "jazz",
        "music",
        "reinhardt",
        "guitar",
        "movie",
        "western",
        "unchained",
    ],
    "maven": ["expert", "connoisseur", "specialist", "guru"],
    "gradle": ["grade", "school", "slope"],
    "kafka": [
        "franz",
        "author",
        "novel",
        "metamorphosis",
        "literature",
        "writer",
        "book",
    ],
    "elastic": ["band", "rubber", "stretch", "flexible", "waist", "fabric"],
}


def has_non_tech_context(input_text: str, term: TechnicalTerm) -> bool:
    """
    Check if the input clearly indicates a non-technical context.
    This helps avoid false positives for words like "car rust", "yarn spin", etc.

    Args:
        input_text: The query text being analyzed.
        term: The ambiguous technical term under consideration.

    Returns:
        True when any everyday-sense context word for the term appears in the
        input (case-insensitive substring); False for unknown terms.
    """
    input_lower = input_text.lower()
    # Terms without an entry have no known everyday meaning -> never non-tech.
    context_words = _NON_TECH_CONTEXTS.get(term.term.lower(), ())
    return any(word.lower() in input_lower for word in context_words)
603
+
604
+
605
def _truncate_output(text: str, limit: int = 200) -> str:
    """Shorten long output text for readable Issue records."""
    return text[:limit] + "..." if len(text) > limit else text


def analyze_example(line_num: int, input_text: str, output_text: str) -> list[Issue]:
    """Analyze a single example for potential issues.

    Flags examples whose output contains a known-wrong expansion of a
    technical term appearing in the input, unless the input clearly uses the
    term in its everyday sense. Tech-context hits get a concrete suggested
    fix; bare two-word queries are flagged as ambiguous for human review.

    Args:
        line_num: 1-based line number of the example in the source file.
        input_text: The query text of the example.
        output_text: The expansion output text of the example.

    Returns:
        A list of Issue records (possibly empty).
    """
    issues: list[Issue] = []
    input_lower = input_text.lower()
    # Loop-invariant tokenization, hoisted out of the per-term loop.
    words = [w.lower() for w in input_text.split()]
    word_count = len(words)

    for term in KNOWN_TECHNICAL_TERMS:
        term_lower = term.term.lower()

        # Whole-word match. The previous substring test flagged e.g. "go"
        # inside "good" and "ant" inside "want", polluting the results.
        if not re.search(rf"\b{re.escape(term_lower)}\b", input_lower):
            continue

        # Check if output has wrong expansion
        wrong_expansion = check_for_wrong_expansion(output_text, term)
        if wrong_expansion is None:
            continue

        # Skip if the context clearly indicates non-technical usage
        if has_non_tech_context(input_text, term):
            continue

        # Determine if this is likely a technical context
        is_tech = has_tech_context(input_text, term) or is_likely_tech_query(input_text)

        if is_tech:
            # Definite tech issue: build a concrete replacement output from
            # the term's curated expansions (single-entry lists are repeated).
            lex = term.correct_lex
            vec = term.correct_vec
            suggested_output = "\n".join(
                [
                    f"lex: {lex[0]}",
                    f"lex: {lex[1] if len(lex) > 1 else lex[0]}",
                    f"vec: {vec[0]}",
                    f"vec: {vec[1] if len(vec) > 1 else vec[0]}",
                    f"hyde: {term.correct_domain} is a concept that provides functionality for software development.",
                ]
            )
            issues.append(
                Issue(
                    line_number=line_num,
                    input_text=input_text,
                    output_text=_truncate_output(output_text),
                    issue_type="wrong_tech_expansion",
                    technical_term=term.term,
                    wrong_expansion_found=wrong_expansion,
                    suggested_fix=suggested_output,
                )
            )
        elif word_count <= 2 and term_lower in words:
            # Very short query with the term as a primary word - truly ambiguous
            issues.append(
                Issue(
                    line_number=line_num,
                    input_text=input_text,
                    output_text=_truncate_output(output_text),
                    issue_type="ambiguous_term",
                    technical_term=term.term,
                    wrong_expansion_found=wrong_expansion,
                    suggested_fix=None,
                )
            )

    return issues
668
+
669
+
670
def analyze_dataset(filepath: Path) -> AnalysisResult:
    """Analyze the entire dataset for issues.

    Reads a JSONL file, normalizes each example's output, runs
    analyze_example on it, and tallies how often each known technical term
    appears in the inputs. Unparseable lines are reported and skipped.

    Args:
        filepath: Path to the JSONL dataset.

    Returns:
        An AnalysisResult with totals, issues, and term statistics.
    """
    result = AnalysisResult()

    with open(filepath, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue

            # Keep the try minimal: json.loads is the only statement here
            # that can raise JSONDecodeError.
            try:
                example = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Warning: Could not parse line {line_num}: {e}")
                continue

            input_text = example.get("query", "") or example.get("input", "")
            output_raw = example.get("output", [])
            if isinstance(output_raw, str):
                output_items = normalize_output_items(parse_output_text(output_raw))
            else:
                output_items = normalize_output_items(output_raw)
            output_text = output_items_to_text(output_items)

            result.total_examples += 1

            # Analyze for issues
            result.issues_found.extend(analyze_example(line_num, input_text, output_text))

            # Track term statistics. Whole-word match: the previous substring
            # test counted e.g. "go" inside "good" as an occurrence.
            input_lower = input_text.lower()
            for term in KNOWN_TECHNICAL_TERMS:
                if re.search(rf"\b{re.escape(term.term.lower())}\b", input_lower):
                    result.term_statistics[term.term] += 1

    return result
705
+
706
+
707
def fix_example(example: dict, issues: list[Issue]) -> Optional[dict]:
    """
    Attempt to fix an example based on identified issues.
    Returns None if no fix is needed or possible.
    """
    # Only definite tech-context mistakes carrying a concrete replacement
    # are auto-fixable; take the first such issue (they should be similar).
    issue = next(
        (
            candidate
            for candidate in issues
            if candidate.issue_type == "wrong_tech_expansion"
            and candidate.suggested_fix
        ),
        None,
    )
    if issue is None or not issue.suggested_fix:
        return None

    repaired = example.copy()
    repaired["output"] = normalize_output_items(parse_output_text(issue.suggested_fix))
    repaired["_fixed"] = True

    # Preserve the original output (normalized) for auditing.
    original_items = example.get("output", [])
    if isinstance(original_items, str):
        original_items = normalize_output_items(parse_output_text(original_items))
    repaired["_original_output"] = output_items_to_text(original_items)
    repaired["_fix_reason"] = (
        f"Technical term '{issue.technical_term}' was incorrectly expanded as '{issue.wrong_expansion_found}'"
    )

    return repaired
738
+
739
+
740
def generate_report(result: AnalysisResult) -> str:
    """Generate a human-readable report of the analysis."""
    heavy_rule = "=" * 70
    light_rule = "-" * 70

    out: list[str] = [
        heavy_rule,
        "QUERY EXPANSION DATASET QUALITY REPORT",
        heavy_rule,
        "",
        f"Total examples analyzed: {result.total_examples}",
        f"Issues found: {len(result.issues_found)}",
        "",
        light_rule,
        "ISSUES BY TYPE:",
        light_rule,
    ]

    # Group issues by type, preserving first-seen order.
    grouped = defaultdict(list)
    for found in result.issues_found:
        grouped[found.issue_type].append(found)

    for issue_type, bucket in grouped.items():
        out.append(f"\n{issue_type.upper()}: {len(bucket)} issues")
        out.append("-" * 40)

        # Show up to 10 examples per type
        for found in bucket[:10]:
            out.append(f"\n  Line {found.line_number}:")
            out.append(f"    Input: {found.input_text}")
            out.append(f"    Technical term: '{found.technical_term}'")
            out.append(f"    Wrong expansion found: '{found.wrong_expansion_found}'")
            if found.suggested_fix:
                out.append("    Suggested fix available: Yes")

        if len(bucket) > 10:
            out.append(f"\n  ... and {len(bucket) - 10} more")

    # Term statistics, most frequent first.
    out.append("\n" + light_rule)
    out.append("TECHNICAL TERM OCCURRENCES IN DATASET:")
    out.append(light_rule)
    for term_name, count in sorted(result.term_statistics.items(), key=lambda kv: -kv[1]):
        if count > 0:
            out.append(f"  {term_name}: {count} occurrences")

    out.append("\n" + heavy_rule)

    return "\n".join(out)
788
+
789
+
790
def save_cleaned_dataset(filepath: Path, output_path: Path, result: AnalysisResult):
    """Save a cleaned version of the dataset."""
    # Index issues by their source line for O(1) lookup during the rewrite.
    issues_by_line = defaultdict(list)
    for found in result.issues_found:
        issues_by_line[found.line_number].append(found)

    fixed_count = 0
    flagged_count = 0

    with (
        open(filepath, "r", encoding="utf-8") as src,
        open(output_path, "w", encoding="utf-8") as dst,
    ):
        for line_num, raw in enumerate(src, 1):
            raw = raw.strip()
            if not raw:
                continue

            try:
                example = json.loads(raw)
                # Normalize the legacy "input" field name to "query".
                if "query" not in example and "input" in example:
                    example["query"] = example.pop("input")

                output_raw = example.get("output", [])
                if isinstance(output_raw, str):
                    example["output"] = normalize_output_items(
                        parse_output_text(output_raw)
                    )
                else:
                    example["output"] = normalize_output_items(output_raw)

                line_issues = issues_by_line.get(line_num)
                if not line_issues:
                    dst.write(json.dumps(example) + "\n")
                elif (fixed := fix_example(example, line_issues)) is not None:
                    dst.write(json.dumps(fixed) + "\n")
                    fixed_count += 1
                else:
                    # Flag but don't fix ambiguous cases
                    example["_flagged"] = True
                    example["_flag_reason"] = (
                        f"Ambiguous term '{line_issues[0].technical_term}' may need review"
                    )
                    dst.write(json.dumps(example) + "\n")
                    flagged_count += 1

            except json.JSONDecodeError:
                # Keep problematic lines as-is
                dst.write(raw + "\n")

    return fixed_count, flagged_count
844
+
845
+
846
def main():
    """Main entry point."""
    # All inputs/outputs live next to this script under data/.
    data_dir = Path(__file__).parent / "data"
    input_path = data_dir / "qmd_expansion.jsonl"
    output_path = data_dir / "qmd_expansion_cleaned.jsonl"
    report_path = data_dir / "quality_report.txt"

    print(f"Analyzing dataset: {input_path}")
    print("-" * 50)

    if not input_path.exists():
        print(f"Error: Input file not found: {input_path}")
        return 1

    # Analyze, report, then write the cleaned dataset.
    result = analyze_dataset(input_path)

    report = generate_report(result)
    print(report)

    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report)
    print(f"\nReport saved to: {report_path}")

    fixed_count, flagged_count = save_cleaned_dataset(input_path, output_path, result)

    print(f"\nCleaned dataset saved to: {output_path}")
    print(f"  - Examples fixed: {fixed_count}")
    print(f"  - Examples flagged for review: {flagged_count}")
    print(
        f"  - Examples unchanged: {result.total_examples - fixed_count - flagged_count}"
    )

    # Summary statistics
    banner = "=" * 50
    print("\n" + banner)
    print("SUMMARY")
    print(banner)
    print(f"Total examples: {result.total_examples}")
    print(f"Total issues found: {len(result.issues_found)}")

    tech_count = sum(
        1 for found in result.issues_found if found.issue_type == "wrong_tech_expansion"
    )
    ambig_count = sum(
        1 for found in result.issues_found if found.issue_type == "ambiguous_term"
    )

    print(f"  - Definite tech term errors: {tech_count}")
    print(f"  - Ambiguous terms needing review: {ambig_count}")

    if result.issues_found:
        error_rate = len(result.issues_found) / result.total_examples * 100
        print(f"\nError rate: {error_rate:.2f}%")

    return 0
903
+
904
+
905
if __name__ == "__main__":
    # `exit()` is an interactive-session helper injected by the `site` module
    # and is not guaranteed in all environments (e.g. `python -S`, frozen
    # apps); raising SystemExit is the portable equivalent.
    raise SystemExit(main())