qmdr 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. package/.claude-plugin/marketplace.json +29 -0
  2. package/.env.example +85 -0
  3. package/.gitattributes +3 -0
  4. package/.github/workflows/release.yml +77 -0
  5. package/AI-SETUP.md +466 -0
  6. package/LICENSE +22 -0
  7. package/README.md +78 -0
  8. package/bun.lock +637 -0
  9. package/docs/README-zh.md +78 -0
  10. package/docs/refactor-checklist.md +54 -0
  11. package/docs/setup-openclaw.md +139 -0
  12. package/example-index.yml +33 -0
  13. package/finetune/BALANCED_DISTRIBUTION.md +157 -0
  14. package/finetune/DATA_IMPROVEMENTS.md +218 -0
  15. package/finetune/Justfile +43 -0
  16. package/finetune/Modelfile +16 -0
  17. package/finetune/README.md +299 -0
  18. package/finetune/SCORING.md +286 -0
  19. package/finetune/configs/accelerate_multi_gpu.yaml +17 -0
  20. package/finetune/configs/grpo.yaml +49 -0
  21. package/finetune/configs/sft.yaml +42 -0
  22. package/finetune/configs/sft_local.yaml +40 -0
  23. package/finetune/convert_gguf.py +221 -0
  24. package/finetune/data/best_glm_prompt.txt +17 -0
  25. package/finetune/data/gepa_generated.prompts.json +32 -0
  26. package/finetune/data/qmd_expansion_balanced_deduped.jsonl +413 -0
  27. package/finetune/data/qmd_expansion_diverse_addon.jsonl +386 -0
  28. package/finetune/data/qmd_expansion_handcrafted.jsonl +65 -0
  29. package/finetune/data/qmd_expansion_handcrafted_only.jsonl +336 -0
  30. package/finetune/data/qmd_expansion_locations.jsonl +64 -0
  31. package/finetune/data/qmd_expansion_people.jsonl +46 -0
  32. package/finetune/data/qmd_expansion_short_nontech.jsonl +200 -0
  33. package/finetune/data/qmd_expansion_v2.jsonl +1498 -0
  34. package/finetune/data/qmd_only_sampled.jsonl +399 -0
  35. package/finetune/dataset/analyze_data.py +369 -0
  36. package/finetune/dataset/clean_data.py +906 -0
  37. package/finetune/dataset/generate_balanced.py +823 -0
  38. package/finetune/dataset/generate_data.py +714 -0
  39. package/finetune/dataset/generate_data_offline.py +206 -0
  40. package/finetune/dataset/generate_diverse.py +441 -0
  41. package/finetune/dataset/generate_ollama.py +326 -0
  42. package/finetune/dataset/prepare_data.py +197 -0
  43. package/finetune/dataset/schema.py +73 -0
  44. package/finetune/dataset/score_data.py +115 -0
  45. package/finetune/dataset/validate_schema.py +104 -0
  46. package/finetune/eval.py +196 -0
  47. package/finetune/evals/queries.txt +56 -0
  48. package/finetune/gepa/__init__.py +1 -0
  49. package/finetune/gepa/best_prompt.txt +31 -0
  50. package/finetune/gepa/best_prompt_glm.txt +1 -0
  51. package/finetune/gepa/dspy_gepa.py +204 -0
  52. package/finetune/gepa/example.py +117 -0
  53. package/finetune/gepa/generate.py +129 -0
  54. package/finetune/gepa/gepa_outputs.jsonl +10 -0
  55. package/finetune/gepa/gepa_outputs_glm.jsonl +20 -0
  56. package/finetune/gepa/model.json +19 -0
  57. package/finetune/gepa/optimizer.py +70 -0
  58. package/finetune/gepa/score.py +84 -0
  59. package/finetune/jobs/eval.py +490 -0
  60. package/finetune/jobs/eval_common.py +354 -0
  61. package/finetune/jobs/eval_verbose.py +113 -0
  62. package/finetune/jobs/grpo.py +141 -0
  63. package/finetune/jobs/quantize.py +244 -0
  64. package/finetune/jobs/sft.py +121 -0
  65. package/finetune/pyproject.toml +23 -0
  66. package/finetune/reward.py +610 -0
  67. package/finetune/train.py +611 -0
  68. package/finetune/uv.lock +4070 -0
  69. package/flake.lock +61 -0
  70. package/flake.nix +83 -0
  71. package/migrate-schema.ts +162 -0
  72. package/package.json +56 -0
  73. package/skills/qmdr/SKILL.md +172 -0
  74. package/skills/qmdr/references/mcp-setup.md +88 -0
  75. package/src/app/commands/collection.ts +55 -0
  76. package/src/app/commands/context.ts +82 -0
  77. package/src/app/commands/document.ts +46 -0
  78. package/src/app/commands/maintenance.ts +60 -0
  79. package/src/app/commands/search.ts +45 -0
  80. package/src/app/ports/llm.ts +13 -0
  81. package/src/app/services/llm-service.ts +145 -0
  82. package/src/cli.test.ts +963 -0
  83. package/src/collections.ts +390 -0
  84. package/src/eval.test.ts +412 -0
  85. package/src/formatter.ts +427 -0
  86. package/src/llm.test.ts +559 -0
  87. package/src/llm.ts +1990 -0
  88. package/src/mcp.test.ts +889 -0
  89. package/src/mcp.ts +626 -0
  90. package/src/qmd.ts +3330 -0
  91. package/src/store/collections.ts +7 -0
  92. package/src/store/context.ts +10 -0
  93. package/src/store/db.ts +5 -0
  94. package/src/store/documents.ts +26 -0
  95. package/src/store/maintenance.ts +15 -0
  96. package/src/store/path.ts +13 -0
  97. package/src/store/search.ts +10 -0
  98. package/src/store-paths.test.ts +395 -0
  99. package/src/store.test.ts +2483 -0
  100. package/src/store.ts +2813 -0
  101. package/test/eval-harness.ts +223 -0
  102. package/tsconfig.json +29 -0
@@ -0,0 +1,906 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Data Quality Reviewer for Query Expansion Training Dataset
4
+
5
+ This script identifies and flags/fixes semantic errors where technical terms
6
+ are misunderstood. For example:
7
+ - "gem find" expanded as "mineral hunt" instead of "ruby gem search"
8
+ - "yarn spin" expanded as "wool twist" instead of "yarn package manager"
9
+
10
+ The script uses contextual analysis to detect when technical terms
11
+ are likely being used in a programming context vs. their everyday meaning.
12
+ """
13
+
14
+ import json
15
+ import re
16
+ from pathlib import Path
17
+ from dataclasses import dataclass, field
18
+ from typing import Optional
19
+ from collections import defaultdict
20
+
21
+ from dataset.schema import (
22
+ normalize_output_items,
23
+ output_items_to_text,
24
+ parse_output_text,
25
+ )
26
+
27
+
28
@dataclass
class TechnicalTerm:
    """Definition of a technical term that might be misunderstood.

    One entry per ambiguous word (e.g. "gem", "yarn"): the evidence used to
    decide whether a query means the technical sense, plus the replacement
    expansions emitted when a training example got the sense wrong.
    """

    term: str  # The ambiguous term (e.g., "liquid", "gem", "yarn")
    context_indicators: list[str]  # Words that suggest tech context
    wrong_expansions: list[str]  # Patterns that indicate wrong interpretation
    correct_domain: str  # What domain this belongs to when technical
    correct_lex: list[str]  # Correct lex expansions
    correct_vec: list[str]  # Correct vec expansions
39
+
40
# Known technical terms that are commonly misunderstood.
# Consulted by analyze_example/analyze_dataset: `term` is matched against the
# query, `wrong_expansions` against the model output, `context_indicators`
# decide tech vs. everyday sense, and `correct_*` feed the suggested fix.
KNOWN_TECHNICAL_TERMS: list[TechnicalTerm] = [
    TechnicalTerm(
        term="liquid",
        context_indicators=["shopify", "template", "filter", "tag", "theme", "jekyll"],
        wrong_expansions=["fluid", "water", "pour", "drink", "beverage", "h2o", "wet"],
        correct_domain="Shopify/Jekyll templating language",
        correct_lex=["shopify template syntax", "liquid template filter"],
        correct_vec=[
            "shopify liquid templating language",
            "liquid template engine filters",
        ],
    ),
    TechnicalTerm(
        term="gem",
        context_indicators=[
            "ruby",
            "bundler",
            "install",
            "gemfile",
            "rails",
            "require",
        ],
        wrong_expansions=[
            "mineral",
            "crystal",
            "jewel",
            "stone",
            "diamond",
            "jewelry",
            "precious",
        ],
        correct_domain="Ruby package manager",
        correct_lex=["ruby gem package", "gem install command"],
        correct_vec=["ruby gem package manager", "rubygems library installation"],
    ),
    TechnicalTerm(
        term="yarn",
        context_indicators=[
            "npm",
            "package",
            "install",
            "node",
            "javascript",
            "react",
            "webpack",
        ],
        wrong_expansions=[
            "thread",
            "wool",
            "knit",
            "spin",
            "textile",
            "fabric",
            "sew",
            "twist",
        ],
        correct_domain="JavaScript package manager",
        correct_lex=["yarn package manager", "yarn install dependencies"],
        correct_vec=["yarn javascript package manager", "yarn npm alternative"],
    ),
    TechnicalTerm(
        term="hook",
        context_indicators=[
            "react",
            "use",
            "state",
            "effect",
            "component",
            "callback",
            "git",
        ],
        wrong_expansions=["fish", "fishing", "bait", "catch", "hang", "pirate"],
        correct_domain="React hooks or Git hooks",
        correct_lex=["react hooks api", "usestate useeffect"],
        correct_vec=[
            "react hooks state management",
            "react functional component hooks",
        ],
    ),
    TechnicalTerm(
        term="container",
        context_indicators=[
            "docker",
            "kubernetes",
            "k8s",
            "image",
            "orchestration",
            "pod",
        ],
        wrong_expansions=[
            "box",
            "storage",
            "shipping",
            "cargo",
            "tupperware",
            "jar",
            "vessel",
        ],
        correct_domain="Docker/Kubernetes containers",
        correct_lex=["docker container", "container image"],
        correct_vec=[
            "docker container virtualization",
            "container orchestration platform",
        ],
    ),
    TechnicalTerm(
        term="branch",
        context_indicators=[
            "git",
            "merge",
            "checkout",
            "commit",
            "main",
            "master",
            "repo",
        ],
        wrong_expansions=["tree", "limb", "wood", "leaf", "twig", "forest"],
        correct_domain="Git version control",
        correct_lex=["git branch", "git checkout branch"],
        correct_vec=["git branch version control", "git branching workflow"],
    ),
    TechnicalTerm(
        term="decorator",
        context_indicators=["python", "@", "function", "wrapper", "class", "def"],
        wrong_expansions=[
            "interior",
            "design",
            "paint",
            "furniture",
            "decor",
            "ornament",
        ],
        correct_domain="Python decorators",
        correct_lex=["python decorator function", "@decorator syntax"],
        correct_vec=["python function decorators", "python decorator pattern"],
    ),
    TechnicalTerm(
        term="bean",
        context_indicators=[
            "java",
            "spring",
            "injection",
            "dependency",
            "servlet",
            "ejb",
        ],
        wrong_expansions=["coffee", "food", "vegetable", "legume", "plant", "soy"],
        correct_domain="Java Beans / Spring Beans",
        correct_lex=["java bean class", "spring bean injection"],
        correct_vec=["java enterprise beans", "spring dependency injection beans"],
    ),
    TechnicalTerm(
        term="shell",
        context_indicators=[
            "bash",
            "script",
            "terminal",
            "command",
            "linux",
            "unix",
            "zsh",
        ],
        wrong_expansions=["seashell", "ocean", "beach", "clam", "oyster", "egg"],
        correct_domain="Unix/Linux shell scripting",
        correct_lex=["bash shell script", "shell command"],
        correct_vec=["unix shell scripting", "bash command line shell"],
    ),
    TechnicalTerm(
        term="rust",
        context_indicators=[
            "cargo",
            "crate",
            "ownership",
            "borrow",
            "lifetime",
            "unsafe",
        ],
        wrong_expansions=["oxidation", "metal", "corrosion", "decay", "iron", "orange"],
        correct_domain="Rust programming language",
        correct_lex=["rust programming language", "rust cargo package"],
        correct_vec=["rust systems programming", "rust memory safety"],
    ),
    TechnicalTerm(
        term="go",
        context_indicators=[
            "golang",
            "goroutine",
            "channel",
            "defer",
            "gofmt",
            "module",
        ],
        wrong_expansions=[
            "travel",
            "move",
            "walk",
            "game",
            "board game",
            "leave",
            "depart",
        ],
        correct_domain="Go programming language",
        correct_lex=["golang programming", "go language syntax"],
        correct_vec=["go programming language", "golang concurrent programming"],
    ),
    TechnicalTerm(
        term="swift",
        context_indicators=["ios", "xcode", "apple", "uikit", "swiftui", "cocoa"],
        wrong_expansions=["fast", "quick", "bird", "speed", "rapid", "taylor"],
        correct_domain="Swift programming language",
        correct_lex=["swift ios development", "swift programming language"],
        correct_vec=["swift apple programming language", "swift ios app development"],
    ),
    TechnicalTerm(
        term="pod",
        context_indicators=[
            "kubernetes",
            "k8s",
            "deployment",
            "service",
            "cluster",
            "node",
        ],
        wrong_expansions=["pea", "seed", "plant", "vegetable", "legume", "whale"],
        correct_domain="Kubernetes pods",
        correct_lex=["kubernetes pod", "k8s pod deployment"],
        correct_vec=["kubernetes pod container group", "k8s pod orchestration"],
    ),
    TechnicalTerm(
        term="redis",
        context_indicators=[
            "cache",
            "database",
            "key-value",
            "memory",
            "pub/sub",
            "queue",
        ],
        wrong_expansions=[],  # "redis" doesn't have common wrong meanings
        correct_domain="Redis in-memory database",
        correct_lex=["redis cache", "redis database"],
        correct_vec=["redis in-memory data store", "redis caching solution"],
    ),
    TechnicalTerm(
        term="kafka",
        context_indicators=[
            "message",
            "stream",
            "queue",
            "broker",
            "topic",
            "producer",
            "consumer",
        ],
        wrong_expansions=[
            "franz",
            "author",
            "writer",
            "novel",
            "metamorphosis",
            "literature",
        ],
        correct_domain="Apache Kafka message queue",
        correct_lex=["apache kafka", "kafka message broker"],
        correct_vec=["apache kafka streaming platform", "kafka message queue"],
    ),
    TechnicalTerm(
        term="elastic",
        context_indicators=[
            "elasticsearch",
            "search",
            "index",
            "kibana",
            "logstash",
            "query",
        ],
        wrong_expansions=["stretch", "rubber", "flexible", "band", "bouncy"],
        correct_domain="Elasticsearch",
        correct_lex=["elasticsearch", "elastic search index"],
        correct_vec=["elasticsearch full-text search", "elastic stack"],
    ),
    TechnicalTerm(
        term="spark",
        context_indicators=["apache", "hadoop", "data", "rdd", "dataframe", "pyspark"],
        wrong_expansions=["fire", "ignite", "flame", "plug", "electricity"],
        correct_domain="Apache Spark",
        correct_lex=["apache spark", "spark data processing"],
        correct_vec=["apache spark big data processing", "spark cluster computing"],
    ),
    TechnicalTerm(
        term="flask",
        context_indicators=["python", "web", "route", "api", "jinja", "werkzeug"],
        wrong_expansions=[
            "bottle",
            "container",
            "lab",
            "chemistry",
            "drink",
            "thermos",
        ],
        correct_domain="Flask web framework",
        correct_lex=["flask python web framework", "flask api"],
        correct_vec=["flask python web development", "flask microframework"],
    ),
    TechnicalTerm(
        term="django",
        context_indicators=["python", "web", "orm", "model", "view", "template"],
        wrong_expansions=["jazz", "music", "reinhardt", "guitar", "movie", "western"],
        correct_domain="Django web framework",
        correct_lex=["django python framework", "django web development"],
        correct_vec=["django python web framework", "django orm models"],
    ),
    TechnicalTerm(
        term="rails",
        context_indicators=[
            "ruby",
            "gem",
            "activerecord",
            "model",
            "controller",
            "migration",
        ],
        wrong_expansions=["train", "track", "railroad", "railway", "metal"],
        correct_domain="Ruby on Rails",
        correct_lex=["ruby on rails", "rails web framework"],
        correct_vec=["ruby on rails framework", "rails mvc architecture"],
    ),
    TechnicalTerm(
        term="node",
        context_indicators=[
            "javascript",
            "npm",
            "express",
            "async",
            "require",
            "module",
        ],
        wrong_expansions=["lump", "knot", "bump", "growth", "junction"],
        correct_domain="Node.js",
        correct_lex=["node.js javascript", "nodejs runtime"],
        correct_vec=["node.js javascript runtime", "nodejs server-side javascript"],
    ),
    TechnicalTerm(
        term="maven",
        context_indicators=[
            "java",
            "pom",
            "dependency",
            "build",
            "artifact",
            "repository",
        ],
        wrong_expansions=["expert", "specialist", "connoisseur"],
        correct_domain="Apache Maven",
        correct_lex=["apache maven", "maven build tool"],
        correct_vec=["apache maven java build", "maven dependency management"],
    ),
    TechnicalTerm(
        term="gradle",
        context_indicators=["java", "kotlin", "android", "build", "groovy", "task"],
        wrong_expansions=["grade", "slope", "hill", "incline"],
        correct_domain="Gradle build tool",
        correct_lex=["gradle build tool", "gradle android"],
        correct_vec=["gradle java build automation", "gradle kotlin dsl"],
    ),
    TechnicalTerm(
        term="ant",
        context_indicators=["java", "build", "xml", "target", "task"],
        wrong_expansions=["insect", "bug", "colony", "hill", "picnic"],
        correct_domain="Apache Ant build tool",
        correct_lex=["apache ant", "ant build xml"],
        correct_vec=["apache ant java build", "ant build automation"],
    ),
]
415
+
416
+
417
@dataclass
class Issue:
    """Represents an issue found in a dataset example."""

    line_number: int  # 1-based line in the source JSONL file
    input_text: str  # The query/input field of the example
    output_text: str  # Expansion output; truncated to 200 chars + "..." by the analyzer
    issue_type: str  # "wrong_tech_expansion" or "ambiguous_term"
    technical_term: str  # The TechnicalTerm.term that triggered the issue
    wrong_expansion_found: str  # The offending wrong-expansion string from the output
    suggested_fix: Optional[str] = None  # Replacement output text, only for definite tech errors
428
+
429
+
430
@dataclass
class AnalysisResult:
    """Results of analyzing the dataset."""

    # Number of successfully parsed examples.
    total_examples: int = 0
    # All issues detected by analyze_example, across the whole file.
    issues_found: list[Issue] = field(default_factory=list)
    # NOTE(review): never populated by the code visible in this file — confirm
    # whether any caller still relies on it before removing.
    examples_with_correct_tech_terms: list[tuple[int, str]] = field(
        default_factory=list
    )
    # Count of inputs mentioning each known technical term (term -> count).
    term_statistics: dict = field(default_factory=lambda: defaultdict(int))
440
+
441
+
442
def check_for_wrong_expansion(output_text: str, term: TechnicalTerm) -> Optional[str]:
    """Return the first known-wrong expansion present in *output_text*.

    Matching is case-insensitive and anchored on word boundaries: the
    previous plain-substring test produced false positives such as "bug"
    firing inside "debug" or "stone" inside "gemstone". Multi-word
    expansions (e.g. "board game") are escaped and matched literally.

    Args:
        output_text: The model's expansion output to inspect.
        term: The technical term whose wrong expansions to look for.

    Returns:
        The offending wrong-expansion string, or None if none matched.
    """
    output_lower = output_text.lower()
    for wrong in term.wrong_expansions:
        # \b ... \b: require the expansion to appear as a whole word/phrase.
        if re.search(rf"\b{re.escape(wrong.lower())}\b", output_lower):
            return wrong
    return None
449
+
450
+
451
def has_tech_context(input_text: str, term: TechnicalTerm) -> bool:
    """Return True when the input mentions any of the term's tech-context indicators.

    Matching is case-insensitive substring containment (deliberately loose:
    indicators such as "@" or "gemfile" may appear inside larger tokens).
    """
    lowered = input_text.lower()
    return any(indicator.lower() in lowered for indicator in term.context_indicators)
458
+
459
+
460
def is_likely_tech_query(input_text: str) -> bool:
    """
    Heuristic to determine if a short query is likely tech-related.
    Short queries like "gem find" or "yarn spin" are ambiguous.
    """
    lowered = input_text.lower()
    # Word-boundary-anchored vocabularies of common developer terms.
    signal_patterns = (
        r"\b(install|config|setup|build|run|debug|test|deploy|compile)\b",
        r"\b(api|cli|sdk|lib|pkg|npm|pip|cargo)\b",
        r"\b(func|class|method|var|const|let|def)\b",
        r"\b(http|https|url|port|host|server|client)\b",
        r"\b(json|xml|yaml|csv|sql|html|css|js)\b",
    )
    return any(re.search(pattern, lowered) for pattern in signal_patterns)
477
+
478
+
479
# Non-technical context indicators for each ambiguous term. Hoisted to module
# level so the table is built once instead of on every has_non_tech_context call.
_NON_TECH_CONTEXTS: dict[str, list[str]] = {
    "rust": [
        "car",
        "metal",
        "iron",
        "steel",
        "corrosion",
        "prevention",
        "remove",
        "body",
    ],
    "gem": [
        "gemstone",
        "jewelry",
        "jewel",
        "diamond",
        "precious",
        "stone",
        "cut",
        "shop",
        "buy",
        "wear",
    ],
    "yarn": [
        "knit",
        "crochet",
        "spin",
        "wool",
        "thread",
        "textile",
        "fabric",
        "sew",
        "weave",
    ],
    "hook": ["fishing", "crochet", "hang", "coat", "wall", "ceiling"],
    "container": [
        "storage",
        "plastic",
        "food",
        "shipping",
        "cargo",
        "kitchen",
        "box",
    ],
    "branch": ["tree", "bank", "library", "store", "office", "organization"],
    "decorator": [
        "interior",
        "home",
        "room",
        "house",
        "design",
        "party",
        "cake",
        "wedding",
    ],
    "bean": [
        "coffee",
        "soy",
        "kidney",
        "black",
        "green",
        "garden",
        "cooking",
        "food",
        "plant",
        "grow",
    ],
    "shell": [
        "sea",
        "beach",
        "egg",
        "nut",
        "turtle",
        "snail",
        "crab",
        "clam",
        "oyster",
    ],
    "spark": ["plug", "fire", "ignite", "car", "engine", "electric", "romance"],
    "go": ["travel", "vacation", "trip", "walk", "run", "leave", "visit", "tour"],
    "swift": ["taylor", "concert", "music", "singer", "speed", "fast", "bird"],
    "pod": ["pea", "whale", "orca", "dolphin", "vegetable", "seed", "plant"],
    "ant": ["insect", "colony", "fire", "carpenter", "pest", "bug", "picnic"],
    "node": ["lymph", "medical", "body", "tree", "network point"],
    "rails": ["train", "railroad", "railway", "track", "transit", "fence"],
    "flask": ["lab", "chemistry", "drink", "hip", "thermos", "bottle", "water"],
    "django": [
        "jazz",
        "music",
        "reinhardt",
        "guitar",
        "movie",
        "western",
        "unchained",
    ],
    "maven": ["expert", "connoisseur", "specialist", "guru"],
    "gradle": ["grade", "school", "slope"],
    "kafka": [
        "franz",
        "author",
        "novel",
        "metamorphosis",
        "literature",
        "writer",
        "book",
    ],
    "elastic": ["band", "rubber", "stretch", "flexible", "waist", "fabric"],
}


def has_non_tech_context(input_text: str, term: TechnicalTerm) -> bool:
    """
    Check if the input clearly indicates a non-technical context.
    This helps avoid false positives for words like "car rust", "yarn spin", etc.

    Args:
        input_text: The query text being analyzed.
        term: The ambiguous technical term under consideration.

    Returns:
        True when any everyday-sense context word for the term appears in the
        input (case-insensitive substring); False for unknown terms.
    """
    input_lower = input_text.lower()
    # Terms without an entry have no known everyday meaning -> never non-tech.
    context_words = _NON_TECH_CONTEXTS.get(term.term.lower(), ())
    return any(word.lower() in input_lower for word in context_words)
603
+
604
+
605
def _truncate_output(text: str, limit: int = 200) -> str:
    """Shorten long output text for readable Issue records."""
    return text[:limit] + "..." if len(text) > limit else text


def analyze_example(line_num: int, input_text: str, output_text: str) -> list[Issue]:
    """Analyze a single example for potential issues.

    Flags examples whose output contains a known-wrong expansion of a
    technical term appearing in the input, unless the input clearly uses the
    term in its everyday sense. Tech-context hits get a concrete suggested
    fix; bare two-word queries are flagged as ambiguous for human review.

    Args:
        line_num: 1-based line number of the example in the source file.
        input_text: The query text of the example.
        output_text: The expansion output text of the example.

    Returns:
        A list of Issue records (possibly empty).
    """
    issues: list[Issue] = []
    input_lower = input_text.lower()
    # Loop-invariant tokenization, hoisted out of the per-term loop.
    words = [w.lower() for w in input_text.split()]
    word_count = len(words)

    for term in KNOWN_TECHNICAL_TERMS:
        term_lower = term.term.lower()

        # Whole-word match. The previous substring test flagged e.g. "go"
        # inside "good" and "ant" inside "want", polluting the results.
        if not re.search(rf"\b{re.escape(term_lower)}\b", input_lower):
            continue

        # Check if output has wrong expansion
        wrong_expansion = check_for_wrong_expansion(output_text, term)
        if wrong_expansion is None:
            continue

        # Skip if the context clearly indicates non-technical usage
        if has_non_tech_context(input_text, term):
            continue

        # Determine if this is likely a technical context
        is_tech = has_tech_context(input_text, term) or is_likely_tech_query(input_text)

        if is_tech:
            # Definite tech issue: build a concrete replacement output from
            # the term's curated expansions (single-entry lists are repeated).
            lex = term.correct_lex
            vec = term.correct_vec
            suggested_output = "\n".join(
                [
                    f"lex: {lex[0]}",
                    f"lex: {lex[1] if len(lex) > 1 else lex[0]}",
                    f"vec: {vec[0]}",
                    f"vec: {vec[1] if len(vec) > 1 else vec[0]}",
                    f"hyde: {term.correct_domain} is a concept that provides functionality for software development.",
                ]
            )
            issues.append(
                Issue(
                    line_number=line_num,
                    input_text=input_text,
                    output_text=_truncate_output(output_text),
                    issue_type="wrong_tech_expansion",
                    technical_term=term.term,
                    wrong_expansion_found=wrong_expansion,
                    suggested_fix=suggested_output,
                )
            )
        elif word_count <= 2 and term_lower in words:
            # Very short query with the term as a primary word - truly ambiguous
            issues.append(
                Issue(
                    line_number=line_num,
                    input_text=input_text,
                    output_text=_truncate_output(output_text),
                    issue_type="ambiguous_term",
                    technical_term=term.term,
                    wrong_expansion_found=wrong_expansion,
                    suggested_fix=None,
                )
            )

    return issues
668
+
669
+
670
def analyze_dataset(filepath: Path) -> AnalysisResult:
    """Analyze the entire dataset for issues.

    Reads a JSONL file, normalizes each example's output, runs
    analyze_example on it, and tallies how often each known technical term
    appears in the inputs. Unparseable lines are reported and skipped.

    Args:
        filepath: Path to the JSONL dataset.

    Returns:
        An AnalysisResult with totals, issues, and term statistics.
    """
    result = AnalysisResult()

    with open(filepath, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue

            # Keep the try minimal: json.loads is the only statement here
            # that can raise JSONDecodeError.
            try:
                example = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Warning: Could not parse line {line_num}: {e}")
                continue

            input_text = example.get("query", "") or example.get("input", "")
            output_raw = example.get("output", [])
            if isinstance(output_raw, str):
                output_items = normalize_output_items(parse_output_text(output_raw))
            else:
                output_items = normalize_output_items(output_raw)
            output_text = output_items_to_text(output_items)

            result.total_examples += 1

            # Analyze for issues
            result.issues_found.extend(analyze_example(line_num, input_text, output_text))

            # Track term statistics. Whole-word match: the previous substring
            # test counted e.g. "go" inside "good" as an occurrence.
            input_lower = input_text.lower()
            for term in KNOWN_TECHNICAL_TERMS:
                if re.search(rf"\b{re.escape(term.term.lower())}\b", input_lower):
                    result.term_statistics[term.term] += 1

    return result
705
+
706
+
707
def fix_example(example: dict, issues: list[Issue]) -> Optional[dict]:
    """
    Attempt to fix an example based on identified issues.
    Returns None if no fix is needed or possible.
    """
    # Only definite tech-context mistakes carrying a concrete replacement
    # are auto-fixable; take the first such issue (they should be similar).
    issue = next(
        (
            candidate
            for candidate in issues
            if candidate.issue_type == "wrong_tech_expansion"
            and candidate.suggested_fix
        ),
        None,
    )
    if issue is None or not issue.suggested_fix:
        return None

    repaired = example.copy()
    repaired["output"] = normalize_output_items(parse_output_text(issue.suggested_fix))
    repaired["_fixed"] = True

    # Preserve the original output (normalized) for auditing.
    original_items = example.get("output", [])
    if isinstance(original_items, str):
        original_items = normalize_output_items(parse_output_text(original_items))
    repaired["_original_output"] = output_items_to_text(original_items)
    repaired["_fix_reason"] = (
        f"Technical term '{issue.technical_term}' was incorrectly expanded as '{issue.wrong_expansion_found}'"
    )

    return repaired
738
+
739
+
740
def generate_report(result: AnalysisResult) -> str:
    """Generate a human-readable report of the analysis."""
    heavy_rule = "=" * 70
    light_rule = "-" * 70

    out: list[str] = [
        heavy_rule,
        "QUERY EXPANSION DATASET QUALITY REPORT",
        heavy_rule,
        "",
        f"Total examples analyzed: {result.total_examples}",
        f"Issues found: {len(result.issues_found)}",
        "",
        light_rule,
        "ISSUES BY TYPE:",
        light_rule,
    ]

    # Group issues by type, preserving first-seen order.
    grouped = defaultdict(list)
    for found in result.issues_found:
        grouped[found.issue_type].append(found)

    for issue_type, bucket in grouped.items():
        out.append(f"\n{issue_type.upper()}: {len(bucket)} issues")
        out.append("-" * 40)

        # Show up to 10 examples per type
        for found in bucket[:10]:
            out.append(f"\n  Line {found.line_number}:")
            out.append(f"    Input: {found.input_text}")
            out.append(f"    Technical term: '{found.technical_term}'")
            out.append(f"    Wrong expansion found: '{found.wrong_expansion_found}'")
            if found.suggested_fix:
                out.append("    Suggested fix available: Yes")

        if len(bucket) > 10:
            out.append(f"\n  ... and {len(bucket) - 10} more")

    # Term statistics, most frequent first.
    out.append("\n" + light_rule)
    out.append("TECHNICAL TERM OCCURRENCES IN DATASET:")
    out.append(light_rule)
    for term_name, count in sorted(result.term_statistics.items(), key=lambda kv: -kv[1]):
        if count > 0:
            out.append(f"  {term_name}: {count} occurrences")

    out.append("\n" + heavy_rule)

    return "\n".join(out)
788
+
789
+
790
def save_cleaned_dataset(filepath: Path, output_path: Path, result: AnalysisResult):
    """Save a cleaned version of the dataset."""
    # Index issues by their source line for O(1) lookup during the rewrite.
    issues_by_line = defaultdict(list)
    for found in result.issues_found:
        issues_by_line[found.line_number].append(found)

    fixed_count = 0
    flagged_count = 0

    with (
        open(filepath, "r", encoding="utf-8") as src,
        open(output_path, "w", encoding="utf-8") as dst,
    ):
        for line_num, raw in enumerate(src, 1):
            raw = raw.strip()
            if not raw:
                continue

            try:
                example = json.loads(raw)
                # Normalize the legacy "input" field name to "query".
                if "query" not in example and "input" in example:
                    example["query"] = example.pop("input")

                output_raw = example.get("output", [])
                if isinstance(output_raw, str):
                    example["output"] = normalize_output_items(
                        parse_output_text(output_raw)
                    )
                else:
                    example["output"] = normalize_output_items(output_raw)

                line_issues = issues_by_line.get(line_num)
                if not line_issues:
                    dst.write(json.dumps(example) + "\n")
                elif (fixed := fix_example(example, line_issues)) is not None:
                    dst.write(json.dumps(fixed) + "\n")
                    fixed_count += 1
                else:
                    # Flag but don't fix ambiguous cases
                    example["_flagged"] = True
                    example["_flag_reason"] = (
                        f"Ambiguous term '{line_issues[0].technical_term}' may need review"
                    )
                    dst.write(json.dumps(example) + "\n")
                    flagged_count += 1

            except json.JSONDecodeError:
                # Keep problematic lines as-is
                dst.write(raw + "\n")

    return fixed_count, flagged_count
844
+
845
+
846
def main():
    """Main entry point."""
    # All inputs/outputs live next to this script under data/.
    data_dir = Path(__file__).parent / "data"
    input_path = data_dir / "qmd_expansion.jsonl"
    output_path = data_dir / "qmd_expansion_cleaned.jsonl"
    report_path = data_dir / "quality_report.txt"

    print(f"Analyzing dataset: {input_path}")
    print("-" * 50)

    if not input_path.exists():
        print(f"Error: Input file not found: {input_path}")
        return 1

    # Analyze, report, then write the cleaned dataset.
    result = analyze_dataset(input_path)

    report = generate_report(result)
    print(report)

    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report)
    print(f"\nReport saved to: {report_path}")

    fixed_count, flagged_count = save_cleaned_dataset(input_path, output_path, result)

    print(f"\nCleaned dataset saved to: {output_path}")
    print(f"  - Examples fixed: {fixed_count}")
    print(f"  - Examples flagged for review: {flagged_count}")
    print(
        f"  - Examples unchanged: {result.total_examples - fixed_count - flagged_count}"
    )

    # Summary statistics
    banner = "=" * 50
    print("\n" + banner)
    print("SUMMARY")
    print(banner)
    print(f"Total examples: {result.total_examples}")
    print(f"Total issues found: {len(result.issues_found)}")

    tech_count = sum(
        1 for found in result.issues_found if found.issue_type == "wrong_tech_expansion"
    )
    ambig_count = sum(
        1 for found in result.issues_found if found.issue_type == "ambiguous_term"
    )

    print(f"  - Definite tech term errors: {tech_count}")
    print(f"  - Ambiguous terms needing review: {ambig_count}")

    if result.issues_found:
        error_rate = len(result.issues_found) / result.total_examples * 100
        print(f"\nError rate: {error_rate:.2f}%")

    return 0
903
+
904
+
905
if __name__ == "__main__":
    # `exit()` is an interactive-session helper injected by the `site` module
    # and is not guaranteed in all environments (e.g. `python -S`, frozen
    # apps); raising SystemExit is the portable equivalent.
    raise SystemExit(main())