PyPI - codeprobe - Versions diffs - 0.3.0__tar.gz → 0.3.1__tar.gz - Mend

codeprobe 0.3.0__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (173) hide show

{codeprobe-0.3.0 → codeprobe-0.3.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: codeprobe
-Version: 0.3.0
+Version: 0.3.1
 Summary: Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results.
 Author: codeprobe contributors
 License-Expression: Apache-2.0
@@ -38,11 +38,11 @@ Dynamic: license-file
 Benchmark AI coding agents against **your own codebase**.
-Mine real tasks from your repo history, run agents against them, and find out which setup actually works best for YOUR code — not someone else's benchmark suite.
+Mine real tasks from your repo history, run agents against them, and find out which setup actually works best for **your** code, not someone else's benchmark suite.
 ## Why codeprobe?
-Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate.
+Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data, and, as general public benchmarks, they likely don't capture what is most important to your unique workflows. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate. You can also point the tool at any public repo and mine tasks from it.
 ## Prerequisites

{codeprobe-0.3.0 → codeprobe-0.3.1}/README.md RENAMED Viewed

@@ -2,11 +2,11 @@
 Benchmark AI coding agents against **your own codebase**.
-Mine real tasks from your repo history, run agents against them, and find out which setup actually works best for YOUR code — not someone else's benchmark suite.
+Mine real tasks from your repo history, run agents against them, and find out which setup actually works best for **your** code, not someone else's benchmark suite.
 ## Why codeprobe?
-Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate.
+Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data, and, as general public benchmarks, they likely don't capture what is most important to your unique workflows. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate. You can also point the tool at any public repo and mine tasks from it.
 ## Prerequisites

{codeprobe-0.3.0 → codeprobe-0.3.1}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "codeprobe"
-version = "0.3.0"
+version = "0.3.1"
 description = "Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results."
 readme = "README.md"
 license = "Apache-2.0"
@@ -47,11 +47,9 @@ dev = [
 codeprobe = "codeprobe.cli:main"
 [project.entry-points."codeprobe.agents"]
-aider = "codeprobe.adapters.aider:AiderAdapter"
 claude = "codeprobe.adapters.claude:ClaudeAdapter"
 codex = "codeprobe.adapters.codex:CodexAdapter"
 copilot = "codeprobe.adapters.copilot:CopilotAdapter"
-openai = "codeprobe.adapters.openai_compat:OpenAICompatAdapter"
 [project.entry-points."codeprobe.sessions"]
 claude = "codeprobe.adapters.session:ClaudeSessionCollector"

{codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/__init__.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """codeprobe — Benchmark AI coding agents against your own codebase."""
-__version__ = "0.3.0"
+__version__ = "0.3.1"

{codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/assess/heuristics.py RENAMED Viewed

@@ -142,7 +142,12 @@ def _run_git(args: list[str], cwd: Path) -> str:
             timeout=30,
         )
         if result.returncode != 0:
-            logger.debug("git %s exited %d: %s", " ".join(args), result.returncode, result.stderr.strip())
+            logger.debug(
+                "git %s exited %d: %s",
+                " ".join(args),
+                result.returncode,
+                result.stderr.strip(),
+            )
             return ""
         return result.stdout.strip()
     except (subprocess.TimeoutExpired, OSError) as exc:
@@ -307,7 +312,9 @@ def gather_heuristics(repo_path: Path) -> RepoHeuristics:
     history, CI presence, test coverage, languages, and activity.
     """
     total_commits_str = _run_git(["rev-list", "--count", "HEAD"], cwd=repo_path)
-    merge_commits_str = _run_git(["rev-list", "--merges", "--count", "HEAD"], cwd=repo_path)
+    merge_commits_str = _run_git(
+        ["rev-list", "--merges", "--count", "HEAD"], cwd=repo_path
+    )
     contributors_str = _run_git(["shortlog", "-sn", "HEAD"], cwd=repo_path)
     file_list = _run_git(["ls-files"], cwd=repo_path)
@@ -354,7 +361,10 @@ def score_repo_heuristic(heuristics: RepoHeuristics) -> AssessmentScore:
     has_ci = heuristics.has_ci
     has_fw = len(heuristics.test_frameworks) > 0
     if has_tests and has_ci and has_fw:
-        tc_score, tc_reason = 1.0, f"Tests + CI + framework ({', '.join(heuristics.test_frameworks)})"
+        tc_score, tc_reason = (
+            1.0,
+            f"Tests + CI + framework ({', '.join(heuristics.test_frameworks)})",
+        )
     elif has_tests and (has_ci or has_fw):
         tc_score, tc_reason = 0.7, "Tests present with partial CI/framework support"
     elif has_tests:
@@ -409,15 +419,29 @@ def score_repo_heuristic(heuristics: RepoHeuristics) -> AssessmentScore:
         DimensionScore(name="ci_maturity", score=ci_score, reasoning=ci_reason),
     )
-    # Equal weights for heuristic path (model path lets the model weight them).
-    overall = sum(d.score for d in dimensions) / len(dimensions)
+    # Weighted average — ci_maturity is a weak signal because CI configs are
+    # often absent in shallow clones / Sourcegraph views, and codeprobe
+    # validates via mined test.sh scripts, not CI pipelines.
+    _WEIGHTS: dict[str, float] = {
+        "task_richness": 0.25,
+        "test_coverage": 0.25,
+        "complexity": 0.20,
+        "activity": 0.15,
+        "documentation": 0.10,
+        "ci_maturity": 0.05,
+    }
+    overall = sum(d.score * _WEIGHTS[d.name] for d in dimensions)
     if overall >= 0.7:
         recommendation = "Excellent benchmarking candidate — rich history with tests"
     elif overall >= 0.5:
-        recommendation = "Good candidate — may need more merge history for diverse tasks"
+        recommendation = (
+            "Good candidate — may need more merge history for diverse tasks"
+        )
     elif overall >= 0.3:
-        recommendation = "Fair candidate — limited test coverage may reduce task quality"
+        recommendation = (
+            "Fair candidate — limited test coverage may reduce task quality"
+        )
     else:
         recommendation = "Poor candidate — consider a repo with more history and tests"
@@ -458,11 +482,15 @@ def _parse_model_assessment(
         score_val = float(item.get("score", 0))
         score_val = max(0.0, min(1.0, score_val))
         reasoning = str(item.get("reasoning", ""))
-        dim_by_name[name] = DimensionScore(name=name, score=score_val, reasoning=reasoning)
+        dim_by_name[name] = DimensionScore(
+            name=name, score=score_val, reasoning=reasoning
+        )
     missing = set(RUBRIC_V1) - set(dim_by_name)
     if missing:
-        raise LLMParseError(f"Model response missing dimensions: {', '.join(sorted(missing))}")
+        raise LLMParseError(
+            f"Model response missing dimensions: {', '.join(sorted(missing))}"
+        )
     dimensions = tuple(dim_by_name[name] for name in RUBRIC_V1)
@@ -498,6 +526,11 @@ def score_repo_with_model(heuristics: RepoHeuristics) -> AssessmentScore:
         "You are evaluating a code repository's suitability for AI agent benchmarking.\n\n"
         f"Here are the raw repository statistics:\n{stats_json}\n\n"
         f"Score this repository on each of these dimensions (0.0 to 1.0):\n{rubric_list}\n\n"
+        "Weighting guidance for the overall score: task_richness and test_coverage "
+        "are the most important (~25% each), followed by complexity (~20%), "
+        "activity (~15%), documentation (~10%). ci_maturity should be a minor "
+        "signal (~5%) because CI configs are often absent in cloned repos and "
+        "codeprobe validates via mined test scripts, not CI pipelines.\n\n"
         "Respond with ONLY valid JSON matching this exact schema:\n"
         "{\n"
         '  "overall": <float 0.0-1.0>,\n'

{codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/cli/__init__.py RENAMED Viewed

@@ -111,6 +111,14 @@ def init(path: str) -> None:
     default=None,
     help="Apply a named preset: 'quick' (count=3) or 'mcp' (org-scale + MCP families).",
 )
+@click.option(
+    "--goal",
+    type=click.Choice(
+        ["quality", "navigation", "mcp", "general"], case_sensitive=False
+    ),
+    default=None,
+    help="Eval goal: quality, navigation, mcp, general. Skips interactive goal prompt.",
+)
 @click.option(
     "--profile",
     "profile_name",
@@ -241,6 +249,7 @@ def mine(
     ctx: click.Context,
     path: str,
     preset: str | None,
+    goal: str | None,
     profile_name: str | None,
     save_profile_name: str | None,
     list_profiles_flag: bool,
@@ -381,10 +390,12 @@ def mine(
         backends = _prof_val("backends", backends)  # type: ignore[assignment]
         interactive = _prof_val("interactive", interactive)  # type: ignore[assignment]
         preset = _prof_val("preset", preset)  # type: ignore[assignment]
+        goal = _prof_val("goal", goal)  # type: ignore[assignment]
     run_mine(
         path,
         preset=preset,
+        goal=goal,
         count=count,
         source=source,
         min_files=min_files,
@@ -463,6 +474,13 @@ def mine(
     default=False,
     help="Print the fully-resolved prompt for the first task and exit (no agent spawned).",
 )
+@click.option(
+    "--suite",
+    "suite_path",
+    default=None,
+    type=click.Path(exists=True),
+    help="Path to a suite.toml manifest to filter tasks by type, difficulty, and tags.",
+)
 @click.pass_context
 def run(
     ctx: click.Context,
@@ -478,6 +496,7 @@ def run(
     timeout: int | None,
     repeats: int | None,
     show_prompt: bool,
+    suite_path: str | None,
 ) -> None:
     """Run eval tasks against an AI coding agent.
@@ -510,6 +529,7 @@ def run(
         force_rich=force_rich,
         timeout=timeout,
         repeats=repeats if repeats is not None else 1,
+        suite_path=suite_path,
     )
@@ -690,3 +710,8 @@ main.add_command(preambles)
 from codeprobe.cli.doctor_cmd import doctor  # noqa: E402
 main.add_command(doctor)
+# Register the validate command
+from codeprobe.cli.validate_cmd import validate  # noqa: E402
+main.add_command(validate)

{codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/cli/doctor_cmd.py RENAMED Viewed

@@ -83,7 +83,6 @@ def run_checks() -> list[CheckResult]:
         _check_tool(
             "codex", "Install OpenAI Codex CLI: https://github.com/openai/codex"
         ),
-        _check_tool("aider", "Install aider: https://aider.chat/docs/install.html"),
         _check_env_key(
             "ANTHROPIC_API_KEY", "Set ANTHROPIC_API_KEY in your environment."
         ),

codeprobe 0.3.0__tar.gz → 0.3.1__tar.gz

codeprobe 0.3.0__tar.gz → 0.3.1__tar.gz