PyPI - rlm-code - Versions diffs - 0.1.7__tar.gz → 0.1.8__tar.gz - Mend

rlm-code 0.1.7 (tar.gz) → 0.1.8 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (305)

{rlm_code-0.1.7 → rlm_code-0.1.8}/CHANGELOG.md RENAMED Viewed

@@ -5,6 +5,13 @@ All notable changes to this project are documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [0.1.8] - 2026-05-01
+### Added
+- AHE-style layered trace evidence corpus export from `TraceStore`.
+- New `trace_analysis` action `export_evidence_corpus` for writing `overview.md`, per-trace detail reports, `index.json`, and optional processed raw JSONL spans.
+- Evidence corpus tests covering direct store export and environment action export.
 ## [0.1.7] - 2026-04-30
 ### Added
@@ -69,4 +76,5 @@ Initial public release of **RLM Code**.
 [0.1.5]: https://github.com/SuperagenticAI/rlm-code/releases/tag/v0.1.5
 [0.1.6]: https://github.com/SuperagenticAI/rlm-code/releases/tag/v0.1.6
+[0.1.8]: https://github.com/SuperagenticAI/rlm-code/releases/tag/v0.1.8
 [0.1.7]: https://github.com/SuperagenticAI/rlm-code/releases/tag/v0.1.7

{rlm_code-0.1.7 → rlm_code-0.1.8}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rlm-code
-Version: 0.1.7
+Version: 0.1.8
 Summary: RLM Code: Research Playground & Evaluation OS for Recursive Language Model Agentic Systems
 Project-URL: Homepage, https://github.com/SuperagenticAI/rlm-code
 Project-URL: Documentation, https://superagenticai.github.io/rlm-code/
@@ -118,12 +118,13 @@ RLM Code implements the [Recursive Language Models](https://arxiv.org/abs/2502.0
 RLM Code wraps this algorithm in an interactive terminal UI with built-in benchmarks, trajectory replay, and observability.
-## Release v0.1.7
+## Release v0.1.8
-This release adds HALO-style trace analysis as a new RLM environment.
+This release extends HALO/AHE-style trace analysis with layered evidence export.
 - New `trace_analysis` environment for diagnosing agent harness failures from OTel-shaped JSONL traces
 - Sidecar trace indexing with dataset overview, query, count, search, full-trace view, and selected-span view actions
+- AHE-style evidence corpus export with `overview.md`, per-trace detail reports, `index.json`, and optional processed raw JSONL spans
 - Bounded payload handling for large traces, including oversized summaries and higher-cap surgical span reads
 - `/rlm` help/docs updated for `env=trace_analysis`
 - Dedicated trace analysis docs under the Core Engine section

{rlm_code-0.1.7 → rlm_code-0.1.8}/README.md RENAMED Viewed

@@ -25,12 +25,13 @@ RLM Code implements the [Recursive Language Models](https://arxiv.org/abs/2502.0
 RLM Code wraps this algorithm in an interactive terminal UI with built-in benchmarks, trajectory replay, and observability.
-## Release v0.1.7
+## Release v0.1.8
-This release adds HALO-style trace analysis as a new RLM environment.
+This release extends HALO/AHE-style trace analysis with layered evidence export.
 - New `trace_analysis` environment for diagnosing agent harness failures from OTel-shaped JSONL traces
 - Sidecar trace indexing with dataset overview, query, count, search, full-trace view, and selected-span view actions
+- AHE-style evidence corpus export with `overview.md`, per-trace detail reports, `index.json`, and optional processed raw JSONL spans
 - Bounded payload handling for large traces, including oversized summaries and higher-cap surgical span reads
 - `/rlm` help/docs updated for `env=trace_analysis`
 - Dedicated trace analysis docs under the Core Engine section

{rlm_code-0.1.7 → rlm_code-0.1.8}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "rlm-code"
-version = "0.1.7"
+version = "0.1.8"
 description = "RLM Code: Research Playground & Evaluation OS for Recursive Language Model Agentic Systems"
 readme = "README.md"
 license = "Apache-2.0"

{rlm_code-0.1.7 → rlm_code-0.1.8}/rlm_code/__init__.py RENAMED Viewed

@@ -5,5 +5,5 @@ This package provides tools for creating, managing, and optimizing DSPy componen
 through natural language interactions.
 """
-__version__ = "0.1.7"
+__version__ = "0.1.8"
 __author__ = "Super Agentic AI"

{rlm_code-0.1.7 → rlm_code-0.1.8}/rlm_code/mcp/__init__.py RENAMED Viewed

@@ -17,7 +17,7 @@ from .exceptions import (
 )
 from .session_wrapper import MCPSessionWrapper
-__version__ = "0.1.7"
+__version__ = "0.1.8"
 __all__ = [
     "MCPClientManager",

{rlm_code-0.1.7 → rlm_code-0.1.8}/rlm_code/rlm/environments.py RENAMED Viewed

@@ -306,8 +306,10 @@ class TraceAnalysisEnvironment(GenericRLMEnvironment):
             "Return ONLY valid JSON object with keys:\n"
             "{"
             '"action": "set_trace_path" | "get_dataset_overview" | "query_traces" | '
-            '"count_traces" | "view_trace" | "search_trace" | "view_spans" | "final", '
+            '"count_traces" | "view_trace" | "search_trace" | "view_spans" | '
+            '"export_evidence_corpus" | "final", '
             '"trace_path": "<path to JSONL traces>", '
+            '"output_dir": "<directory for exported evidence corpus>", '
             '"filters": {"has_errors": true, "model_names": ["..."], "service_names": ["..."], '
             '"agent_names": ["..."], "project_id": "..."}, '
             '"trace_id": "<trace id>", '
@@ -324,6 +326,7 @@ class TraceAnalysisEnvironment(GenericRLMEnvironment):
             "- Always begin analysis with get_dataset_overview.\n"
             "- Use query_traces to choose real trace ids; never invent trace ids.\n"
             "- For large traces, prefer search_trace followed by view_spans.\n"
+            "- Use export_evidence_corpus when the caller needs files for MetaHarness or another coding agent.\n"
             "- Identify systemic harness failures, not one-off anomalies.\n"
             "- Output JSON only."
         )
@@ -448,6 +451,21 @@ class TraceAnalysisEnvironment(GenericRLMEnvironment):
                     reward=0.7,
                     memory_note=f"Viewed selected spans for trace {trace_id}.",
                 )
+            if action_name == "export_evidence_corpus":
+                output_dir = self._required_str(action, "output_dir")
+                resolved_output = Path(output_dir).expanduser()
+                if not resolved_output.is_absolute():
+                    resolved_output = self.workdir / resolved_output
+                return self._ok(
+                    observation=store.export_evidence_corpus(
+                        resolved_output,
+                        filters,
+                        limit=self._int_arg(action, "limit", 100, minimum=1, maximum=1000),
+                        include_raw=self._bool_arg(action, "include_raw", True),
+                    ),
+                    reward=0.75,
+                    memory_note="Exported layered trace evidence corpus.",
+                )
         except Exception as exc:
             return EnvironmentActionResult(
                 observation={"success": False, "error": f"{type(exc).__name__}: {exc}"},
@@ -530,6 +548,19 @@ class TraceAnalysisEnvironment(GenericRLMEnvironment):
             parsed = default
         return max(minimum, min(maximum, parsed))
+    @staticmethod
+    def _bool_arg(action: dict[str, Any], key: str, default: bool) -> bool:
+        value = action.get(key, default)
+        if isinstance(value, bool):
+            return value
+        if isinstance(value, str):
+            normalized = value.strip().lower()
+            if normalized in {"1", "true", "yes", "on"}:
+                return True
+            if normalized in {"0", "false", "no", "off"}:
+                return False
+        return default
 class DSPyCodingRLMEnvironment(GenericRLMEnvironment):
     """DSPy-focused environment with file edit + tests + DSPy-aware scoring."""

{rlm_code-0.1.7 → rlm_code-0.1.8}/rlm_code/traces/store.py RENAMED Viewed

@@ -5,6 +5,7 @@ from __future__ import annotations
 import json
 import re
 from collections import Counter
+from datetime import UTC, datetime
 from pathlib import Path
 from typing import Any
@@ -16,6 +17,22 @@ SURGICAL_ATTR_CAP = 16384
 VIEW_TRACE_CHAR_BUDGET = 150_000
 OVERVIEW_SAMPLE_TRACE_IDS = 20
 NOISY_FLAT_PROJECTION_RE = re.compile(r"^(?:llm\.(?:input|output)_messages|mcp\.tools)\.\d+\.")
+EVIDENCE_ATTR_CAP = 2048
+TASK_ID_ATTRS = (
+    "inference.task_id",
+    "task_id",
+    "task.id",
+    "benchmark.task_id",
+    "appworld.task_id",
+)
+ISSUE_ATTRS = (
+    "error.message",
+    "exception.message",
+    "exception.type",
+    "tool.name",
+    "input.value",
+    "output.value",
+)
 def _truncate_value(value: Any, cap: int) -> Any:
@@ -168,6 +185,87 @@ class TraceStore:
             "truncated": len(matches) >= limit,
         }
+    def export_evidence_corpus(
+        self,
+        output_dir: str | Path,
+        filters: dict[str, Any] | None = None,
+        *,
+        limit: int = 100,
+        include_raw: bool = True,
+    ) -> dict[str, Any]:
+        """Export a layered evidence corpus for harness-optimization agents.
+        The corpus mirrors the AHE progressive-disclosure pattern:
+        a compact overview, one detail file per selected trace, an index, and
+        optional lightly processed raw JSONL spans for drill-down.
+        """
+        out = Path(output_dir).resolve()
+        detail_dir = out / "detail"
+        raw_dir = out / "raw"
+        detail_dir.mkdir(parents=True, exist_ok=True)
+        if include_raw:
+            raw_dir.mkdir(parents=True, exist_ok=True)
+        rows = self._filtered_rows(filters)[: max(0, limit)]
+        overview = self.get_overview(filters)
+        detail_entries: list[dict[str, Any]] = []
+        detail_lines = self._render_overview_markdown(overview, rows, include_raw=include_raw)
+        for row in rows:
+            spans = self._read_spans(row.trace_id)
+            safe_id = self._safe_filename(row.trace_id)
+            detail_path = detail_dir / f"{safe_id}.md"
+            raw_path = raw_dir / f"{safe_id}.jsonl" if include_raw else None
+            detail_path.write_text(
+                self._render_detail_markdown(row, spans, raw_path=raw_path),
+                encoding="utf-8",
+            )
+            if raw_path is not None:
+                self._write_raw_trace(raw_path, spans)
+            detail_entries.append(
+                {
+                    "trace_id": row.trace_id,
+                    "detail_path": str(detail_path),
+                    "raw_path": str(raw_path) if raw_path is not None else None,
+                    "has_errors": row.has_errors,
+                    "span_count": row.span_count,
+                    "task_ids": self._task_ids(spans),
+                    "error_span_count": sum(1 for span in spans if span.status_code == "STATUS_CODE_ERROR"),
+                }
+            )
+            detail_lines.append(
+                f"- `{row.trace_id}`: {row.span_count} spans, "
+                f"errors={'yes' if row.has_errors else 'no'}, detail=`detail/{safe_id}.md`"
+            )
+        overview_path = out / "overview.md"
+        index_path = out / "index.json"
+        overview_path.write_text("\n".join(detail_lines) + "\n", encoding="utf-8")
+        index_payload = {
+            "schema_version": "rlm-code.trace_evidence_corpus.v1",
+            "created_at": datetime.now(UTC).isoformat(),
+            "source_trace_path": str(self.trace_path),
+            "source_index_path": str(self.index_path),
+            "filters": filters or {},
+            "limit": limit,
+            "include_raw": include_raw,
+            "overview_path": str(overview_path),
+            "detail_dir": str(detail_dir),
+            "raw_dir": str(raw_dir) if include_raw else None,
+            "overview": overview,
+            "traces": detail_entries,
+        }
+        index_path.write_text(json.dumps(index_payload, indent=2, sort_keys=True), encoding="utf-8")
+        return {
+            "output_dir": str(out),
+            "overview_path": str(overview_path),
+            "index_path": str(index_path),
+            "detail_dir": str(detail_dir),
+            "raw_dir": str(raw_dir) if include_raw else None,
+            "trace_count": len(detail_entries),
+            "detail_paths": [entry["detail_path"] for entry in detail_entries],
+        }
     def _read_spans(self, trace_id: str) -> list[SpanRecord]:
         if trace_id not in self.rows_by_id:
             raise KeyError(trace_id)
@@ -219,3 +317,131 @@ class TraceStore:
             "total_output_tokens": row.total_output_tokens,
             "project_id": row.project_id,
         }
+    @staticmethod
+    def _render_overview_markdown(
+        overview: dict[str, Any],
+        rows: list[TraceIndexRow],
+        *,
+        include_raw: bool,
+    ) -> list[str]:
+        lines = [
+            "# Trace Evidence Overview",
+            "",
+            "Generated by `rlm-code` trace analysis.",
+            "",
+            "## Dataset",
+            "",
+            f"- Traces selected: {len(rows)}",
+            f"- Total matching traces: {overview['total_traces']}",
+            f"- Total matching spans: {overview['total_spans']}",
+            f"- Error traces: {overview['error_trace_count']}",
+            f"- Services: {', '.join(overview['service_names']) or '-'}",
+            f"- Models: {', '.join(overview['model_names']) or '-'}",
+            f"- Agents: {', '.join(overview['agent_names']) or '-'}",
+            f"- Input tokens: {overview['total_input_tokens']}",
+            f"- Output tokens: {overview['total_output_tokens']}",
+            f"- Raw span files included: {'yes' if include_raw else 'no'}",
+            "",
+            "## Trace Details",
+            "",
+        ]
+        return lines
+    def _render_detail_markdown(
+        self,
+        row: TraceIndexRow,
+        spans: list[SpanRecord],
+        *,
+        raw_path: Path | None,
+    ) -> str:
+        task_ids = self._task_ids(spans)
+        error_spans = [span for span in spans if span.status_code == "STATUS_CODE_ERROR"]
+        tool_spans = [span for span in spans if self._looks_like_tool_span(span)]
+        top_names = Counter(span.name for span in spans).most_common(10)
+        lines = [
+            f"# Trace Detail: {row.trace_id}",
+            "",
+            "## Summary",
+            "",
+            f"- Trace id: `{row.trace_id}`",
+            f"- Spans: {row.span_count}",
+            f"- Has errors: {'yes' if row.has_errors else 'no'}",
+            f"- Error spans: {len(error_spans)}",
+            f"- Task ids: {', '.join(task_ids) or '-'}",
+            f"- Services: {', '.join(row.service_names) or '-'}",
+            f"- Models: {', '.join(row.model_names) or '-'}",
+            f"- Agents: {', '.join(row.agent_names) or '-'}",
+            f"- Start: {row.start_time or '-'}",
+            f"- End: {row.end_time or '-'}",
+        ]
+        if raw_path is not None:
+            lines.append(f"- Raw spans: `{raw_path.name}`")
+        lines.extend(["", "## Span Name Counts", ""])
+        lines.extend(f"- `{name}`: {count}" for name, count in top_names)
+        lines.extend(["", "## Error Spans", ""])
+        if error_spans:
+            for span in error_spans:
+                lines.extend(self._render_span_evidence(span))
+        else:
+            lines.append("- None")
+        lines.extend(["", "## Tool-Like Spans", ""])
+        if tool_spans:
+            for span in tool_spans[:20]:
+                lines.extend(self._render_span_evidence(span))
+        else:
+            lines.append("- None")
+        return "\n".join(lines) + "\n"
+    @staticmethod
+    def _render_span_evidence(span: SpanRecord) -> list[str]:
+        lines = [
+            f"### `{span.name or span.span_id}`",
+            "",
+            f"- Span id: `{span.span_id}`",
+            f"- Parent span id: `{span.parent_span_id or '-'}`",
+            f"- Status: {span.status_code}",
+        ]
+        attrs = {
+            key: _truncate_value(span.attributes[key], EVIDENCE_ATTR_CAP)
+            for key in ISSUE_ATTRS
+            if key in span.attributes
+        }
+        if attrs:
+            lines.append("- Evidence attributes:")
+            for key, value in attrs.items():
+                lines.append(f"  - `{key}`: `{value}`")
+        return lines + [""]
+    @staticmethod
+    def _write_raw_trace(path: Path, spans: list[SpanRecord]) -> None:
+        with path.open("w", encoding="utf-8") as handle:
+            for span in spans:
+                handle.write(json.dumps(_render_span(span, SURGICAL_ATTR_CAP), sort_keys=True))
+                handle.write("\n")
+    @staticmethod
+    def _task_ids(spans: list[SpanRecord]) -> list[str]:
+        task_ids: set[str] = set()
+        for span in spans:
+            for key in TASK_ID_ATTRS:
+                value = span.attributes.get(key)
+                if isinstance(value, str) and value.strip():
+                    task_ids.add(value.strip())
+        return sorted(task_ids)
+    @staticmethod
+    def _looks_like_tool_span(span: SpanRecord) -> bool:
+        name = span.name.lower()
+        return (
+            "tool" in name
+            or "function" in name
+            or "tool.name" in span.attributes
+            or "input.value" in span.attributes
+            or "output.value" in span.attributes
+        )
+    @staticmethod
+    def _safe_filename(value: str) -> str:
+        safe = re.sub(r"[^A-Za-z0-9_.-]+", "_", value).strip("._")
+        return safe or "trace"

{rlm_code-0.1.7 → rlm_code-0.1.8}/tests/test_trace_analysis.py RENAMED Viewed

@@ -23,6 +23,7 @@ def _write_trace_fixture(path: Path) -> None:
                 "inference.llm.model_name": "gpt-test",
                 "inference.llm.input_tokens": 10,
                 "inference.llm.output_tokens": 5,
+                "inference.task_id": "task-ok",
             },
         },
         {
@@ -39,6 +40,7 @@ def _write_trace_fixture(path: Path) -> None:
                 "inference.project_id": "demo",
                 "inference.agent_name": "Root",
                 "inference.llm.model_name": "gpt-test",
+                "inference.task_id": "task-error",
                 "error.message": "hallucinated tool call spotify__login",
             },
         },
@@ -53,6 +55,7 @@ def _write_trace_fixture(path: Path) -> None:
             "status": {"code": "STATUS_CODE_ERROR"},
             "resource": {"attributes": {"service.name": "demo-agent"}},
             "attributes": {
+                "inference.task_id": "task-error",
                 "tool.name": "spotify__login",
                 "input.value": "{\"extra_argument\": true}",
                 "output.value": "Unknown tool argument: extra_argument",
@@ -84,6 +87,19 @@ def test_trace_store_indexes_and_queries_jsonl(tmp_path: Path) -> None:
     selected = store.view_spans("trace-error", ["span-tool-error"])
     assert selected["spans"][0]["name"] == "function.spotify__login"
+    exported = store.export_evidence_corpus(tmp_path / "evidence", {"has_errors": True})
+    assert exported["trace_count"] == 1
+    overview_text = (tmp_path / "evidence" / "overview.md").read_text(encoding="utf-8")
+    assert "Trace Evidence Overview" in overview_text
+    assert "`trace-error`" in overview_text
+    detail_text = (tmp_path / "evidence" / "detail" / "trace-error.md").read_text(encoding="utf-8")
+    assert "task-error" in detail_text
+    assert "spotify__login" in detail_text
+    assert (tmp_path / "evidence" / "raw" / "trace-error.jsonl").exists()
+    index_data = json.loads((tmp_path / "evidence" / "index.json").read_text(encoding="utf-8"))
+    assert index_data["schema_version"] == "rlm-code.trace_evidence_corpus.v1"
+    assert index_data["traces"][0]["task_ids"] == ["task-error"]
 def test_trace_analysis_environment_actions(tmp_path: Path) -> None:
     trace_path = tmp_path / "traces.jsonl"
@@ -113,3 +129,16 @@ def test_trace_analysis_environment_actions(tmp_path: Path) -> None:
     )
     assert searched.observation["success"] is True
     assert searched.observation["match_count"] == 1
+    exported = env.execute_action(
+        {
+            "action": "export_evidence_corpus",
+            "output_dir": "trace-evidence",
+            "filters": {"has_errors": True},
+        },
+        execution_engine=None,
+        exec_timeout=1,
+    )
+    assert exported.observation["success"] is True
+    assert exported.observation["trace_count"] == 1
+    assert (tmp_path / "trace-evidence" / "overview.md").exists()