PyPI - memahead - Versions diffs - 0.1.0__py3-none-any.whl - Mend

memahead 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

memahead/__init__.py +72 -0
memahead/_embeddings.py +117 -0
memahead/compressor.py +310 -0
memahead/context.py +187 -0
memahead/plan.py +279 -0
memahead/scorer.py +171 -0
memahead/tool_filter.py +218 -0
memahead-0.1.0.dist-info/METADATA +147 -0
memahead-0.1.0.dist-info/RECORD +12 -0
memahead-0.1.0.dist-info/WHEEL +4 -0
memahead-0.1.0.dist-info/licenses/LICENSE +201 -0
memahead-0.1.0.dist-info/licenses/NOTICE +49 -0

memahead/__init__.py ADDED Viewed

@@ -0,0 +1,72 @@
+"""memahead — agent memory optimized for what's ahead.
+memahead compresses an LLM agent's context at each step of a multi-step
+workflow using *forward-looking* plan awareness. Instead of compressing
+greedily based on what already happened, memahead scores each chunk of context
+against the *remaining* steps of the plan and drops what future steps won't
+need — far fewer tokens per call without losing what matters downstream.
+It builds on Headroom (``pip install headroom-ai``) for the underlying
+compression mechanics and adds the plan-aware retention scoring layer on top.
+Academic foundations:
+    - PAACE: Yuksel et al., arXiv:2512.16970 (Dec 2025)
+    - ACON:  Kang et al., Microsoft, arXiv:2510.00615 (2025)
+Quick start::
+    from memahead import Plan, Step, PlanAwareCompressor
+    plan = Plan([
+        Step("research", "Search and gather raw facts about the topic"),
+        Step("synthesize", "Identify key themes across the research"),
+        Step("draft", "Write a structured first draft"),
+        Step("revise", "Produce the final polished output"),
+    ])
+    compressor = PlanAwareCompressor(quality=0.85)
+    compressed = compressor.compress(
+        history=prior_messages,
+        tools=all_tool_schemas,
+        plan=plan,
+        current_step="synthesize",
+    )
+    print(compressed.report)
+"""
+from __future__ import annotations
+from .compressor import PlanAwareCompressor
+from .context import (
+    CompressedContext,
+    DroppedChunk,
+    TokenReport,
+    count_tokens,
+)
+from .plan import Plan, PlanGraph, Step
+from .scorer import ChunkScore, RetentionScorer
+from .tool_filter import ToolFilter, ToolMatch, filter_tools
+__version__ = "0.1.0"
+__all__ = [
+    "__version__",
+    # plan
+    "Step",
+    "Plan",
+    "PlanGraph",
+    # scoring (core novelty)
+    "RetentionScorer",
+    "ChunkScore",
+    # tool filtering
+    "ToolFilter",
+    "ToolMatch",
+    "filter_tools",
+    # compression pipeline
+    "PlanAwareCompressor",
+    # result containers
+    "CompressedContext",
+    "TokenReport",
+    "DroppedChunk",
+    "count_tokens",
+]

memahead/_embeddings.py ADDED Viewed

@@ -0,0 +1,117 @@
+"""Internal embedding utilities shared by the scorer and the tool filter.
+This module isolates the (optional, heavyweight) ``sentence-transformers``
+dependency behind a tiny, swappable interface. Anything that produces a 2-D
+array of row vectors from a list of strings can be used as an *embedder*,
+which keeps the rest of the library testable without downloading a model.
+"""
+from __future__ import annotations
+from typing import Callable, List, Sequence, Union
+import numpy as np
+__all__ = [
+    "Embedder",
+    "SentenceTransformerEmbedder",
+    "default_embedder",
+    "resolve_embedder",
+    "cosine_similarity_matrix",
+]
+# An embedder is any callable that maps a list of texts to a (n, dim) matrix.
+Embedder = Callable[[Sequence[str]], np.ndarray]
+DEFAULT_MODEL = "all-MiniLM-L6-v2"
+class SentenceTransformerEmbedder:
+    """Callable embedder that defers loading its ``sentence-transformers`` model.
+    Constructing an instance only records the model name. The package import
+    and the model construction both happen the first time the instance is
+    called, and the loaded model is then reused for later calls.
+    Attributes:
+        model_name: Name passed to ``SentenceTransformer`` on first use.
+    """
+    def __init__(self, model_name: str = DEFAULT_MODEL) -> None:
+        self._model = None
+        self.model_name = model_name
+    def _ensure_model(self):
+        """Return the underlying model, importing and building it if needed.
+        Raises:
+            ImportError: If ``sentence_transformers`` cannot be imported.
+        """
+        if self._model is not None:
+            return self._model
+        try:
+            from sentence_transformers import SentenceTransformer
+        except ImportError as exc:  # pragma: no cover - environment dependent
+            raise ImportError(
+                "sentence-transformers is required for the default embedder. "
+                "Install it with `pip install sentence-transformers`, or pass a "
+                "custom `embedder` callable to RetentionScorer / the tool filter."
+            ) from exc
+        self._model = SentenceTransformer(self.model_name)
+        return self._model
+    def __call__(self, texts: Sequence[str]) -> np.ndarray:
+        """Encode ``texts`` and return the vectors as a ``float32`` array."""
+        # Normalization is left off here; cosine_similarity_matrix normalizes.
+        encoded = self._ensure_model().encode(
+            list(texts),
+            convert_to_numpy=True,
+            normalize_embeddings=False,
+        )
+        return np.asarray(encoded, dtype=np.float32)
+# Process-wide cache so repeated scorers reuse the loaded weights.
+_DEFAULT_EMBEDDER: SentenceTransformerEmbedder | None = None
+def default_embedder(model_name: str = DEFAULT_MODEL) -> SentenceTransformerEmbedder:
+    """Return the shared default embedder for ``model_name``.
+    A single module-level instance is cached. It is reused while the requested
+    model name matches and replaced by a fresh instance when it does not.
+    """
+    global _DEFAULT_EMBEDDER
+    cached = _DEFAULT_EMBEDDER
+    if cached is not None and cached.model_name == model_name:
+        return cached
+    _DEFAULT_EMBEDDER = SentenceTransformerEmbedder(model_name)
+    return _DEFAULT_EMBEDDER
+def resolve_embedder(
+    embedder: Union[Embedder, "SentenceTransformerEmbedder", None],
+) -> Embedder:
+    """Normalize the many ways a caller can supply an embedder.
+    Accepts ``None`` (use the default model), any object exposing an
+    ``encode`` method (e.g. a raw ``SentenceTransformer``), or a plain callable.
+    Args:
+        embedder: The caller-supplied embedder, or ``None`` for the default.
+    Returns:
+        A callable mapping a sequence of texts to an array of vectors. For
+        ``encode``-style objects the result is coerced to ``float32``.
+    Raises:
+        TypeError: If ``embedder`` is not ``None``, has no callable ``encode``
+            attribute, and is not itself callable.
+    """
+    if embedder is None:
+        return default_embedder()
+    # Look for ``encode`` BEFORE testing ``callable``. Model objects are often
+    # callable for a different purpose (a raw ``SentenceTransformer`` is a
+    # torch module whose ``__call__`` runs ``forward``), so testing
+    # ``callable`` first would return the wrong entry point for them.
+    encode = getattr(embedder, "encode", None)
+    if callable(encode):
+        return lambda texts: np.asarray(encode(list(texts)), dtype=np.float32)
+    if callable(embedder):
+        return embedder
+    raise TypeError(
+        "embedder must be None, a callable, or expose an `encode` method; "
+        f"got {type(embedder)!r}"
+    )
+def _l2_normalize(matrix: np.ndarray, eps: float = 1e-12) -> np.ndarray:
+    """Scale every row of ``matrix`` to unit Euclidean length.
+    The input is converted to ``float32``; a 1-D input is treated as a single
+    row. Row norms are floored at ``eps`` so all-zero rows divide safely and
+    stay zero instead of producing NaNs.
+    """
+    rows = np.asarray(matrix, dtype=np.float32)
+    if rows.ndim == 1:
+        rows = rows[np.newaxis, :]
+    lengths = np.linalg.norm(rows, axis=1, keepdims=True)
+    safe_lengths = np.maximum(lengths, eps)
+    return rows / safe_lengths
+def cosine_similarity_matrix(a: np.ndarray, b: np.ndarray) -> np.ndarray:
+    """Compute all pairwise cosine similarities between two sets of vectors.
+    Both inputs are row-normalized here, so callers may pass raw embeddings.
+    Entry ``[i, j]`` of the result is the similarity between row ``i`` of
+    ``a`` (e.g. a context chunk) and row ``j`` of ``b`` (e.g. a remaining plan
+    step), giving a ``(len(a), len(b))`` matrix.
+    """
+    unit_a, unit_b = _l2_normalize(a), _l2_normalize(b)
+    return np.matmul(unit_a, unit_b.T)

memahead/compressor.py ADDED Viewed

@@ -0,0 +1,310 @@
+"""The plan-aware compression pipeline.
+:class:`PlanAwareCompressor` ties the pieces together:
+    history + tools + plan + current_step
+        -> split history into chunks
+        -> score chunks against the *remaining* plan steps (RetentionScorer)
+        -> drop chunks future steps won't need (plan-aware retention)
+        -> filter tool schemas to the current step (tool_filter, no LLM)
+        -> hand survivors to Headroom for the actual compression mechanics
+        -> return a CompressedContext (+ TokenReport)
+memahead owns the *retention policy*; Headroom owns the *compression
+mechanics*. If Headroom is not installed the pipeline still works — it simply
+skips the mechanical compression step and relies on retention alone.
+"""
+from __future__ import annotations
+from typing import Any, Dict, List, Optional, Sequence, Union
+from .context import (
+    CompressedContext,
+    DroppedChunk,
+    TokenReport,
+    _message_text,
+    count_message_tokens,
+    count_tool_tokens,
+    count_tokens,
+)
+from .plan import Plan, PlanGraph, Step
+from .scorer import RetentionScorer
+from .tool_filter import ToolFilter
+__all__ = ["PlanAwareCompressor"]
+PlanLike = Union[Plan, PlanGraph]
+class PlanAwareCompressor:
+    """Compress agent context using forward-looking, plan-aware retention.
+    Args:
+        quality: Information-retention dial in ``[0.0, 1.0]``. Higher keeps
+            more context (gentler compression); lower is more aggressive.
+            Defaults to ``0.85``.
+        retention_threshold: Optional absolute score cutoff in ``[0.0, 1.0]``.
+            When set, chunks scoring below it are dropped, overriding the
+            ``quality``-derived relative policy. Useful for reproducible runs.
+        tool_threshold: Match cutoff for keeping a tool schema.
+        scorer: A custom :class:`RetentionScorer` (e.g. with an injected
+            embedder). If ``None``, ``RetentionScorer(embedder=embedder)`` is
+            constructed.
+        tool_filter: A custom :class:`ToolFilter`. If ``None``, one is created
+            from ``embedder`` and ``tool_threshold``.
+        embedder: Convenience way to inject one embedder into both the scorer
+            and the tool filter (ignored if explicit ``scorer``/``tool_filter``
+            are given).
+        use_headroom: Whether to run survivors through Headroom for mechanical
+            compression. Defaults to ``True``; silently no-ops if Headroom is
+            unavailable.
+        model: Optional model name forwarded to Headroom and the tokenizer.
+        keep_system: Always retain ``system`` role messages. Defaults to True.
+        keep_last: Always retain the final message (the current turn's input).
+            Defaults to True.
+    Raises:
+        ValueError: If ``quality`` or ``retention_threshold`` lies outside
+            ``[0.0, 1.0]``.
+    """
+    def __init__(
+        self,
+        quality: float = 0.85,
+        *,
+        retention_threshold: Optional[float] = None,
+        tool_threshold: float = 0.3,
+        scorer: Optional[RetentionScorer] = None,
+        tool_filter: Optional[ToolFilter] = None,
+        embedder: Optional[Any] = None,
+        use_headroom: bool = True,
+        model: Optional[str] = None,
+        keep_system: bool = True,
+        keep_last: bool = True,
+    ) -> None:
+        if not 0.0 <= quality <= 1.0:
+            raise ValueError("quality must be in [0.0, 1.0]")
+        if retention_threshold is not None and not 0.0 <= retention_threshold <= 1.0:
+            raise ValueError("retention_threshold must be in [0.0, 1.0]")
+        self.quality = quality
+        self.retention_threshold = retention_threshold
+        self.use_headroom = use_headroom
+        self.model = model
+        self.keep_system = keep_system
+        self.keep_last = keep_last
+        # Compare against None rather than relying on truthiness, so that a
+        # caller-supplied collaborator is never swapped out merely because it
+        # happens to evaluate as falsy.
+        self.scorer = scorer if scorer is not None else RetentionScorer(embedder=embedder)
+        self.tool_filter = (
+            tool_filter
+            if tool_filter is not None
+            else ToolFilter(embedder=embedder, threshold=tool_threshold)
+        )
+    # -- helpers ------------------------------------------------------------
+    @staticmethod
+    def _resolve_step(plan: PlanLike, current_step: Union[Step, str]) -> Step:
+        """Return ``current_step`` as a :class:`Step`, looking names up in ``plan``."""
+        if isinstance(current_step, Step):
+            return current_step
+        return plan.get(current_step)
+    @staticmethod
+    def _is_system(message: Any) -> bool:
+        """Return True for dict messages whose ``role`` is ``"system"``."""
+        return isinstance(message, dict) and message.get("role") == "system"
+    def _always_keep_mask(self, history: List[Any]) -> List[bool]:
+        """Flag the messages that must survive regardless of their score."""
+        n = len(history)
+        mask = [False] * n
+        for i, msg in enumerate(history):
+            if self.keep_system and self._is_system(msg):
+                mask[i] = True
+        if self.keep_last and n > 0:
+            mask[n - 1] = True
+        return mask
+    def _decide_retention(
+        self,
+        scores: List[float],
+        always_keep: List[bool],
+        has_future: bool,
+    ) -> List[bool]:
+        """Return a keep/drop flag per chunk from scores + policy.
+        Args:
+            scores: One retention score per chunk.
+            always_keep: Per-chunk flags forcing retention.
+            has_future: Whether any plan steps remain after the current one.
+        Returns:
+            A list with one boolean per entry of ``scores``.
+        """
+        n = len(scores)
+        if n == 0:
+            return []
+        # No future steps -> nothing to prune against; keep everything.
+        if not has_future:
+            return [True] * n
+        if self.retention_threshold is not None:
+            keep = [s >= self.retention_threshold for s in scores]
+        else:
+            # Relative policy: min-max normalize, then keep the top band as
+            # governed by `quality`. quality=0.85 -> keep normalized >= 0.15.
+            lo = min(scores)
+            hi = max(scores)
+            cutoff = 1.0 - self.quality
+            if hi - lo < 1e-9:
+                # All equal: a flat horizon. Keep them all rather than guess.
+                keep = [True] * n
+            else:
+                keep = [((s - lo) / (hi - lo)) >= cutoff for s in scores]
+        # Slicing keeps this safe if the mask and the scores differ in length.
+        for i, pinned in enumerate(always_keep[:n]):
+            if pinned:
+                keep[i] = True
+        return keep
+    def _apply_headroom(self, messages: List[Any]) -> List[Any]:
+        """Run messages through Headroom's ``compress`` if available.
+        Defensive by design: any import error, signature mismatch, or
+        unexpected return shape falls back to the input unchanged so that
+        retention-only compression still works.
+        """
+        if not self.use_headroom or not messages:
+            return messages
+        try:
+            from headroom import compress  # type: ignore
+        except Exception:
+            return messages
+        try:
+            if self.model:
+                try:
+                    result = compress(messages, model=self.model)
+                except TypeError:
+                    # Retry without the keyword in case this ``compress`` does
+                    # not accept ``model``. Only done when ``model`` was
+                    # actually passed, so a failing call is never repeated
+                    # with identical arguments.
+                    result = compress(messages)
+            else:
+                result = compress(messages)
+        except Exception:
+            # Deliberate best-effort: mechanical compression is optional.
+            return messages
+        return self._normalize_headroom_result(result, fallback=messages)
+    @staticmethod
+    def _normalize_headroom_result(result: Any, fallback: List[Any]) -> List[Any]:
+        """Extract a message list from ``result``, or return ``fallback``.
+        A list is returned as-is; otherwise the first list found under the
+        attribute or dict key ``messages``/``compressed``/``output``/``result``
+        is used.
+        """
+        if result is None:
+            return fallback
+        if isinstance(result, list):
+            return result
+        # Common attribute names across compression libraries.
+        for attr in ("messages", "compressed", "output", "result"):
+            value = getattr(result, attr, None)
+            if isinstance(value, list):
+                return value
+        if isinstance(result, dict):
+            for key in ("messages", "compressed", "output", "result"):
+                value = result.get(key)
+                if isinstance(value, list):
+                    return value
+        return fallback
+    # -- public API ---------------------------------------------------------
+    def compress(
+        self,
+        history: Sequence[Any],
+        tools: Sequence[Any],
+        plan: PlanLike,
+        current_step: Union[Step, str],
+    ) -> CompressedContext:
+        """Compress ``history`` and ``tools`` for the given step of ``plan``.
+        Args:
+            history: Prior chat messages (dicts with ``role``/``content``, or
+                plain strings). Each message is treated as one context chunk.
+                ``None`` is treated as an empty history.
+            tools: The full catalog of tool schemas available to the agent.
+                ``None`` is treated as an empty catalog.
+            plan: The :class:`Plan` (or :class:`PlanGraph`) being executed.
+            current_step: The step about to run — the pivot for "what's ahead".
+        Returns:
+            A :class:`CompressedContext` with lean ``messages``, filtered
+            ``tools``, and a :class:`TokenReport`.
+        """
+        history = list(history) if history is not None else []
+        tools = list(tools or [])
+        step = self._resolve_step(plan, current_step)
+        step_key = step.name if isinstance(current_step, Step) else str(current_step)
+        remaining_steps = plan.remaining_from(step_key)
+        has_future = len(remaining_steps) > 0
+        before_tokens = count_message_tokens(history, self.model) + count_tool_tokens(
+            tools, self.model
+        )
+        # 1) chunk + score against the forward horizon.
+        chunk_texts = [_message_text(m) for m in history]
+        always_keep = self._always_keep_mask(history)
+        if history:
+            chunk_scores = self.scorer.score(chunk_texts, remaining_steps)
+            scores = [cs.score for cs in chunk_scores]
+        else:
+            scores = []
+        # 2) decide retention.
+        keep_flags = self._decide_retention(scores, always_keep, has_future)
+        retained_messages: List[Any] = []
+        retained_scores: Dict[str, float] = {}
+        dropped: List[DroppedChunk] = []
+        for i, msg in enumerate(history):
+            source = f"message[{i}]"
+            score = scores[i] if i < len(scores) else None
+            # A message the scorer returned no score for has no retention
+            # decision either; keep it rather than drop it blind.
+            keep = keep_flags[i] if i < len(keep_flags) else True
+            if keep:
+                retained_messages.append(msg)
+                if score is not None:
+                    retained_scores[source] = round(score, 4)
+            else:
+                dropped.append(
+                    DroppedChunk(
+                        source=source,
+                        kind="message",
+                        # Only dropped chunks need a per-chunk token count.
+                        tokens_before=count_tokens(chunk_texts[i], self.model),
+                        tokens_after=0,
+                        score=round(score, 4) if score is not None else None,
+                        reason="below retention threshold for remaining plan steps",
+                    )
+                )
+        # 3) filter tools to the current step (deterministic, no LLM call).
+        tool_matches = self.tool_filter.match(tools, step)
+        kept_tools = [m.tool for m in tool_matches if m.kept]
+        for m in tool_matches:
+            if not m.kept:
+                dropped.append(
+                    DroppedChunk(
+                        source=f"tool:{m.name or '?'}",
+                        kind="tool",
+                        tokens_before=count_tokens(_tool_schema_text(m.tool), self.model),
+                        tokens_after=0,
+                        score=round(m.score, 4),
+                        reason="tool not relevant to current step",
+                    )
+                )
+        # 4) hand survivors to Headroom for mechanical compression.
+        compressed_messages = self._apply_headroom(retained_messages)
+        after_tokens = count_message_tokens(
+            compressed_messages, self.model
+        ) + count_tool_tokens(kept_tools, self.model)
+        report = TokenReport(
+            before=before_tokens,
+            after=after_tokens,
+            dropped=dropped,
+        )
+        return CompressedContext(
+            messages=compressed_messages,
+            tools=kept_tools,
+            report=report,
+            retained_scores=retained_scores,
+        )
+def _tool_schema_text(tool: Any) -> str:
+    """Return the text form of ``tool`` used when counting its tokens.
+    Delegates to ``_tool_text`` in the sibling ``context`` module; the import
+    is resolved at call time.
+    """
+    from . import context as _context
+    return _context._tool_text(tool)

memahead/context.py ADDED Viewed

@@ -0,0 +1,187 @@
+"""Result containers for compression: :class:`CompressedContext` and
+:class:`TokenReport`, plus a small token-estimation helper.
+These are intentionally dependency-light dataclasses so they can be passed
+around, serialized, and inspected without importing heavy ML packages.
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
+__all__ = ["count_tokens", "DroppedChunk", "TokenReport", "CompressedContext"]
+def count_tokens(text: str, model: Optional[str] = None) -> int:
+    """Estimate the number of tokens in ``text``.
+    Uses :mod:`tiktoken` when available for accuracy; otherwise falls back to
+    a fast heuristic (~4 characters per token). The heuristic keeps the
+    library usable with zero extra dependencies while still giving a stable
+    measure for reporting savings.
+    Text is always measured as ordinary content: strings that happen to
+    contain special-token markers such as ``<|endoftext|>`` are encoded by the
+    tokenizer instead of being rejected.
+    Args:
+        text: The text to measure.
+        model: Optional model name used to pick a tiktoken encoding. Unknown
+            names fall back to ``cl100k_base``.
+    Returns:
+        The estimated token count; ``0`` for empty or ``None`` input.
+    """
+    if not text:
+        return 0
+    try:
+        import tiktoken
+        try:
+            encoding = (
+                tiktoken.encoding_for_model(model)
+                if model
+                else tiktoken.get_encoding("cl100k_base")
+            )
+        except KeyError:
+            encoding = tiktoken.get_encoding("cl100k_base")
+        # By default tiktoken raises ValueError for text containing
+        # special-token markers. That error would be swallowed below and the
+        # chunk silently measured with the heuristic instead, making counts
+        # inconsistent. Disabling the check encodes the markers as plain text.
+        return len(encoding.encode(text, disallowed_special=()))
+    except Exception:
+        # Deliberately broad: tiktoken is optional and loading an encoding can
+        # fail for environmental reasons, so any failure uses the heuristic.
+        # Heuristic fallback: ~4 chars/token, with a floor of one token per word.
+        char_estimate = (len(text) + 3) // 4
+        word_estimate = len(text.split())
+        return max(char_estimate, word_estimate)
+def _message_text(message: Any) -> str:
+    """Extract the textual content from a chat message (dict or str).
+    Args:
+        message: A plain string, a dict with a ``content`` entry, or any other
+            object (which is stringified).
+    Returns:
+        The message text. A dict whose ``content`` is missing or ``None``
+        yields ``""``; list content is joined part by part with newlines.
+    """
+    if isinstance(message, str):
+        return message
+    if not isinstance(message, dict):
+        return str(message)
+    content = message.get("content")
+    if content is None:
+        # Covers both a missing key and an explicit ``None`` (as in assistant
+        # messages that only carry tool calls). Stringifying it would inject
+        # the literal text "None" into scoring and token counts.
+        return ""
+    if isinstance(content, str):
+        return content
+    # Content can be a list of parts (OpenAI-style multimodal blocks).
+    if isinstance(content, list):
+        parts: List[str] = []
+        for part in content:
+            if isinstance(part, str):
+                parts.append(part)
+            elif isinstance(part, dict):
+                text = part.get("text")
+                # Non-text parts (or a null text) contribute an empty line.
+                parts.append("" if text is None else str(text))
+        return "\n".join(parts)
+    return str(content)
+def _tool_text(tool: Any) -> str:
+    """Render a tool schema as text so its tokens can be counted.
+    Dicts are serialized as JSON with sorted keys (non-serializable values are
+    stringified), strings pass through unchanged, and anything else is
+    converted with ``str``.
+    """
+    if isinstance(tool, dict):
+        # The whole dict is serialized, so a bare schema and the OpenAI
+        # {"type","function":{...}} envelope are both counted in full.
+        import json
+        return json.dumps(tool, sort_keys=True, default=str)
+    return tool if isinstance(tool, str) else str(tool)
+def count_message_tokens(messages: List[Any], model: Optional[str] = None) -> int:
+    """Total estimated tokens across a list of chat messages.
+    Args:
+        messages: Chat messages (dicts or strings); ``None`` counts as empty,
+            matching how ``count_tool_tokens`` treats a missing tool list.
+        model: Optional model name forwarded to ``count_tokens``.
+    Returns:
+        The sum of the per-message token estimates.
+    """
+    return sum(count_tokens(_message_text(m), model) for m in (messages or []))
+def count_tool_tokens(tools: List[Any], model: Optional[str] = None) -> int:
+    """Sum the estimated token cost of every tool schema in ``tools``.
+    A falsy ``tools`` value (``None`` or an empty list) yields ``0``.
+    """
+    total = 0
+    for schema in tools or []:
+        total += count_tokens(_tool_text(schema), model)
+    return total
+@dataclass
+class DroppedChunk:
+    """One entry in the compression log: a chunk that was removed or shrunk.
+    Attributes:
+        source: Human-readable origin, e.g. ``"message[3]"`` or a tool name.
+        kind: Either ``"message"`` or ``"tool"``.
+        tokens_before: Token count of the chunk prior to compression.
+        tokens_after: Token count left afterwards (0 when fully dropped).
+        score: Forward-looking retention score (0.0–1.0), when one exists.
+        reason: Short explanation of why the chunk was dropped.
+    """
+    source: str
+    kind: str
+    tokens_before: int
+    tokens_after: int = 0
+    score: Optional[float] = None
+    reason: str = ""
+    @property
+    def tokens_saved(self) -> int:
+        """Tokens reclaimed from this chunk, never negative."""
+        reclaimed = self.tokens_before - self.tokens_after
+        return max(reclaimed, 0)
+@dataclass
+class TokenReport:
+    """Before/after token totals for one compression pass.
+    Attributes:
+        before: Token total (messages + tools) going in.
+        after: Token total (messages + tools) coming out.
+        dropped: Per-chunk records describing what was removed or shrunk.
+    """
+    before: int
+    after: int
+    dropped: List[DroppedChunk] = field(default_factory=list)
+    @property
+    def saved(self) -> int:
+        """Tokens saved overall, floored at zero."""
+        delta = self.before - self.after
+        return max(delta, 0)
+    @property
+    def compression_ratio(self) -> float:
+        """Share of the original tokens that was removed, in ``[0.0, 1.0]``.
+        ``0.0`` means no savings (also returned when ``before`` is not
+        positive); ``0.75`` means 75% fewer tokens. Rounded to 4 decimals.
+        """
+        return 0.0 if self.before <= 0 else round(self.saved / self.before, 4)
+    def dropped_sources(self) -> List[str]:
+        """List the ``source`` of every record whose ``tokens_after`` is 0."""
+        sources: List[str] = []
+        for record in self.dropped:
+            if record.tokens_after == 0:
+                sources.append(record.source)
+        return sources
+    def __repr__(self) -> str:
+        summary = (
+            f"before={self.before}",
+            f"after={self.after}",
+            f"saved={self.saved}",
+            f"compression_ratio={self.compression_ratio}",
+        )
+        return "TokenReport(" + ", ".join(summary) + ")"
+@dataclass
+class CompressedContext:
+    """Output of the compressor: the trimmed context plus its savings report.
+    Attributes:
+        messages: The compressed chat messages to send to the LLM.
+        tools: The tool schemas kept for the current step.
+        report: The :class:`TokenReport` for this compression pass.
+        retained_scores: Retention score per retained message source, handy
+            for debugging and evaluation.
+    """
+    messages: List[Any]
+    tools: List[Any]
+    report: TokenReport
+    retained_scores: Dict[str, float] = field(default_factory=dict)
+    def __repr__(self) -> str:
+        # Show counts rather than contents so the repr stays short.
+        n_messages = len(self.messages)
+        n_tools = len(self.tools)
+        return (
+            f"CompressedContext(messages={n_messages}, "
+            f"tools={n_tools}, report={self.report!r})"
+        )