dataact 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dataact/__init__.py ADDED
@@ -0,0 +1,31 @@
1
+ from dataact.agent import Agent
2
+ from dataact.exceptions import (
3
+ MaxTurnsExceeded,
4
+ SubagentRecursionError,
5
+ ToolNotFoundError,
6
+ )
7
+ from dataact.providers.base import NormalizedResponse, ProviderAdapter, StopReason
8
+ from dataact.types import (
9
+ ContentBlock,
10
+ Message,
11
+ TextBlock,
12
+ ToolResultBlock,
13
+ ToolSpec,
14
+ ToolUseBlock,
15
+ )
16
+
17
# Public API surface. The names are alphabetical; sorted() keeps them that
# way even if the source string is edited out of order.
__all__ = sorted(
    "Agent ContentBlock MaxTurnsExceeded Message NormalizedResponse "
    "ProviderAdapter StopReason SubagentRecursionError TextBlock "
    "ToolNotFoundError ToolResultBlock ToolSpec ToolUseBlock".split()
)
dataact/agent.py ADDED
@@ -0,0 +1,237 @@
1
+ """High-level `Agent` convenience layer.
2
+
3
+ `Agent` is a thin composition over `Harness`, `SessionCache`, and the built-in
4
+ tools. It exists so the quick-start example reads cleanly. The low-level
5
+ primitives remain the canonical teaching surface — `agent.explain()` returns a
6
+ sketch of the equivalent explicit wiring.
7
+
8
+ `Agent.run()` is one-shot: each call builds a fresh `Harness` and starts with a
9
+ new message history. It is not a chat session.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from collections.abc import Callable
15
+ from dataclasses import dataclass
16
+ from pathlib import Path
17
+ from typing import Any
18
+
19
+ from dataact.cache import SessionCache
20
+ from dataact.loop import Harness
21
+ from dataact.providers.base import ProviderAdapter
22
+ from dataact.schema import infer_input_schema
23
+ from dataact.tools.connectors import ConnectorRegistry
24
+ from dataact.tools.interpreter import PythonInterpreter
25
+ from dataact.tools.planner import Planner
26
+ from dataact.tools.subagent import make_subagent_spec
27
+ from dataact.tools.variables import make_list_variables_spec
28
+ from dataact.types import ToolSpec
29
+
30
+
31
@dataclass(frozen=True)
class _ConnectorToolDefinition:
    """One tool registered under a connector via `ConnectorBuilder.tool()`.

    Collected on the owning `Agent` and turned into a namespaced
    ``<connector>__<fn name>`` `ToolSpec` in `Agent._build_tools`.
    """

    # Name of the connector this tool belongs to (matches a key in
    # Agent._connectors).
    connector_name: str
    # The Python callable invoked when the tool is used.
    fn: Callable[..., Any]
    # Model-facing description supplied by the tool author.
    description: str
    # JSON-schema dict for the tool's input (explicit or inferred).
    input_schema: dict
37
+
38
+
39
@dataclass(frozen=True)
class _ConnectorDefinition:
    """Connector metadata declared via `Agent.connector()`."""

    # Connector name; its tools are namespaced "<name>__<fn>".
    name: str
    # Model-facing description passed to the ConnectorRegistry.
    description: str
43
+
44
+
45
class ConnectorBuilder:
    """Fluent helper returned by `Agent.connector()` for attaching tools.

    Each `tool()` call records a `_ConnectorToolDefinition` on the owning
    agent and hands the function back unchanged, so registration does not
    disturb normal use of the callable.
    """

    def __init__(self, agent: Agent, name: str) -> None:
        self._agent = agent
        self._name = name

    def tool(
        self,
        fn: Callable[..., Any],
        *,
        description: str,
        input_schema: dict | None = None,
    ) -> Callable[..., Any]:
        """Register *fn* as a tool on this connector; returns *fn* unchanged.

        When *input_schema* is omitted, one is inferred from the function's
        signature via `infer_input_schema`.
        """
        if input_schema is None:
            input_schema = infer_input_schema(fn)
        definition = _ConnectorToolDefinition(
            connector_name=self._name,
            fn=fn,
            description=description,
            input_schema=input_schema,
        )
        self._agent._connector_tools.append(definition)
        return fn
67
+
68
+
69
class Agent:
    """One-shot agent facade composing `Harness`, `SessionCache`, and tools.

    Each `run()` call constructs a fresh `Harness` with a fresh tool list;
    message history does not persist between calls. The `SessionCache`
    instance, however, is reused across runs (and shared with subagents via
    `parent_cache`), so stored variables survive.
    """

    def __init__(
        self,
        adapter: ProviderAdapter,
        system: str,
        *,
        max_turns: int = 25,
        cache: SessionCache | None = None,
        run_dir: str | Path | None = None,
    ) -> None:
        self._adapter = adapter
        self._system = system
        self._max_turns = max_turns
        # A caller-supplied cache is used as-is; otherwise the agent owns a
        # private in-memory SessionCache.
        self._cache = cache if cache is not None else SessionCache()
        self._run_dir = run_dir
        self._last_harness: Harness | None = None
        self._last_run_file: str | None = None
        # Declared connectors by name, plus the flat list of tool
        # definitions appended by ConnectorBuilder.tool().
        self._connectors: dict[str, _ConnectorDefinition] = {}
        self._connector_tools: list[_ConnectorToolDefinition] = []
        self._planner_enabled = False
        self._subagent_factory: Callable[[], ProviderAdapter] | None = None

    @property
    def cache(self) -> SessionCache:
        """The session cache shared across this agent's runs."""
        return self._cache

    @property
    def last_harness(self) -> Harness | None:
        """Harness built by the most recent `run()`, or None before any run."""
        return self._last_harness

    @property
    def last_run_file(self) -> str | None:
        """Path of the newest `.jsonl` found after the last `run()`, if any."""
        return self._last_run_file

    def connector(self, name: str, *, description: str) -> ConnectorBuilder:
        """Declare a connector and return a builder for attaching its tools.

        Re-declaring an existing *name* replaces its metadata; tools already
        registered under that name are kept.
        """
        self._connectors[name] = _ConnectorDefinition(
            name=name,
            description=description,
        )
        return ConnectorBuilder(self, name)

    def enable_planner(self) -> Agent:
        """Enable the Planner tools for subsequent runs; returns self."""
        self._planner_enabled = True
        return self

    def enable_subagents(
        self, *, adapter_factory: Callable[[], ProviderAdapter]
    ) -> Agent:
        """Enable the subagent tool; *adapter_factory* builds each subagent's
        provider adapter. Returns self for chaining."""
        self._subagent_factory = adapter_factory
        return self

    def run(self, user_message: str) -> str:
        """Execute one agent loop over *user_message* and return the result.

        Builds a fresh tool list and `Harness` on every call, then records
        the harness and the newest run-log file for later inspection.
        """
        planner = Planner() if self._planner_enabled else None
        tools = self._build_tools(planner=planner)
        if self._subagent_factory is not None:
            # Subagent tooling is built planner-free; make_sub_tools rebuilds
            # the tool list against each subagent's own cache.
            subagent_parent_tools = self._build_tools(planner=None)
            effective_run_dir = (
                str(self._run_dir) if self._run_dir is not None else "./runs"
            )
            tools.append(
                make_subagent_spec(
                    adapter_factory=self._subagent_factory,
                    parent_tools=subagent_parent_tools,
                    parent_cache=self._cache,
                    run_dir=effective_run_dir,
                    make_sub_tools=lambda sub_cache: self._build_tools(
                        planner=None,
                        cache=sub_cache,
                    ),
                )
            )
        harness_kwargs: dict = {
            "adapter": self._adapter,
            "system": self._system,
            "tools": tools,
            "max_turns": self._max_turns,
            "cache": self._cache,
        }
        # Only forward run_dir when explicitly set so Harness can apply its
        # own default otherwise.
        if self._run_dir is not None:
            harness_kwargs["run_dir"] = str(self._run_dir)

        harness = Harness(**harness_kwargs)
        if planner is not None:
            harness.register_reminder(planner.reminder_hook)
        self._last_harness = harness
        result = harness.run(user_message)
        # Newest jsonl in the run dir belongs to this run
        # NOTE(review): this reaches into Harness._run_dir (private) and
        # assumes the most recently modified *.jsonl is this run's log —
        # racy if runs share a directory; confirm against Harness.
        run_dir = Path(harness._run_dir)
        files = sorted(run_dir.glob("*.jsonl"), key=lambda f: f.stat().st_mtime)
        if files:
            self._last_run_file = str(files[-1])
        return result

    def explain(self) -> str:
        """Return a sketch of the explicit wiring equivalent to this Agent."""
        return _EXPLAIN_TEMPLATE.format(
            system=_truncate(self._system),
            max_turns=self._max_turns,
            run_dir=self._run_dir if self._run_dir is not None else "./runs",
        )

    def _build_tools(
        self,
        *,
        planner: Planner | None = None,
        cache: SessionCache | None = None,
    ) -> list[ToolSpec]:
        """Assemble the ToolSpec list for one run.

        *cache* overrides the agent's own cache (used to build tool sets
        bound to a subagent's cache). Connector tools are registered under
        namespaced names ("<connector>__<fn>") with ``visible=False`` —
        presumably surfaced through the registry's load_connectors flow;
        confirm against ConnectorRegistry.
        """
        target_cache = cache if cache is not None else self._cache
        tools = [
            PythonInterpreter.make_tool_spec(target_cache),
            make_list_variables_spec(target_cache),
        ]
        if planner is not None:
            tools.extend(planner.make_tool_specs())
        if self._connectors:
            registry = ConnectorRegistry()
            for connector_name, connector in self._connectors.items():
                registry.register(
                    name=connector_name,
                    description=connector.description,
                    tools=[
                        ToolSpec(
                            name=f"{connector_name}__{definition.fn.__name__}",
                            description=definition.description,
                            input_schema=definition.input_schema,
                            handler=definition.fn,
                            visible=False,
                        )
                        for definition in self._connector_tools
                        if definition.connector_name == connector_name
                    ],
                )
            tools.append(registry.get_load_connectors_spec())
            tools.extend(registry.make_wrapped_specs(target_cache))
        return tools
203
+
204
+
205
# Template returned by Agent.explain(). Placeholders filled via .format():
# {system!r}, {max_turns}, {run_dir!r}. This is a runtime string shown to
# users — keep its text byte-stable.
_EXPLAIN_TEMPLATE = """\
Agent is a thin composition layer. The equivalent explicit wiring is:

    from dataact.cache import SessionCache
    from dataact.loop import Harness
    from dataact.tools.interpreter import PythonInterpreter
    from dataact.tools.variables import make_list_variables_spec

    cache = SessionCache()
    tools = [
        PythonInterpreter.make_tool_spec(cache),
        make_list_variables_spec(cache),
    ]
    harness = Harness(
        adapter=adapter,
        system={system!r},
        tools=tools,
        max_turns={max_turns},
        run_dir={run_dir!r},
        cache=cache,
    )
    harness.run(user_message)

Each call to Agent.run() builds a fresh Harness with fresh tool specs.
Model-visible tools include python_interpreter and list_variables.
The message history resets per run; this is not a chat session.
"""
232
+
233
+
234
+ def _truncate(text: str, limit: int = 80) -> str:
235
+ if len(text) <= limit:
236
+ return text
237
+ return text[: limit - 1] + "…"
dataact/cache.py ADDED
@@ -0,0 +1,319 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import keyword
5
+ import pickle
6
+ import re
7
+ import tempfile
8
+ from collections import OrderedDict
9
+ from dataclasses import dataclass
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+ _VALID_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*$")
14
+
15
+
16
+ def _is_valid_identifier(name: str) -> bool:
17
+ return bool(_VALID_IDENTIFIER.match(name)) and not keyword.iskeyword(name)
18
+
19
+
20
@dataclass(frozen=True)
class _ColdEntry:
    """Record of a value spilled from the hot store to disk."""

    # Location of the serialized value on disk.
    path: Path
    # Serialization format tag — "numpy_npy", "dataframe_parquet",
    # "dataframe_pickle", or "pickle"; selects the reader in _read_cold.
    storage_type: str
24
+
25
+
26
class SessionCache:
    """Named handle store with optional LRU spill-to-disk.

    Values live in an in-memory "hot" dict. When `hot_limit` is set (or a
    `storage_dir` is supplied, which implies a default limit of 10), the
    least-recently-used values are serialized to disk ("cold") and lazily
    promoted back into memory on access. Each handle also carries a small
    JSON "snapshot" summary computed at `put` time.
    """

    def __init__(
        self,
        sample_size: int = 5,
        storage_dir: str | Path | None = None,
        hot_limit: int | None = None,
    ) -> None:
        if hot_limit is not None and hot_limit < 1:
            raise ValueError("hot_limit must be at least 1")
        # Number of elements/rows/keys included in each snapshot sample.
        self.sample_size = sample_size
        self.hot_limit = hot_limit
        # Hot (in-memory) values, by handle name.
        self._store: dict[str, Any] = {}
        # Spilled values, by handle name.
        self._cold: dict[str, _ColdEntry] = {}
        # Cached JSON snapshot per handle; set on put, cleared on delete —
        # not refreshed if a stored value is later mutated in place.
        self._snapshots: dict[str, str] = {}
        # Recency order over all handles: least-recent first, newest last.
        self._recency: OrderedDict[str, None] = OrderedDict()
        self._temp_dir: tempfile.TemporaryDirectory[str] | None = None
        if storage_dir is None and hot_limit is not None:
            # Spilling requested without an explicit directory: use a
            # temporary directory cleaned up in close().
            self._temp_dir = tempfile.TemporaryDirectory(prefix="dataact-cache-")
            self._storage_dir = Path(self._temp_dir.name)
        elif storage_dir is not None:
            self._storage_dir = Path(storage_dir)
            self._storage_dir.mkdir(parents=True, exist_ok=True)
            if self.hot_limit is None:
                # Supplying storage_dir opts into disk-backed cache behaviour.
                # Keep the default bounded so a caller does not create a spill
                # directory that is never used.
                self.hot_limit = 10
        else:
            # Pure in-memory cache: _enforce_hot_limit becomes a no-op.
            self._storage_dir = None

    def put(self, name: str, value: Any, overwrite: bool = False) -> str:
        """Store *value* and return the handle name actually used.

        Raises ValueError if *name* is not a valid Python identifier. On
        collision without *overwrite*, a numeric suffix is appended
        ("name_2", "name_3", ...) until a free handle is found.
        """
        if not _is_valid_identifier(name):
            raise ValueError(
                f"Invalid handle name: {name!r}. Must be a valid Python identifier."
            )
        if overwrite or not self.has_handle(name):
            if overwrite:
                # Drop any stale spill file before the value is replaced.
                self._delete_cold(name)
            self._put_resolved(name, value)
            return name
        # Auto-suffix on collision
        suffix = 2
        while True:
            candidate = f"{name}_{suffix}"
            if not self.has_handle(candidate):
                self._put_resolved(candidate, value)
                return candidate
            suffix += 1

    def get(self, name: str) -> Any:
        """Return the value for *name*, promoting it from disk if spilled.

        Accessing a handle marks it most-recently-used. Raises KeyError for
        unknown handles.
        """
        if name in self._store:
            self._mark_recent(name)
            return self._store[name]
        if name in self._cold:
            # Promote: read from disk, remove the spill file, re-insert as
            # hot, then re-apply the limit (may spill a colder handle).
            value = self._read_cold(name)
            self._delete_cold(name)
            self._store[name] = value
            self._mark_recent(name)
            self._enforce_hot_limit()
            return value
        raise KeyError(name)

    def snapshot(self, handle: str) -> str:
        """Return the cached JSON summary for *handle*, computing it once.

        NOTE(review): snapshots are populated at put time and never
        invalidated on in-place mutation, so they can go stale.
        """
        if handle in self._snapshots:
            return self._snapshots[handle]
        value = self.get(handle)
        snapshot = self._make_snapshot(value)
        self._snapshots[handle] = snapshot
        return snapshot

    def list_handles(self) -> dict[str, str]:
        """Map every handle name to its JSON snapshot."""
        return {name: self.snapshot(name) for name in self.handle_names()}

    def handle_names(self) -> list[str]:
        """All handle names (hot and cold), least-recently-used first."""
        return list(self._recency.keys())

    def has_handle(self, name: str) -> bool:
        """True if *name* exists in either the hot store or on disk."""
        return name in self._store or name in self._cold

    def items(self):
        """Yield (name, value) pairs; promotes cold entries via get()."""
        for name in self.handle_names():
            yield name, self.get(name)

    def storage_metadata(
        self, *, include_paths: bool = False
    ) -> dict[str, dict[str, str]]:
        """Describe where each handle currently lives.

        Returns per-handle dicts with "location" ("memory"/"disk") and
        "storage_type"; disk entries also get "path" when *include_paths*.
        """
        metadata = {}
        for name in self.handle_names():
            if name in self._cold:
                entry = self._cold[name]
                metadata[name] = {
                    "location": "disk",
                    "storage_type": entry.storage_type,
                }
                if include_paths:
                    metadata[name]["path"] = str(entry.path)
            else:
                metadata[name] = {"location": "memory", "storage_type": "hot"}
        return metadata

    def delete(self, name: str) -> None:
        """Remove *name* everywhere (memory, disk, snapshot, recency).

        Raises KeyError if the handle does not exist.
        """
        if not self.has_handle(name):
            raise KeyError(name)
        self._store.pop(name, None)
        self._delete_cold(name)
        self._snapshots.pop(name, None)
        self._recency.pop(name, None)

    def close(self) -> None:
        """Release the temporary spill directory, if one was created."""
        if self._temp_dir is not None:
            self._temp_dir.cleanup()
            self._temp_dir = None

    def __del__(self) -> None:
        # Best-effort cleanup if the caller forgot to close().
        self.close()

    def _put_resolved(self, name: str, value: Any) -> None:
        # Store under a name already known to be free (or to be clobbered),
        # snapshot eagerly, then apply the hot limit.
        self._store[name] = value
        self._snapshots[name] = self._make_snapshot(value)
        self._mark_recent(name)
        self._enforce_hot_limit()

    def _mark_recent(self, name: str) -> None:
        # Insert if absent, then move to the most-recent end.
        self._recency[name] = None
        self._recency.move_to_end(name)

    def _enforce_hot_limit(self) -> None:
        """Spill least-recently-used hot values until the limit holds."""
        if self._storage_dir is None or self.hot_limit is None:
            return
        while len(self._store) > self.hot_limit:
            # _recency also lists cold handles; find the oldest hot one.
            for candidate in self._recency:
                if candidate in self._store:
                    self._spill(candidate)
                    break
            else:
                break

    def _spill(self, name: str) -> None:
        # Move one value from the hot store to disk.
        value = self._store.pop(name)
        self._cold[name] = self._write_cold(name, value)

    def _write_cold(self, name: str, value: Any) -> _ColdEntry:
        """Serialize *value* to disk, choosing the best available format.

        Preference order: .npy for non-object ndarrays, parquet for
        DataFrames (pickle fallback), generic pickle otherwise. numpy and
        pandas are imported lazily so neither is a hard dependency.
        """
        if self._storage_dir is None:
            raise RuntimeError("storage_dir is required for disk-backed cache")

        try:
            import numpy as np

            if isinstance(value, np.ndarray) and value.dtype != object:
                path = self._storage_dir / f"{name}.npy"
                with path.open("wb") as fh:
                    np.save(fh, value, allow_pickle=False)
                return _ColdEntry(path=path, storage_type="numpy_npy")
        except ImportError:
            pass

        try:
            import pandas as pd

            if isinstance(value, pd.DataFrame):
                parquet_path = self._storage_dir / f"{name}.parquet"
                try:
                    value.to_parquet(parquet_path, index=False)
                    return _ColdEntry(
                        path=parquet_path,
                        storage_type="dataframe_parquet",
                    )
                except (ImportError, TypeError, ValueError):
                    # Parquet is the preferred teaching path, but pyarrow /
                    # fastparquet are not core dependencies for this reference
                    # implementation. Fall back explicitly rather than adding a
                    # heavy storage dependency to the default install.
                    pass

                path = self._storage_dir / f"{name}.pkl"
                value.to_pickle(path)
                return _ColdEntry(path=path, storage_type="dataframe_pickle")
        except ImportError:
            pass

        path = self._storage_dir / f"{name}.pickle"
        with path.open("wb") as fh:
            pickle.dump(value, fh, protocol=pickle.HIGHEST_PROTOCOL)
        return _ColdEntry(path=path, storage_type="pickle")

    def _read_cold(self, name: str) -> Any:
        """Deserialize a spilled value using its storage_type tag."""
        entry = self._cold[name]
        if entry.storage_type == "numpy_npy":
            import numpy as np

            with entry.path.open("rb") as fh:
                return np.load(fh, allow_pickle=False)
        if entry.storage_type == "dataframe_parquet":
            import pandas as pd

            return pd.read_parquet(entry.path)
        # Also accepts the legacy "pandas_pickle" tag.
        if entry.storage_type in {"dataframe_pickle", "pandas_pickle"}:
            import pandas as pd

            return pd.read_pickle(entry.path)
        with entry.path.open("rb") as fh:
            return pickle.load(fh)

    def _delete_cold(self, name: str) -> None:
        # Forget the cold entry and remove its file; a missing file is fine.
        entry = self._cold.pop(name, None)
        if entry is not None:
            try:
                entry.path.unlink()
            except FileNotFoundError:
                pass

    def _make_snapshot(self, value: Any) -> str:
        """Build a small JSON (or repr) summary of *value* by type."""
        try:
            import pandas as pd

            if isinstance(value, pd.DataFrame):
                return self._snapshot_dataframe(value)
        except ImportError:
            pass

        try:
            import numpy as np

            if isinstance(value, np.ndarray):
                return self._snapshot_ndarray(value)
        except ImportError:
            pass

        if isinstance(value, list):
            return self._snapshot_list(value)
        if isinstance(value, dict):
            return self._snapshot_dict(value)
        # Scalar
        return f"value: {value!r}"

    def _snapshot_dataframe(self, df) -> str:
        # Shape, columns, and the first sample_size rows as records.
        cols = list(df.columns)
        shape = list(df.shape)
        sample = df.head(self.sample_size).to_dict(orient="records")
        return json.dumps(
            {
                "type": "dataframe",
                "shape": shape,
                "columns": cols,
                "sample": sample,
            },
            default=str,
        )

    def _snapshot_ndarray(self, arr) -> str:
        # First sample_size elements in flat (row-major) order, converted
        # to native Python scalars where possible.
        flat = arr.flat
        sample = [
            x.item() if hasattr(x, "item") else x
            for _, x in zip(range(self.sample_size), flat)
        ]
        return json.dumps(
            {
                "type": "ndarray",
                "shape": list(arr.shape),
                "dtype": str(arr.dtype),
                "sample": sample,
            }
        )

    def _snapshot_list(self, lst: list) -> str:
        sample = lst[: self.sample_size]
        # Round-trip through JSON so non-serializable samples degrade to a
        # repr string instead of raising.
        try:
            sample_json = json.dumps(sample)
            sample_value = json.loads(sample_json)
        except Exception:
            sample_value = repr(sample)
        return json.dumps(
            {
                "type": "list",
                "length": len(lst),
                "sample": sample_value,
            }
        )

    def _snapshot_dict(self, d: dict) -> str:
        keys = list(d.keys())[: self.sample_size]
        sample = {k: d[k] for k in keys}
        # default=repr keeps odd value types serializable; the outer except
        # guards against unserializable keys.
        try:
            sample_str = json.dumps(sample, default=repr)
        except Exception:
            sample_str = repr(sample)
        return json.dumps(
            {
                "type": "dict",
                "total_keys": len(d),
                "sample_keys": keys,
                "sample": json.loads(sample_str),
            }
        )
dataact/exceptions.py ADDED
@@ -0,0 +1,21 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ if TYPE_CHECKING:
6
+ from dataact.providers.base import NormalizedResponse
7
+
8
+
9
class MaxTurnsExceeded(RuntimeError):
    """Raised when the agent loop exhausts its turn budget.

    Carries the number of turns consumed and, when available, the last
    provider response seen before the limit was hit.
    """

    def __init__(self, turns: int, last_response: "NormalizedResponse | None" = None):
        super().__init__(f"Max turns exceeded: {turns}")
        self.turns = turns
        self.last_response = last_response
14
+
15
+
16
class ToolNotFoundError(KeyError):
    """Raised when a requested tool name has no registered handler.

    Subclasses KeyError, so lookup-style ``except KeyError`` also catches
    it. NOTE(review): KeyError's str() wraps the message in quotes — use
    repr/args when formatting.
    """

    pass
18
+
19
+
20
class SubagentRecursionError(RuntimeError):
    """Raised when a subagent attempts to spawn further subagents —
    presumably to keep delegation one level deep; confirm against the
    subagent tool implementation."""

    pass