dataact 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataact/__init__.py +31 -0
- dataact/agent.py +237 -0
- dataact/cache.py +319 -0
- dataact/exceptions.py +21 -0
- dataact/format.py +108 -0
- dataact/logger.py +66 -0
- dataact/loop.py +153 -0
- dataact/observe.py +31 -0
- dataact/providers/__init__.py +0 -0
- dataact/providers/anthropic.py +112 -0
- dataact/providers/base.py +35 -0
- dataact/providers/openai.py +125 -0
- dataact/schema.py +79 -0
- dataact/serialize.py +111 -0
- dataact/testing.py +70 -0
- dataact/tools/__init__.py +0 -0
- dataact/tools/connectors.py +129 -0
- dataact/tools/interpreter.py +189 -0
- dataact/tools/planner.py +107 -0
- dataact/tools/subagent.py +222 -0
- dataact/tools/variables.py +25 -0
- dataact/types.py +54 -0
- dataact-0.1.0.dist-info/METADATA +212 -0
- dataact-0.1.0.dist-info/RECORD +26 -0
- dataact-0.1.0.dist-info/WHEEL +4 -0
- dataact-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import copy
|
|
4
|
+
import dataclasses
|
|
5
|
+
from typing import Callable
|
|
6
|
+
|
|
7
|
+
from dataact.cache import SessionCache
|
|
8
|
+
from dataact.providers.base import ProviderAdapter
|
|
9
|
+
from dataact.tools.interpreter import PythonInterpreter
|
|
10
|
+
from dataact.tools.variables import make_list_variables_spec
|
|
11
|
+
from dataact.types import ToolSpec
|
|
12
|
+
|
|
13
|
+
# Registered name of the subagent tool. Also used to filter this tool out of
# the sub-tool list so a spawned subagent cannot recursively spawn more.
_SUBAGENT_TOOL_NAME = "subagent"

# System prompt for the spawned worker. Formatted with the task text and the
# names of the handles copied into the worker's fresh cache; the template
# body is model-facing runtime text and must not be reworded casually.
_WORKER_SYSTEM_TEMPLATE = """\
You are a clean-context worker invoked by another agent.

Your task: {task}

Available input handles (already loaded into your cache): {input_handles}

Use `python_interpreter` to inspect cached handles. Call `save(name, value)` for any
computed artifact worth returning. You must produce final text summarizing your
findings. If you save artifacts, mention what they contain and why they matter."""
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def make_subagent_spec(
    adapter_factory: Callable[[], ProviderAdapter],
    parent_tools: list[ToolSpec],
    parent_cache: SessionCache,
    run_dir: str = "./runs",
    get_sub_cache: Callable[[], SessionCache] | None = None,
    make_sub_tools: Callable[[SessionCache], list[ToolSpec]] | None = None,
) -> ToolSpec:
    """Create a subagent tool with an explicit cache boundary.

    The returned ToolSpec's handler spawns a fresh adapter, a fresh
    SessionCache, and a fresh Harness per call, copying only the requested
    input handles across the boundary. All failures are reported as error
    strings (not raised), per the tool-result convention used by the harness.

    Args:
        adapter_factory: Called once per spawn so the subagent never shares
            provider state or message history with the parent.
        parent_tools: Parent tool list; used by the fallback copy path.
        parent_cache: Cache that input handles are read from and (under
            ``output_policy="publish_created"``) outputs are written back to.
        run_dir: Directory handed to the subagent Harness for run logs.
        get_sub_cache: Optional factory for the subagent cache; default is a
            fresh SessionCache with the parent's sample_size.
        make_sub_tools: Optional factory that rebuilds cache-bound tool
            specs against the subagent cache.

    If parent_tools include cache-bound wrappers such as ConnectorRegistry
    wrapped specs, pass make_sub_tools so those handlers can be rebuilt against
    the subagent cache. The fallback path only copies cache-independent tools
    and the built-in cache tools it knows how to rebind.
    """

    def subagent(
        task: str,
        input_handles: list[str] | None = None,
        output_policy: str = "text_only",
    ) -> str:
        # Local import to avoid a circular dependency (loop imports tools).
        from dataact.loop import Harness

        # Validate input_handles against parent cache
        if input_handles:
            missing = [h for h in input_handles if not parent_cache.has_handle(h)]
            if missing:
                return f"Error: input handles not found in parent cache: {missing}"

        # Build sub-cache
        if get_sub_cache is not None:
            sub_cache = get_sub_cache()
        else:
            sub_cache = SessionCache(sample_size=parent_cache.sample_size)

        # Copy requested handles into sub-cache. Values are deep-copied so
        # subagent mutations cannot bleed back into the parent's objects.
        if input_handles:
            for handle in input_handles:
                try:
                    sub_cache.put(handle, _copy_cache_value(parent_cache.get(handle)))
                except Exception as exc:
                    return (
                        "Error: failed to copy input handle "
                        f"{handle!r}: {type(exc).__name__}: {exc}"
                    )

        # Track pre-run handles to detect newly created ones. Taken AFTER the
        # input copy so inputs are never re-published as "new" outputs.
        pre_run_handles = set(sub_cache.handle_names())

        # Build sub-tools — exclude subagent to prevent recursion
        try:
            if make_sub_tools is not None:
                sub_tools = [
                    dataclasses.replace(t)
                    for t in make_sub_tools(sub_cache)
                    if t.name != _SUBAGENT_TOOL_NAME
                ]
            else:
                # Fallback: may raise ValueError for tools it cannot rebind.
                sub_tools = [
                    _copy_tool_for_subcache(t, sub_cache)
                    for t in parent_tools
                    if t.name != _SUBAGENT_TOOL_NAME
                ]
        except ValueError as exc:
            return f"Error: subagent tools are not isolated: {exc}"

        # Build system prompt
        handles_str = str(input_handles) if input_handles else "none"
        system = _WORKER_SYSTEM_TEMPLATE.format(task=task, input_handles=handles_str)

        # Spawn fresh adapter
        sub_adapter = adapter_factory()

        sub_harness = Harness(
            adapter=sub_adapter,
            system=system,
            tools=sub_tools,
            run_dir=run_dir,
            cache=sub_cache,
        )

        try:
            final_text = sub_harness.run(task)
        except Exception as exc:
            return f"Error: subagent failed: {type(exc).__name__}: {exc}"

        if output_policy == "text_only":
            return f"Subagent final output:\n{final_text}"

        # publish_created: find newly created handles
        new_handles = {}
        for name in sub_cache.handle_names():
            if name in pre_run_handles:
                continue
            new_handles[name] = sub_cache.get(name)

        if not new_handles:
            return f"Subagent final output:\n{final_text}\n\nPublished outputs: none"

        # No copy on publish: sub_cache is discarded after this call, so the
        # parent becomes the sole owner of the published values.
        published_lines = []
        for sub_name, value in new_handles.items():
            parent_name = parent_cache.put(sub_name, value)
            snap = parent_cache.snapshot(parent_name)
            published_lines.append(f"- {sub_name} -> {parent_name}\n Snapshot: {snap}")

        published_str = "\n".join(published_lines)
        return (
            f"Subagent final output:\n{final_text}\n\n"
            f"Published outputs:\n{published_str}"
        )

    return ToolSpec(
        name=_SUBAGENT_TOOL_NAME,
        description=(
            "Spawn a clean-context subagent to handle a subtask. "
            "The subagent has fresh message history and session cache. "
            "Use input_handles to pass data from this cache to the subagent."
        ),
        input_schema={
            "type": "object",
            "properties": {
                "task": {
                    "type": "string",
                    "description": "Natural-language instruction for the subagent.",
                },
                "input_handles": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": (
                        "Parent cache handle names to copy into the subagent's cache."
                    ),
                },
                "output_policy": {
                    "type": "string",
                    "enum": ["text_only", "publish_created"],
                    "description": (
                        "'text_only': return only the subagent's final text. "
                        "'publish_created': also copy newly-created handles back to"
                        " parent cache."
                    ),
                },
            },
            "required": ["task"],
        },
        handler=subagent,
    )
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def _copy_tool_for_subcache(tool: ToolSpec, sub_cache: SessionCache) -> ToolSpec:
    """Clone one parent tool for the subagent, rebinding known cache-bound tools.

    Built-in cache tools are rebuilt against ``sub_cache``. Any other tool
    whose handler closes over a SessionCache is rejected: copying it would
    silently leak the parent cache across the isolation boundary.
    """
    # Dispatch table of built-ins we know how to rebuild for another cache.
    rebuilders: dict[str, Callable[[SessionCache], ToolSpec]] = {
        "python_interpreter": PythonInterpreter.make_tool_spec,
        "list_variables": make_list_variables_spec,
    }
    rebuild = rebuilders.get(tool.name)
    if rebuild is not None:
        return rebuild(sub_cache)

    if _handler_closes_over_cache(tool.handler):
        raise ValueError(
            f"{tool.name!r} has a handler closed over a SessionCache. "
            "Pass make_sub_tools to rebuild cache-bound tool specs for the "
            "subagent cache."
        )

    # Cache-independent tool: a shallow dataclass copy is sufficient.
    return dataclasses.replace(tool)
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def _handler_closes_over_cache(handler) -> bool:
|
|
190
|
+
closure = getattr(handler, "__closure__", None)
|
|
191
|
+
if not closure:
|
|
192
|
+
return False
|
|
193
|
+
for cell in closure:
|
|
194
|
+
try:
|
|
195
|
+
if isinstance(cell.cell_contents, SessionCache):
|
|
196
|
+
return True
|
|
197
|
+
except ValueError:
|
|
198
|
+
continue
|
|
199
|
+
return False
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _copy_cache_value(value):
|
|
203
|
+
try:
|
|
204
|
+
import pandas as pd
|
|
205
|
+
|
|
206
|
+
if isinstance(value, pd.DataFrame):
|
|
207
|
+
# Deep copy is deliberate: shallow DataFrame copies can still share
|
|
208
|
+
# underlying blocks, which would break the parent/subagent boundary
|
|
209
|
+
# for representative in-place mutations.
|
|
210
|
+
return value.copy(deep=True)
|
|
211
|
+
except ImportError:
|
|
212
|
+
pass
|
|
213
|
+
|
|
214
|
+
try:
|
|
215
|
+
import numpy as np
|
|
216
|
+
|
|
217
|
+
if isinstance(value, np.ndarray):
|
|
218
|
+
return value.copy()
|
|
219
|
+
except ImportError:
|
|
220
|
+
pass
|
|
221
|
+
|
|
222
|
+
return copy.deepcopy(value)
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataact.cache import SessionCache
|
|
4
|
+
from dataact.types import ToolSpec
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def make_list_variables_spec(cache: SessionCache) -> ToolSpec:
    """Build the `list_variables` tool bound to *cache*.

    The tool reports handle names plus their compact snapshots — never the
    raw payloads — so the model can see what exists without flooding context.
    """

    def list_variables() -> str:
        handles = cache.list_handles()
        if not handles:
            return "No variables in session cache."
        header = f"Session cache ({len(handles)} handle(s)):"
        entries = [f"\n {name}:\n {snapshot}" for name, snapshot in handles.items()]
        return "\n".join([header, *entries])

    return ToolSpec(
        name="list_variables",
        description=(
            "List all variables currently stored in the session cache"
            " with their snapshots."
        ),
        input_schema={"type": "object", "properties": {}},
        handler=list_variables,
    )
|
dataact/types.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Any, Callable, Literal
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class TextBlock:
    """A plain-text content block in a provider-normalised message."""

    # The text payload.
    text: str
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class ToolUseBlock:
    """A model request to invoke a tool, normalised across providers."""

    tool_use_id: str  # provider-assigned id linking the result back to this call
    tool_name: str  # name of the ToolSpec being invoked
    tool_input: dict  # parsed arguments for the tool call
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class ToolResultBlock:
    """The outcome of a tool invocation, echoed back to the model."""

    tool_use_id: str  # matches the ToolUseBlock this result answers
    content: str  # tool output rendered as text
    is_error: bool = False  # True when the tool failed
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Union of every block type that may appear in Message.content.
ContentBlock = TextBlock | ToolUseBlock | ToolResultBlock
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class Message:
    """One conversational turn: a role plus an ordered list of content blocks."""

    role: Literal["user", "assistant"]
    content: list[ContentBlock]

    def __post_init__(self) -> None:
        # Literal[...] is not enforced at runtime, so validate eagerly here.
        if self.role in ("user", "assistant"):
            return
        raise ValueError(
            f"Invalid role: {self.role!r}. Must be 'user' or 'assistant'."
        )
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class ToolSpec:
    """Declarative description of a tool the model may call."""

    name: str  # unique tool name shown to the model
    description: str  # model-facing description of what the tool does
    input_schema: dict  # JSON-schema for the tool's arguments
    handler: Callable[..., Any] | None = None  # local callable that executes the tool
    visible: bool = True  # hidden tools are withheld from the provider tool list

    def to_provider_dict(self) -> dict:
        """Serialise the provider-facing fields; handler/visible stay local."""
        return dict(
            name=self.name,
            description=self.description,
            input_schema=self.input_schema,
        )
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dataact
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Data agent with Python-native tools (no bash)
|
|
5
|
+
Project-URL: Homepage, https://github.com/maxkskhor/dataact
|
|
6
|
+
Project-URL: Repository, https://github.com/maxkskhor/dataact
|
|
7
|
+
Project-URL: Issues, https://github.com/maxkskhor/dataact/issues
|
|
8
|
+
Author: Max Khor
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Requires-Python: >=3.10
|
|
12
|
+
Requires-Dist: anthropic
|
|
13
|
+
Requires-Dist: loguru
|
|
14
|
+
Requires-Dist: numpy
|
|
15
|
+
Requires-Dist: pandas
|
|
16
|
+
Provides-Extra: dev
|
|
17
|
+
Requires-Dist: openai; extra == 'dev'
|
|
18
|
+
Requires-Dist: pytest; extra == 'dev'
|
|
19
|
+
Requires-Dist: pytest-mock; extra == 'dev'
|
|
20
|
+
Requires-Dist: python-dotenv; extra == 'dev'
|
|
21
|
+
Provides-Extra: openai
|
|
22
|
+
Requires-Dist: openai; extra == 'openai'
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
|
|
25
|
+
# dataact
|
|
26
|
+
|
|
27
|
+
*(data + ReAct — a ReAct agent harness built for data workflows)*
|
|
28
|
+
|
|
29
|
+
A minimal, transparent, data-native agent harness for Python — built without bash.
|
|
30
|
+
|
|
31
|
+
Most agent frameworks hand the model a shell and call it a day. `dataact` takes a different approach: the model operates entirely through a sandboxed Python interpreter, with data stored in a session cache and exposed as named handles. No bash. No framework magic. Just a loop you can read in an afternoon.
|
|
32
|
+
|
|
33
|
+
Built as an installable reference implementation for engineers who want to understand how a production-style harness actually works. It is not a polished SDK surface; the convenience API exists to remove setup noise while keeping the harness boundaries visible.
|
|
34
|
+
|
|
35
|
+
The design is covered in a three-part series:
|
|
36
|
+
|
|
37
|
+
- [Designing a ReAct Harness for Data Workflows Without Bash](https://maxkskhor.substack.com/p/designing-a-react-harness-for-data)
|
|
38
|
+
- [How a Bash-Free Data Agent Remembers Its Work](https://maxkskhor.substack.com/p/how-a-bash-free-data-agent-remembers)
|
|
39
|
+
- [The Bugs Hidden Inside a Data Agent Harness](https://maxkskhor.substack.com/p/the-engineering-invariants-behind)
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## Why no bash?
|
|
44
|
+
|
|
45
|
+
Giving an agent shell access is the path of least resistance, but it creates real problems in production: unpredictable side effects, security exposure, and behaviour that's hard to reproduce. `dataact` deliberately constrains the model to Python only — which turns out to be enough for most data workloads and forces cleaner tool design.
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## Core design decisions
|
|
50
|
+
|
|
51
|
+
Each decision here is intentional. Understanding them is the point.
|
|
52
|
+
|
|
53
|
+
**Handle/snapshot pattern**
|
|
54
|
+
Large objects (DataFrames, arrays, query results) live in a `SessionCache`, not in message history. The model only sees a compact snapshot — shape, columns, a few sample rows. It accesses the data by writing Python against the handle name. This keeps context lean without hiding data from the model.
|
|
55
|
+
|
|
56
|
+
**Prefix-stable system prompt**
|
|
57
|
+
The system prompt never changes between turns. Reminders, state, and nags are appended to the conversation suffix. This is a KV-cache discipline: a stable prefix means the provider can cache it, which reduces latency and cost on long runs.
|
|
58
|
+
|
|
59
|
+
**Progressive connector disclosure**
|
|
60
|
+
Data connectors (databases, APIs, warehouses) are registered but hidden from the tool list until explicitly loaded. A shorter tool list means the model makes better routing decisions. Connectors are only visible when relevant.
|
|
61
|
+
|
|
62
|
+
**Subagent isolation**
|
|
63
|
+
Spawned subagents get a fresh adapter and a fresh cache. State is transferred explicitly via `input_handles`. No implicit shared state. This makes subagent behaviour reproducible and debuggable.
|
|
64
|
+
|
|
65
|
+
**Suffix-only nag reminders**
|
|
66
|
+
The planner escalates reminders at 4 / 8 / 12 turns without progress. These are always appended to the suffix, never inserted into the prefix, so the KV cache is never busted by reminder text.
|
|
67
|
+
|
|
68
|
+
**JSONL turn logging**
|
|
69
|
+
Every turn is logged to a `.jsonl` file from the start. Not bolted on later. Each line is a complete turn record including latency, token counts, and cache hit/miss. Reproducibility is a first-class concern.
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
## Install
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
# requires Python 3.10+ and uv
|
|
77
|
+
uv sync
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Quick start
|
|
81
|
+
|
|
82
|
+
`Agent` needs a provider adapter. The adapter is the boundary between the
|
|
83
|
+
provider SDK and the harness: it turns Anthropic/OpenAI responses into
|
|
84
|
+
`dataact`'s normalised `Message`, `ToolUseBlock`, and token-count types. It is
|
|
85
|
+
explicit on purpose so the harness is not tied to one model provider, and tests
|
|
86
|
+
can swap in `FakeAdapter` without touching the loop.
|
|
87
|
+
|
|
88
|
+
For Anthropic:
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
from dataact import Agent
|
|
92
|
+
from dataact.providers.anthropic import AnthropicAdapter
|
|
93
|
+
|
|
94
|
+
adapter = AnthropicAdapter(model="claude-sonnet-4-6")
|
|
95
|
+
agent = Agent(adapter=adapter, system="You are a data analyst.")
|
|
96
|
+
|
|
97
|
+
result = agent.run("Compute the mean of [1, 2, 3, 4, 5] and print it.")
|
|
98
|
+
print(result)
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
For OpenAI, install the optional extra and change only the adapter:
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
pip install "dataact[openai]"
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from dataact.providers.openai import OpenAIAdapter
|
|
109
|
+
|
|
110
|
+
adapter = OpenAIAdapter(model="gpt-4o-mini")
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
Run the minimal Anthropic example:
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
uv run python examples/quickstart.py
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
`examples/quickstart.py` requires `ANTHROPIC_API_KEY` when run as a script. Tests import `build_agent()` and drive it with `FakeAdapter`, so the example stays covered without token spend.
|
|
120
|
+
|
|
121
|
+
## Connector example
|
|
122
|
+
|
|
123
|
+
Connector helpers keep the quick path small while preserving progressive disclosure. Connector tools start hidden; the model must call `load_connectors` before it can use them.
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
from dataact import Agent
|
|
127
|
+
from dataact.providers.anthropic import AnthropicAdapter
|
|
128
|
+
|
|
129
|
+
adapter = AnthropicAdapter(model="claude-sonnet-4-6")
|
|
130
|
+
agent = Agent(adapter=adapter, system="You are a data analyst.")
|
|
131
|
+
|
|
132
|
+
market_data = agent.connector(
|
|
133
|
+
"market_data",
|
|
134
|
+
description="Market data tools.",
|
|
135
|
+
)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def fetch_ohlcv(symbol: str) -> list[dict]:
|
|
139
|
+
return [{"symbol": symbol, "close": 101.2}]
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
market_data.tool(
|
|
143
|
+
fetch_ohlcv,
|
|
144
|
+
description="Fetch OHLCV data for a ticker.",
|
|
145
|
+
)
|
|
146
|
+
|
|
147
|
+
result = agent.run("Load market_data and inspect AAPL.")
|
|
148
|
+
print(result)
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
## What `Agent` composes
|
|
152
|
+
|
|
153
|
+
`Agent` is a thin composition layer over the lower-level primitives:
|
|
154
|
+
|
|
155
|
+
- A provider adapter translates model-provider SDK objects into the harness's normalised response types.
|
|
156
|
+
- `Harness` owns the ReAct loop, messages, dispatch, reminders, and JSONL logging.
|
|
157
|
+
- `SessionCache` stores large values as handles plus compact snapshots.
|
|
158
|
+
- `python_interpreter` is the controlled execution surface; there is no bash tool.
|
|
159
|
+
- `list_variables` exposes cache handles without dumping raw payloads.
|
|
160
|
+
- `ConnectorRegistry` keeps connector tools hidden until loaded.
|
|
161
|
+
- `Planner` reminders and subagents are opt-in helpers, not a second runtime.
|
|
162
|
+
|
|
163
|
+
For the explicit wiring, read [examples/advanced_wiring.py](examples/advanced_wiring.py). It deliberately shows the moving parts that `Agent` composes.
|
|
164
|
+
|
|
165
|
+
Run the advanced example — it loads a checked-in FRED unemployment-rate sample, runs analysis, and uses subagents and the planner (requires `ANTHROPIC_API_KEY`):
|
|
166
|
+
|
|
167
|
+
```bash
|
|
168
|
+
uv run python examples/advanced_wiring.py
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
Run tests:
|
|
172
|
+
|
|
173
|
+
```bash
|
|
174
|
+
uv run pytest tests/ -v
|
|
175
|
+
uv run pytest tests/ -m live -v # requires provider API keys
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
## Project structure
|
|
181
|
+
|
|
182
|
+
```
|
|
183
|
+
dataact/
|
|
184
|
+
loop.py # Harness: the core ReAct loop
|
|
185
|
+
cache.py # SessionCache: handle/snapshot storage
|
|
186
|
+
providers/ # Normalised adapter interface (Anthropic and OpenAI)
|
|
187
|
+
tools/
|
|
188
|
+
interpreter.py # Sandboxed Python executor
|
|
189
|
+
connectors.py # Progressive connector registry
|
|
190
|
+
planner.py # Plan/nag tool
|
|
191
|
+
subagent.py # Isolated subagent spawning
|
|
192
|
+
variables.py # list_variables tool
|
|
193
|
+
types.py # Shared types: Message, ToolSpec, ContentBlock
|
|
194
|
+
logger.py # JSONL turn logging
|
|
195
|
+
observe.py # Latency measurement
|
|
196
|
+
examples/
|
|
197
|
+
quickstart.py # Minimal Agent path
|
|
198
|
+
advanced_wiring.py # Explicit Harness wiring
|
|
199
|
+
data/ # Small public sample data for the advanced demo
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
---
|
|
203
|
+
|
|
204
|
+
## Sandbox disclaimer
|
|
205
|
+
|
|
206
|
+
The Python interpreter uses AST checks and restricted globals to reduce accidental misuse. It is not a container sandbox and should not be treated as safe for untrusted input.
|
|
207
|
+
|
|
208
|
+
---
|
|
209
|
+
|
|
210
|
+
## License
|
|
211
|
+
|
|
212
|
+
MIT
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
dataact/__init__.py,sha256=D9fE6RPw5_QvcWm0LuGYMFt7gNwM5Q-I8zPlLOxEikY,637
|
|
2
|
+
dataact/agent.py,sha256=ozA96jxOZRuKCMTXTbsvXAhuN0YiiSj-pzAOYFaATdI,7914
|
|
3
|
+
dataact/cache.py,sha256=ompG180d4M-7IScsCNT4TYsXuUrWJfl7vsSg1qAkYos,10617
|
|
4
|
+
dataact/exceptions.py,sha256=EP1by_xnQ7R02iNv7555qgS01faUK0yukEk7TVaL2ac,501
|
|
5
|
+
dataact/format.py,sha256=vdXxU59U6mSl8V2Ez2f9KW8BZyvhfbf0A5pHxJm5ztg,3038
|
|
6
|
+
dataact/logger.py,sha256=5b2emK8Z1M4TmirVl1gM-jStQLHygNqzjD-hbMAc3bU,2092
|
|
7
|
+
dataact/loop.py,sha256=gLAiFmtDhq2YqLKvr-PNHE5_A23OuOSHiaFz0t-Zohs,5458
|
|
8
|
+
dataact/observe.py,sha256=COUTkw4gYzyIz0wNSMGsM99fHu3QXm5TY0zWS0awLjA,642
|
|
9
|
+
dataact/schema.py,sha256=Hc-NSPR5zS79M3TY6QBgj4yMJrdIMgVvRUmWF8FXm5A,2833
|
|
10
|
+
dataact/serialize.py,sha256=31JUDwmTagBsrmdp8TD44Xlemh0fF8huZbMDsE4e7Lo,3284
|
|
11
|
+
dataact/testing.py,sha256=EzLqAJrlbXcZzvyjJnJEwiHNH6CIdczgw4a0EWS2jUA,2077
|
|
12
|
+
dataact/types.py,sha256=_jQEf4HoOiVFRpKANelRw-z5K5hh4E_bAW5DSg7rQLc,1071
|
|
13
|
+
dataact/providers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
|
+
dataact/providers/anthropic.py,sha256=ReOpc25r6uZRdwOc6qxX2ab8txmi2uvTp_KBtJ3Ftvs,3976
|
|
15
|
+
dataact/providers/base.py,sha256=NvQcx_-5436u8HbzJpyve2ULZlq3ZdfYyxZlpQGOwjk,770
|
|
16
|
+
dataact/providers/openai.py,sha256=wznDOOoM1rwcbWU8AV1t-NQUChwkxpU4QpDGcoNccQg,4336
|
|
17
|
+
dataact/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
|
+
dataact/tools/connectors.py,sha256=q7teEleaZa1abXG2ZW8XL_Wt5if7OtHLI0LfH8K70Rk,4672
|
|
19
|
+
dataact/tools/interpreter.py,sha256=p1fOa4VdexXbOHDuqQk16_6hk1Ggp4Ds1bwYHZgi0Cg,5683
|
|
20
|
+
dataact/tools/planner.py,sha256=YIJsrNlCpbf0Pc7jAeLKaMuS1miOgCgVffYV8FZuIQc,3611
|
|
21
|
+
dataact/tools/subagent.py,sha256=wHZOwLwDaD9yLkNLvn20DxFrEREc2b40I_3Sb_iIrgI,7745
|
|
22
|
+
dataact/tools/variables.py,sha256=o46VupCROipKE9SSJK9ECEzJYZqEBa-qCK8WLfSD8ik,813
|
|
23
|
+
dataact-0.1.0.dist-info/METADATA,sha256=wbrOkRmnxaEcDAFztwTCGpwgjfExNEwJL-7iSXVO3rk,8076
|
|
24
|
+
dataact-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
25
|
+
dataact-0.1.0.dist-info/licenses/LICENSE,sha256=PC5ladx3ylrBqW0euBuh1mBL9D5xXdw03MpFWq0KjmM,1065
|
|
26
|
+
dataact-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Max Khor
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|