dataact 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dataact/__init__.py ADDED
@@ -0,0 +1,31 @@
1
+ from dataact.agent import Agent
2
+ from dataact.exceptions import (
3
+ MaxTurnsExceeded,
4
+ SubagentRecursionError,
5
+ ToolNotFoundError,
6
+ )
7
+ from dataact.providers.base import NormalizedResponse, ProviderAdapter, StopReason
8
+ from dataact.types import (
9
+ ContentBlock,
10
+ Message,
11
+ TextBlock,
12
+ ToolResultBlock,
13
+ ToolSpec,
14
+ ToolUseBlock,
15
+ )
16
+
17
# Public API surface. The names are alphabetical; sorted() keeps them that
# way even if the source string is edited out of order.
__all__ = sorted(
    "Agent ContentBlock MaxTurnsExceeded Message NormalizedResponse "
    "ProviderAdapter StopReason SubagentRecursionError TextBlock "
    "ToolNotFoundError ToolResultBlock ToolSpec ToolUseBlock".split()
)
dataact/agent.py ADDED
@@ -0,0 +1,237 @@
1
+ """High-level `Agent` convenience layer.
2
+
3
+ `Agent` is a thin composition over `Harness`, `SessionCache`, and the built-in
4
+ tools. It exists so the quick-start example reads cleanly. The low-level
5
+ primitives remain the canonical teaching surface — `agent.explain()` returns a
6
+ sketch of the equivalent explicit wiring.
7
+
8
+ `Agent.run()` is one-shot: each call builds a fresh `Harness` and starts with a
9
+ new message history. It is not a chat session.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from collections.abc import Callable
15
+ from dataclasses import dataclass
16
+ from pathlib import Path
17
+ from typing import Any
18
+
19
+ from dataact.cache import SessionCache
20
+ from dataact.loop import Harness
21
+ from dataact.providers.base import ProviderAdapter
22
+ from dataact.schema import infer_input_schema
23
+ from dataact.tools.connectors import ConnectorRegistry
24
+ from dataact.tools.interpreter import PythonInterpreter
25
+ from dataact.tools.planner import Planner
26
+ from dataact.tools.subagent import make_subagent_spec
27
+ from dataact.tools.variables import make_list_variables_spec
28
+ from dataact.types import ToolSpec
29
+
30
+
31
@dataclass(frozen=True)
class _ConnectorToolDefinition:
    """One tool registered under a connector via `ConnectorBuilder.tool()`.

    Collected on the owning `Agent` and turned into a namespaced
    ``<connector>__<fn name>`` `ToolSpec` in `Agent._build_tools`.
    """

    # Name of the connector this tool belongs to (matches a key in
    # Agent._connectors).
    connector_name: str
    # The Python callable invoked when the tool is used.
    fn: Callable[..., Any]
    # Model-facing description supplied by the tool author.
    description: str
    # JSON-schema dict for the tool's input (explicit or inferred).
    input_schema: dict
37
+
38
+
39
@dataclass(frozen=True)
class _ConnectorDefinition:
    """Connector metadata declared via `Agent.connector()`."""

    # Connector name; its tools are namespaced "<name>__<fn>".
    name: str
    # Model-facing description passed to the ConnectorRegistry.
    description: str
43
+
44
+
45
class ConnectorBuilder:
    """Fluent helper returned by `Agent.connector()` for attaching tools.

    Each `tool()` call records a `_ConnectorToolDefinition` on the owning
    agent and hands the function back unchanged, so registration does not
    disturb normal use of the callable.
    """

    def __init__(self, agent: Agent, name: str) -> None:
        self._agent = agent
        self._name = name

    def tool(
        self,
        fn: Callable[..., Any],
        *,
        description: str,
        input_schema: dict | None = None,
    ) -> Callable[..., Any]:
        """Register *fn* as a tool on this connector; returns *fn* unchanged.

        When *input_schema* is omitted, one is inferred from the function's
        signature via `infer_input_schema`.
        """
        if input_schema is None:
            input_schema = infer_input_schema(fn)
        definition = _ConnectorToolDefinition(
            connector_name=self._name,
            fn=fn,
            description=description,
            input_schema=input_schema,
        )
        self._agent._connector_tools.append(definition)
        return fn
67
+
68
+
69
class Agent:
    """One-shot agent facade composing `Harness`, `SessionCache`, and tools.

    Each `run()` call constructs a fresh `Harness` with a fresh tool list;
    message history does not persist between calls. The `SessionCache`
    instance, however, is reused across runs (and shared with subagents via
    `parent_cache`), so stored variables survive.
    """

    def __init__(
        self,
        adapter: ProviderAdapter,
        system: str,
        *,
        max_turns: int = 25,
        cache: SessionCache | None = None,
        run_dir: str | Path | None = None,
    ) -> None:
        self._adapter = adapter
        self._system = system
        self._max_turns = max_turns
        # A caller-supplied cache is used as-is; otherwise the agent owns a
        # private in-memory SessionCache.
        self._cache = cache if cache is not None else SessionCache()
        self._run_dir = run_dir
        self._last_harness: Harness | None = None
        self._last_run_file: str | None = None
        # Declared connectors by name, plus the flat list of tool
        # definitions appended by ConnectorBuilder.tool().
        self._connectors: dict[str, _ConnectorDefinition] = {}
        self._connector_tools: list[_ConnectorToolDefinition] = []
        self._planner_enabled = False
        self._subagent_factory: Callable[[], ProviderAdapter] | None = None

    @property
    def cache(self) -> SessionCache:
        """The session cache shared across this agent's runs."""
        return self._cache

    @property
    def last_harness(self) -> Harness | None:
        """Harness built by the most recent `run()`, or None before any run."""
        return self._last_harness

    @property
    def last_run_file(self) -> str | None:
        """Path of the newest `.jsonl` found after the last `run()`, if any."""
        return self._last_run_file

    def connector(self, name: str, *, description: str) -> ConnectorBuilder:
        """Declare a connector and return a builder for attaching its tools.

        Re-declaring an existing *name* replaces its metadata; tools already
        registered under that name are kept.
        """
        self._connectors[name] = _ConnectorDefinition(
            name=name,
            description=description,
        )
        return ConnectorBuilder(self, name)

    def enable_planner(self) -> Agent:
        """Enable the Planner tools for subsequent runs; returns self."""
        self._planner_enabled = True
        return self

    def enable_subagents(
        self, *, adapter_factory: Callable[[], ProviderAdapter]
    ) -> Agent:
        """Enable the subagent tool; *adapter_factory* builds each subagent's
        provider adapter. Returns self for chaining."""
        self._subagent_factory = adapter_factory
        return self

    def run(self, user_message: str) -> str:
        """Execute one agent loop over *user_message* and return the result.

        Builds a fresh tool list and `Harness` on every call, then records
        the harness and the newest run-log file for later inspection.
        """
        planner = Planner() if self._planner_enabled else None
        tools = self._build_tools(planner=planner)
        if self._subagent_factory is not None:
            # Subagent tooling is built planner-free; make_sub_tools rebuilds
            # the tool list against each subagent's own cache.
            subagent_parent_tools = self._build_tools(planner=None)
            effective_run_dir = (
                str(self._run_dir) if self._run_dir is not None else "./runs"
            )
            tools.append(
                make_subagent_spec(
                    adapter_factory=self._subagent_factory,
                    parent_tools=subagent_parent_tools,
                    parent_cache=self._cache,
                    run_dir=effective_run_dir,
                    make_sub_tools=lambda sub_cache: self._build_tools(
                        planner=None,
                        cache=sub_cache,
                    ),
                )
            )
        harness_kwargs: dict = {
            "adapter": self._adapter,
            "system": self._system,
            "tools": tools,
            "max_turns": self._max_turns,
            "cache": self._cache,
        }
        # Only forward run_dir when explicitly set so Harness can apply its
        # own default otherwise.
        if self._run_dir is not None:
            harness_kwargs["run_dir"] = str(self._run_dir)

        harness = Harness(**harness_kwargs)
        if planner is not None:
            harness.register_reminder(planner.reminder_hook)
        self._last_harness = harness
        result = harness.run(user_message)
        # Newest jsonl in the run dir belongs to this run
        # NOTE(review): this reaches into Harness._run_dir (private) and
        # assumes the most recently modified *.jsonl is this run's log —
        # racy if runs share a directory; confirm against Harness.
        run_dir = Path(harness._run_dir)
        files = sorted(run_dir.glob("*.jsonl"), key=lambda f: f.stat().st_mtime)
        if files:
            self._last_run_file = str(files[-1])
        return result

    def explain(self) -> str:
        """Return a sketch of the explicit wiring equivalent to this Agent."""
        return _EXPLAIN_TEMPLATE.format(
            system=_truncate(self._system),
            max_turns=self._max_turns,
            run_dir=self._run_dir if self._run_dir is not None else "./runs",
        )

    def _build_tools(
        self,
        *,
        planner: Planner | None = None,
        cache: SessionCache | None = None,
    ) -> list[ToolSpec]:
        """Assemble the ToolSpec list for one run.

        *cache* overrides the agent's own cache (used to build tool sets
        bound to a subagent's cache). Connector tools are registered under
        namespaced names ("<connector>__<fn>") with ``visible=False`` —
        presumably surfaced through the registry's load_connectors flow;
        confirm against ConnectorRegistry.
        """
        target_cache = cache if cache is not None else self._cache
        tools = [
            PythonInterpreter.make_tool_spec(target_cache),
            make_list_variables_spec(target_cache),
        ]
        if planner is not None:
            tools.extend(planner.make_tool_specs())
        if self._connectors:
            registry = ConnectorRegistry()
            for connector_name, connector in self._connectors.items():
                registry.register(
                    name=connector_name,
                    description=connector.description,
                    tools=[
                        ToolSpec(
                            name=f"{connector_name}__{definition.fn.__name__}",
                            description=definition.description,
                            input_schema=definition.input_schema,
                            handler=definition.fn,
                            visible=False,
                        )
                        for definition in self._connector_tools
                        if definition.connector_name == connector_name
                    ],
                )
            tools.append(registry.get_load_connectors_spec())
            tools.extend(registry.make_wrapped_specs(target_cache))
        return tools
203
+
204
+
205
# Template returned by Agent.explain(). Placeholders filled via .format():
# {system!r}, {max_turns}, {run_dir!r}. This is a runtime string shown to
# users — keep its text byte-stable.
_EXPLAIN_TEMPLATE = """\
Agent is a thin composition layer. The equivalent explicit wiring is:

    from dataact.cache import SessionCache
    from dataact.loop import Harness
    from dataact.tools.interpreter import PythonInterpreter
    from dataact.tools.variables import make_list_variables_spec

    cache = SessionCache()
    tools = [
        PythonInterpreter.make_tool_spec(cache),
        make_list_variables_spec(cache),
    ]
    harness = Harness(
        adapter=adapter,
        system={system!r},
        tools=tools,
        max_turns={max_turns},
        run_dir={run_dir!r},
        cache=cache,
    )
    harness.run(user_message)

Each call to Agent.run() builds a fresh Harness with fresh tool specs.
Model-visible tools include python_interpreter and list_variables.
The message history resets per run; this is not a chat session.
"""
232
+
233
+
234
+ def _truncate(text: str, limit: int = 80) -> str:
235
+ if len(text) <= limit:
236
+ return text
237
+ return text[: limit - 1] + "…"
dataact/cache.py ADDED
@@ -0,0 +1,319 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import keyword
5
+ import pickle
6
+ import re
7
+ import tempfile
8
+ from collections import OrderedDict
9
+ from dataclasses import dataclass
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+ _VALID_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*$")
14
+
15
+
16
+ def _is_valid_identifier(name: str) -> bool:
17
+ return bool(_VALID_IDENTIFIER.match(name)) and not keyword.iskeyword(name)
18
+
19
+
20
@dataclass(frozen=True)
class _ColdEntry:
    """Record of a value spilled from the hot store to disk."""

    # Location of the serialized value on disk.
    path: Path
    # Serialization format tag — "numpy_npy", "dataframe_parquet",
    # "dataframe_pickle", or "pickle"; selects the reader in _read_cold.
    storage_type: str
24
+
25
+
26
class SessionCache:
    """Named handle store with optional LRU spill-to-disk.

    Values live in an in-memory "hot" dict. When `hot_limit` is set (or a
    `storage_dir` is supplied, which implies a default limit of 10), the
    least-recently-used values are serialized to disk ("cold") and lazily
    promoted back into memory on access. Each handle also carries a small
    JSON "snapshot" summary computed at `put` time.
    """

    def __init__(
        self,
        sample_size: int = 5,
        storage_dir: str | Path | None = None,
        hot_limit: int | None = None,
    ) -> None:
        if hot_limit is not None and hot_limit < 1:
            raise ValueError("hot_limit must be at least 1")
        # Number of elements/rows/keys included in each snapshot sample.
        self.sample_size = sample_size
        self.hot_limit = hot_limit
        # Hot (in-memory) values, by handle name.
        self._store: dict[str, Any] = {}
        # Spilled values, by handle name.
        self._cold: dict[str, _ColdEntry] = {}
        # Cached JSON snapshot per handle; set on put, cleared on delete —
        # not refreshed if a stored value is later mutated in place.
        self._snapshots: dict[str, str] = {}
        # Recency order over all handles: least-recent first, newest last.
        self._recency: OrderedDict[str, None] = OrderedDict()
        self._temp_dir: tempfile.TemporaryDirectory[str] | None = None
        if storage_dir is None and hot_limit is not None:
            # Spilling requested without an explicit directory: use a
            # temporary directory cleaned up in close().
            self._temp_dir = tempfile.TemporaryDirectory(prefix="dataact-cache-")
            self._storage_dir = Path(self._temp_dir.name)
        elif storage_dir is not None:
            self._storage_dir = Path(storage_dir)
            self._storage_dir.mkdir(parents=True, exist_ok=True)
            if self.hot_limit is None:
                # Supplying storage_dir opts into disk-backed cache behaviour.
                # Keep the default bounded so a caller does not create a spill
                # directory that is never used.
                self.hot_limit = 10
        else:
            # Pure in-memory cache: _enforce_hot_limit becomes a no-op.
            self._storage_dir = None

    def put(self, name: str, value: Any, overwrite: bool = False) -> str:
        """Store *value* and return the handle name actually used.

        Raises ValueError if *name* is not a valid Python identifier. On
        collision without *overwrite*, a numeric suffix is appended
        ("name_2", "name_3", ...) until a free handle is found.
        """
        if not _is_valid_identifier(name):
            raise ValueError(
                f"Invalid handle name: {name!r}. Must be a valid Python identifier."
            )
        if overwrite or not self.has_handle(name):
            if overwrite:
                # Drop any stale spill file before the value is replaced.
                self._delete_cold(name)
            self._put_resolved(name, value)
            return name
        # Auto-suffix on collision
        suffix = 2
        while True:
            candidate = f"{name}_{suffix}"
            if not self.has_handle(candidate):
                self._put_resolved(candidate, value)
                return candidate
            suffix += 1

    def get(self, name: str) -> Any:
        """Return the value for *name*, promoting it from disk if spilled.

        Accessing a handle marks it most-recently-used. Raises KeyError for
        unknown handles.
        """
        if name in self._store:
            self._mark_recent(name)
            return self._store[name]
        if name in self._cold:
            # Promote: read from disk, remove the spill file, re-insert as
            # hot, then re-apply the limit (may spill a colder handle).
            value = self._read_cold(name)
            self._delete_cold(name)
            self._store[name] = value
            self._mark_recent(name)
            self._enforce_hot_limit()
            return value
        raise KeyError(name)

    def snapshot(self, handle: str) -> str:
        """Return the cached JSON summary for *handle*, computing it once.

        NOTE(review): snapshots are populated at put time and never
        invalidated on in-place mutation, so they can go stale.
        """
        if handle in self._snapshots:
            return self._snapshots[handle]
        value = self.get(handle)
        snapshot = self._make_snapshot(value)
        self._snapshots[handle] = snapshot
        return snapshot

    def list_handles(self) -> dict[str, str]:
        """Map every handle name to its JSON snapshot."""
        return {name: self.snapshot(name) for name in self.handle_names()}

    def handle_names(self) -> list[str]:
        """All handle names (hot and cold), least-recently-used first."""
        return list(self._recency.keys())

    def has_handle(self, name: str) -> bool:
        """True if *name* exists in either the hot store or on disk."""
        return name in self._store or name in self._cold

    def items(self):
        """Yield (name, value) pairs; promotes cold entries via get()."""
        for name in self.handle_names():
            yield name, self.get(name)

    def storage_metadata(
        self, *, include_paths: bool = False
    ) -> dict[str, dict[str, str]]:
        """Describe where each handle currently lives.

        Returns per-handle dicts with "location" ("memory"/"disk") and
        "storage_type"; disk entries also get "path" when *include_paths*.
        """
        metadata = {}
        for name in self.handle_names():
            if name in self._cold:
                entry = self._cold[name]
                metadata[name] = {
                    "location": "disk",
                    "storage_type": entry.storage_type,
                }
                if include_paths:
                    metadata[name]["path"] = str(entry.path)
            else:
                metadata[name] = {"location": "memory", "storage_type": "hot"}
        return metadata

    def delete(self, name: str) -> None:
        """Remove *name* everywhere (memory, disk, snapshot, recency).

        Raises KeyError if the handle does not exist.
        """
        if not self.has_handle(name):
            raise KeyError(name)
        self._store.pop(name, None)
        self._delete_cold(name)
        self._snapshots.pop(name, None)
        self._recency.pop(name, None)

    def close(self) -> None:
        """Release the temporary spill directory, if one was created."""
        if self._temp_dir is not None:
            self._temp_dir.cleanup()
            self._temp_dir = None

    def __del__(self) -> None:
        # Best-effort cleanup if the caller forgot to close().
        self.close()

    def _put_resolved(self, name: str, value: Any) -> None:
        # Store under a name already known to be free (or to be clobbered),
        # snapshot eagerly, then apply the hot limit.
        self._store[name] = value
        self._snapshots[name] = self._make_snapshot(value)
        self._mark_recent(name)
        self._enforce_hot_limit()

    def _mark_recent(self, name: str) -> None:
        # Insert if absent, then move to the most-recent end.
        self._recency[name] = None
        self._recency.move_to_end(name)

    def _enforce_hot_limit(self) -> None:
        """Spill least-recently-used hot values until the limit holds."""
        if self._storage_dir is None or self.hot_limit is None:
            return
        while len(self._store) > self.hot_limit:
            # _recency also lists cold handles; find the oldest hot one.
            for candidate in self._recency:
                if candidate in self._store:
                    self._spill(candidate)
                    break
            else:
                break

    def _spill(self, name: str) -> None:
        # Move one value from the hot store to disk.
        value = self._store.pop(name)
        self._cold[name] = self._write_cold(name, value)

    def _write_cold(self, name: str, value: Any) -> _ColdEntry:
        """Serialize *value* to disk, choosing the best available format.

        Preference order: .npy for non-object ndarrays, parquet for
        DataFrames (pickle fallback), generic pickle otherwise. numpy and
        pandas are imported lazily so neither is a hard dependency.
        """
        if self._storage_dir is None:
            raise RuntimeError("storage_dir is required for disk-backed cache")

        try:
            import numpy as np

            if isinstance(value, np.ndarray) and value.dtype != object:
                path = self._storage_dir / f"{name}.npy"
                with path.open("wb") as fh:
                    np.save(fh, value, allow_pickle=False)
                return _ColdEntry(path=path, storage_type="numpy_npy")
        except ImportError:
            pass

        try:
            import pandas as pd

            if isinstance(value, pd.DataFrame):
                parquet_path = self._storage_dir / f"{name}.parquet"
                try:
                    value.to_parquet(parquet_path, index=False)
                    return _ColdEntry(
                        path=parquet_path,
                        storage_type="dataframe_parquet",
                    )
                except (ImportError, TypeError, ValueError):
                    # Parquet is the preferred teaching path, but pyarrow /
                    # fastparquet are not core dependencies for this reference
                    # implementation. Fall back explicitly rather than adding a
                    # heavy storage dependency to the default install.
                    pass

                path = self._storage_dir / f"{name}.pkl"
                value.to_pickle(path)
                return _ColdEntry(path=path, storage_type="dataframe_pickle")
        except ImportError:
            pass

        path = self._storage_dir / f"{name}.pickle"
        with path.open("wb") as fh:
            pickle.dump(value, fh, protocol=pickle.HIGHEST_PROTOCOL)
        return _ColdEntry(path=path, storage_type="pickle")

    def _read_cold(self, name: str) -> Any:
        """Deserialize a spilled value using its storage_type tag."""
        entry = self._cold[name]
        if entry.storage_type == "numpy_npy":
            import numpy as np

            with entry.path.open("rb") as fh:
                return np.load(fh, allow_pickle=False)
        if entry.storage_type == "dataframe_parquet":
            import pandas as pd

            return pd.read_parquet(entry.path)
        # Also accepts the legacy "pandas_pickle" tag.
        if entry.storage_type in {"dataframe_pickle", "pandas_pickle"}:
            import pandas as pd

            return pd.read_pickle(entry.path)
        with entry.path.open("rb") as fh:
            return pickle.load(fh)

    def _delete_cold(self, name: str) -> None:
        # Forget the cold entry and remove its file; a missing file is fine.
        entry = self._cold.pop(name, None)
        if entry is not None:
            try:
                entry.path.unlink()
            except FileNotFoundError:
                pass

    def _make_snapshot(self, value: Any) -> str:
        """Build a small JSON (or repr) summary of *value* by type."""
        try:
            import pandas as pd

            if isinstance(value, pd.DataFrame):
                return self._snapshot_dataframe(value)
        except ImportError:
            pass

        try:
            import numpy as np

            if isinstance(value, np.ndarray):
                return self._snapshot_ndarray(value)
        except ImportError:
            pass

        if isinstance(value, list):
            return self._snapshot_list(value)
        if isinstance(value, dict):
            return self._snapshot_dict(value)
        # Scalar
        return f"value: {value!r}"

    def _snapshot_dataframe(self, df) -> str:
        # Shape, columns, and the first sample_size rows as records.
        cols = list(df.columns)
        shape = list(df.shape)
        sample = df.head(self.sample_size).to_dict(orient="records")
        return json.dumps(
            {
                "type": "dataframe",
                "shape": shape,
                "columns": cols,
                "sample": sample,
            },
            default=str,
        )

    def _snapshot_ndarray(self, arr) -> str:
        # First sample_size elements in flat (row-major) order, converted
        # to native Python scalars where possible.
        flat = arr.flat
        sample = [
            x.item() if hasattr(x, "item") else x
            for _, x in zip(range(self.sample_size), flat)
        ]
        return json.dumps(
            {
                "type": "ndarray",
                "shape": list(arr.shape),
                "dtype": str(arr.dtype),
                "sample": sample,
            }
        )

    def _snapshot_list(self, lst: list) -> str:
        sample = lst[: self.sample_size]
        # Round-trip through JSON so non-serializable samples degrade to a
        # repr string instead of raising.
        try:
            sample_json = json.dumps(sample)
            sample_value = json.loads(sample_json)
        except Exception:
            sample_value = repr(sample)
        return json.dumps(
            {
                "type": "list",
                "length": len(lst),
                "sample": sample_value,
            }
        )

    def _snapshot_dict(self, d: dict) -> str:
        keys = list(d.keys())[: self.sample_size]
        sample = {k: d[k] for k in keys}
        # default=repr keeps odd value types serializable; the outer except
        # guards against unserializable keys.
        try:
            sample_str = json.dumps(sample, default=repr)
        except Exception:
            sample_str = repr(sample)
        return json.dumps(
            {
                "type": "dict",
                "total_keys": len(d),
                "sample_keys": keys,
                "sample": json.loads(sample_str),
            }
        )
dataact/exceptions.py ADDED
@@ -0,0 +1,21 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ if TYPE_CHECKING:
6
+ from dataact.providers.base import NormalizedResponse
7
+
8
+
9
class MaxTurnsExceeded(RuntimeError):
    """Raised when the agent loop exhausts its turn budget.

    Carries the number of turns consumed and, when available, the last
    provider response seen before the limit was hit.
    """

    def __init__(self, turns: int, last_response: "NormalizedResponse | None" = None):
        super().__init__(f"Max turns exceeded: {turns}")
        self.turns = turns
        self.last_response = last_response
14
+
15
+
16
class ToolNotFoundError(KeyError):
    """Raised when a requested tool name has no registered handler.

    Subclasses KeyError, so lookup-style ``except KeyError`` also catches
    it. NOTE(review): KeyError's str() wraps the message in quotes — use
    repr/args when formatting.
    """

    pass
18
+
19
+
20
class SubagentRecursionError(RuntimeError):
    """Raised when a subagent attempts to spawn further subagents —
    presumably to keep delegation one level deep; confirm against the
    subagent tool implementation."""

    pass