@ekairos/dataset 1.22.55-beta.development.0 → 1.22.57-beta.development.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -49,4 +49,9 @@ export declare function didCompleteDatasetSucceed(event: {
49
49
  parts?: any[];
50
50
  };
51
51
  }): boolean;
52
+ export declare function getDatasetFatalFailure(event: {
53
+ content?: {
54
+ parts?: any[];
55
+ };
56
+ }): string | null;
52
57
  export {};
@@ -1,6 +1,6 @@
1
1
  import { tool } from "ai";
2
2
  import { z } from "zod";
3
- import { readDatasetSandboxFileStep, runDatasetSandboxCommandStep } from "./sandbox/steps.js";
3
+ import { readDatasetSandboxFileStep, readDatasetSandboxTextFileStep, runDatasetSandboxCommandStep } from "./sandbox/steps.js";
4
4
  import Ajv from "ajv";
5
5
  import { getDatasetOutputPath, } from "./datasetFiles.js";
6
6
  import { datasetGetByIdStep, datasetUpdateStatusStep, datasetUploadOutputFileStep } from "./dataset/steps.js";
@@ -113,9 +113,12 @@ export function createCompleteDatasetTool({ datasetId, sandboxId, runtime }) {
113
113
  message: "Empty file content",
114
114
  };
115
115
  }
116
- const fileBuffer = Buffer.from(fileRead.contentBase64, "base64");
117
116
  console.log(`[Dataset ${datasetId}] Uploading file to InstantDB storage`);
118
- const uploadResult = await datasetUploadOutputFileStep({ runtime, datasetId, fileBuffer });
117
+ const uploadResult = await datasetUploadOutputFileStep({
118
+ runtime,
119
+ datasetId,
120
+ contentBase64: fileRead.contentBase64,
121
+ });
119
122
  if (!uploadResult.ok) {
120
123
  console.error(`[Dataset ${datasetId}] File upload failed: ${uploadResult.error}`);
121
124
  return {
@@ -176,6 +179,31 @@ export function didCompleteDatasetSucceed(event) {
176
179
  return false;
177
180
  });
178
181
  }
182
/**
 * Finds the first action/tool output in a reaction event that is flagged as fatal.
 *
 * Two part shapes are recognised:
 * - `{ type: "action", content: { actionName, output } }`
 * - `{ type: "tool-<name>", output }` (falling back to `result` when `output` is nullish)
 *
 * @param event Event whose `content.parts` is inspected; anything other than an
 *   array there is treated as "no parts".
 * @returns `"<actionName>: <message>"` for the first part whose output has
 *   `fatal === true` (just `<message>` when no action name is available),
 *   or `null` when no part is fatal.
 */
export function getDatasetFatalFailure(event) {
    const TOOL_PREFIX = "tool-";
    const candidateParts = event?.content?.parts;
    if (!Array.isArray(candidateParts)) {
        return null;
    }
    // Yields the trimmed text for strings and "" for everything else, so the
    // fallbacks below can be chained with `||`.
    const trimmedText = (value) => (typeof value === "string" ? value.trim() : "");
    for (const entry of candidateParts) {
        const kind = entry?.type;
        let label;
        let payload;
        if (kind === "action") {
            label = entry.content?.actionName;
            payload = entry.content?.output;
        }
        else if (typeof kind === "string" && kind.startsWith(TOOL_PREFIX)) {
            label = kind.slice(TOOL_PREFIX.length);
            payload = entry.output ?? entry.result;
        }
        // Only an explicit boolean `true` marks the output as fatal.
        if (payload && payload.fatal === true) {
            const reason = trimmedText(payload.error) ||
                trimmedText(payload.message) ||
                "Dataset action failed fatally";
            return label ? `${label}: ${reason}` : reason;
        }
    }
    return null;
}
179
207
  async function ensureFileExists(runtime, sandboxId, path) {
180
208
  const result = await runDatasetSandboxCommandStep({
181
209
  runtime,
@@ -192,8 +220,8 @@ async function validateJsonlRows({ runtime, sandboxId, outputPath, validator, da
192
220
  let validRowCount = 0;
193
221
  let rowRecordCount = 0;
194
222
  console.log(`[Dataset ${datasetId}] Reading and validating JSONL file from sandbox`);
195
- const fileRead = await readDatasetSandboxFileStep({ runtime, sandboxId, path: outputPath });
196
- if (!fileRead.contentBase64) {
223
+ const fileRead = await readDatasetSandboxTextFileStep({ runtime, sandboxId, path: outputPath });
224
+ if (!fileRead.content) {
197
225
  console.log(`[Dataset ${datasetId}] Empty output file`);
198
226
  return {
199
227
  success: false,
@@ -205,8 +233,7 @@ async function validateJsonlRows({ runtime, sandboxId, outputPath, validator, da
205
233
  message: "output.jsonl is empty",
206
234
  };
207
235
  }
208
- const fileContent = Buffer.from(fileRead.contentBase64, "base64").toString();
209
- const lines = fileContent.split("\n");
236
+ const lines = fileRead.content.split("\n");
210
237
  console.log(`[Dataset ${datasetId}] Validating ${lines.length} lines`);
211
238
  for (let index = 0; index < lines.length; index++) {
212
239
  const line = lines[index];
@@ -18,7 +18,7 @@ export declare function datasetUpdateSchemaStep(params: {
18
18
  export declare function datasetUploadOutputFileStep(params: {
19
19
  runtime: any;
20
20
  datasetId: string;
21
- fileBuffer: Buffer;
21
+ contentBase64: string;
22
22
  }): Promise<import("../service.js").ServiceResult<{
23
23
  fileId: string;
24
24
  storagePath: string;
@@ -60,7 +60,7 @@ export async function datasetUploadOutputFileStep(params) {
60
60
  const service = new DatasetService(db);
61
61
  return await service.uploadDatasetOutputFile({
62
62
  datasetId: params.datasetId,
63
- fileBuffer: params.fileBuffer,
63
+ fileBuffer: Buffer.from(params.contentBase64, "base64"),
64
64
  });
65
65
  }
66
66
  export async function datasetUpdateStatusStep(params) {
package/dist/dataset.js CHANGED
@@ -1,6 +1,6 @@
1
1
  import { id as newId } from "@instantdb/admin";
2
2
  import { buildObjectOutputInstructions } from "./builder/instructions.js";
3
- import { getDatasetAgentMaterializers } from "./builder/agentMaterializers.js";
3
+ import { materializeDerivedDataset, materializeSingleFileLikeSource, } from "./builder/materialize.js";
4
4
  import { materializeQuerySource } from "./builder/materializeQuery.js";
5
5
  import { finalizeBuildResult } from "./builder/persistence.js";
6
6
  export function dataset(runtime, options = {}) {
@@ -131,13 +131,13 @@ export function dataset(runtime, options = {}) {
131
131
  if (!effectiveState.reactor) {
132
132
  throw new Error("dataset_reactor_required");
133
133
  }
134
- await getDatasetAgentMaterializers().materializeSingleFileLikeSource(effectiveState, onlySource, targetDatasetId);
134
+ await materializeSingleFileLikeSource(effectiveState, onlySource, targetDatasetId);
135
135
  return finalizeOutputResult(await finalizeBuildResult(effectiveState.runtime, targetDatasetId, effectiveState.first), effectiveState.output);
136
136
  }
137
137
  if (!effectiveState.reactor) {
138
138
  throw new Error("dataset_reactor_required");
139
139
  }
140
- await getDatasetAgentMaterializers().materializeDerivedDataset(effectiveState, targetDatasetId);
140
+ await materializeDerivedDataset(effectiveState, targetDatasetId);
141
141
  return finalizeOutputResult(await finalizeBuildResult(effectiveState.runtime, targetDatasetId, effectiveState.first), effectiveState.output);
142
142
  },
143
143
  };
@@ -7,6 +7,20 @@ export declare function createExecuteCommandTool({ datasetId, sandboxId, runtime
7
7
  pythonCode: string;
8
8
  scriptName: string;
9
9
  }, {
10
+ success: boolean;
11
+ fatal: boolean;
12
+ status: string;
13
+ error: string;
14
+ stdout: string;
15
+ stderr: string;
16
+ exitCode: number;
17
+ scriptPath: string;
18
+ stdoutTruncated: boolean;
19
+ stderrTruncated: boolean;
20
+ stdoutOriginalLength: number;
21
+ stderrOriginalLength: number;
22
+ message?: undefined;
23
+ } | {
10
24
  success: boolean;
11
25
  exitCode: number;
12
26
  stdout: string;
@@ -17,6 +31,8 @@ export declare function createExecuteCommandTool({ datasetId, sandboxId, runtime
17
31
  stderrTruncated: boolean;
18
32
  stdoutOriginalLength: number;
19
33
  stderrOriginalLength: number;
34
+ fatal?: undefined;
35
+ status?: undefined;
20
36
  message?: undefined;
21
37
  } | {
22
38
  success: boolean;
@@ -29,6 +45,8 @@ export declare function createExecuteCommandTool({ datasetId, sandboxId, runtime
29
45
  stderrTruncated: boolean;
30
46
  stdoutOriginalLength: number;
31
47
  stderrOriginalLength: number;
48
+ fatal?: undefined;
49
+ status?: undefined;
32
50
  error?: undefined;
33
51
  }>;
34
52
  export {};
@@ -1,39 +1,81 @@
1
1
  import { tool } from "ai";
2
2
  import { z } from "zod";
3
- import { runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep } from "./sandbox/steps.js";
3
+ import { runDatasetSandboxCommandStep, writeDatasetSandboxTextFilesStep } from "./sandbox/steps.js";
4
4
  import { getDatasetWorkstation } from "./datasetFiles.js";
5
5
  // To keep responses predictable for big data scenarios, we cap stdout/stderr.
6
6
  // The tool's return payload exposes stdout (capped) plus the on-disk script path.
7
7
  const MAX_STDOUT_CHARS = 20000;
8
8
  const MAX_STDERR_CHARS = 5000;
9
/**
 * Reduces an arbitrary script name to a short token made only of
 * ASCII letters, digits, `_`, `.` and `-`.
 *
 * Steps: stringify (nullish becomes ""), trim, replace every other character
 * with `_`, collapse runs of `_`, and cap the result at 80 characters.
 *
 * @param scriptName Name supplied by the caller; may be nullish.
 * @returns The normalized name, or `"script"` when nothing is left.
 */
function normalizeScriptName(scriptName) {
    const MAX_NAME_LENGTH = 80;
    const trimmed = String(scriptName ?? "").trim();
    // `\w` without the `u`/`i` flags is exactly [A-Za-z0-9_].
    const underscored = trimmed.replace(/[^\w.-]/g, "_");
    const collapsed = underscored.replace(/_{2,}/g, "_");
    const bounded = collapsed.substring(0, MAX_NAME_LENGTH);
    if (bounded.length === 0) {
        return "script";
    }
    return bounded;
}
17
/**
 * Deterministic 32-bit FNV-1a style hash of a string, rendered in base 36.
 *
 * The hash is folded over the string's UTF-16 code units (`charCodeAt`), so
 * equal strings always produce equal output.
 *
 * @param value String to hash.
 * @returns The unsigned 32-bit hash as a base-36 string.
 */
function stableScriptHash(value) {
    const OFFSET_BASIS = 0x811c9dc5; // 2166136261
    const PRIME = 0x01000193; // 16777619
    let state = OFFSET_BASIS;
    let cursor = 0;
    while (cursor < value.length) {
        // XOR the code unit in first, then multiply with 32-bit wraparound.
        state = Math.imul(state ^ value.charCodeAt(cursor), PRIME);
        cursor += 1;
    }
    // `>>> 0` reinterprets the signed 32-bit state as unsigned before formatting.
    return (state >>> 0).toString(36);
}
9
25
  export function createExecuteCommandTool({ datasetId, sandboxId, runtime }) {
10
26
  return tool({
11
27
  description: "Execute Python scripts in the sandbox. Always saves script to a file before executing. The tool's output is EXACTLY the script's stdout and includes the script file path for traceability. CRITICAL: Print concise, human-readable summaries only; do NOT print raw large data. For big results, write artifacts to files in the workstation and print their file paths. Always include progress/result prints (e.g., 'Processing file X...', 'Found Y records', 'Generated output.csv').",
12
28
  inputSchema: z.object({
13
29
  pythonCode: z.string().describe("Python code to execute. Saved to a file before running. MANDATORY: Use print() to report progress and final results. Keep prints concise; avoid dumping rows/JSON. For large outputs, write to files in the workstation directory and print only file paths and brief summaries."),
14
- scriptName: z.string().describe("Name for the script file in snake_case (e.g., 'inspect_file', 'parse_csv', 'generate_dataset'). A UUID will be appended automatically."),
30
+ scriptName: z.string().describe("Name for the script file in snake_case (e.g., 'inspect_file', 'parse_csv', 'generate_dataset'). A deterministic suffix will be appended automatically."),
15
31
  }),
16
32
  execute: async ({ pythonCode, scriptName }) => {
17
- const uuid = `${Date.now()}-${Math.random().toString(36).substring(2, 9)}`;
18
33
  const workstation = getDatasetWorkstation(datasetId);
19
- const scriptFile = `${workstation}/${scriptName}-${uuid}.py`;
34
+ const normalizedScriptName = normalizeScriptName(scriptName);
35
+ const scriptHash = stableScriptHash(`${normalizedScriptName}\0${pythonCode}`);
36
+ const scriptFile = `${workstation}/${normalizedScriptName}-${scriptHash}.py`;
20
37
  console.log(`[Dataset ${datasetId}] ========================================`);
21
38
  console.log(`[Dataset ${datasetId}] Tool: executeCommand`);
22
- console.log(`[Dataset ${datasetId}] Script: ${scriptName}`);
39
+ console.log(`[Dataset ${datasetId}] Script: ${normalizedScriptName}`);
23
40
  console.log(`[Dataset ${datasetId}] File: ${scriptFile}`);
24
41
  console.log(`[Dataset ${datasetId}] Code length: ${pythonCode.length} chars`);
25
42
  console.log(`[Dataset ${datasetId}] ========================================`);
26
43
  try {
27
- await writeDatasetSandboxFilesStep({
44
+ await writeDatasetSandboxTextFilesStep({
28
45
  runtime,
29
46
  sandboxId,
30
47
  files: [
31
48
  {
32
49
  path: scriptFile,
33
- contentBase64: Buffer.from(pythonCode, "utf-8").toString("base64"),
50
+ content: pythonCode,
34
51
  },
35
52
  ],
36
53
  });
54
+ const written = await runDatasetSandboxCommandStep({
55
+ runtime,
56
+ sandboxId,
57
+ cmd: "test",
58
+ args: ["-f", scriptFile],
59
+ });
60
+ if (written.exitCode !== 0) {
61
+ const error = `Script write verification failed: ${scriptFile}`;
62
+ console.error(`[Dataset ${datasetId}] ${error}`);
63
+ console.error(`[Dataset ${datasetId}] ========================================`);
64
+ return {
65
+ success: false,
66
+ fatal: true,
67
+ status: "script_write_failed",
68
+ error,
69
+ stdout: written.stdout || "",
70
+ stderr: written.stderr || "",
71
+ exitCode: written.exitCode,
72
+ scriptPath: scriptFile,
73
+ stdoutTruncated: false,
74
+ stderrTruncated: false,
75
+ stdoutOriginalLength: 0,
76
+ stderrOriginalLength: 0,
77
+ };
78
+ }
37
79
  console.log(`[Dataset ${datasetId}] Script written to: ${scriptFile}`);
38
80
  console.log(`[Dataset ${datasetId}] Executing: python ${scriptFile}`);
39
81
  const result = await runDatasetSandboxCommandStep({
@@ -1,59 +1,6 @@
1
- import { createContext, type ContextReactor } from "@ekairos/events";
2
- import { FilePreviewContext } from "./filepreview.js";
3
- export type FileParseContext = {
4
- datasetId: string;
5
- fileId: string;
6
- instructions: string;
7
- sandboxConfig: {
8
- filePath: string;
9
- };
10
- analysis: any[];
11
- schema: any | null;
12
- plan: any | null;
13
- executionResult: any | null;
14
- errors: string[];
15
- iterationCount: number;
16
- filePreview?: FilePreviewContext;
17
- };
18
- export type FileParseContextParams = {
19
- fileId?: string;
20
- instructions?: string;
21
- sandboxId?: string;
22
- datasetId?: string;
23
- model?: string;
24
- reactor?: ContextReactor<any, any>;
25
- };
26
- export type FileParseRunOptions = {
27
- prompt?: string;
28
- durable?: boolean;
29
- };
30
- export type FileParseContextBuilder<Env extends {
31
- orgId: string;
32
- }> = {
33
- datasetId: string;
34
- context: ReturnType<ReturnType<typeof createContext<Env>>["context"]> extends any ? any : any;
35
- };
36
- export type DatasetResult = {
37
- id: string;
38
- status?: string;
39
- title?: string;
40
- schema?: any;
41
- analysis?: any;
42
- calculatedTotalRows?: number;
43
- actualGeneratedRowCount?: number;
44
- createdAt?: number;
45
- updatedAt?: number;
46
- };
47
- /**
48
- * Factory (DX-first):
49
- *
50
- * Usage:
51
- * const { datasetId } = await createFileParseContext(fileId, { instructions }).parse(runtime)
52
- *
53
- * - Uses the caller runtime; no secondary runtime is created.
54
- * - All I/O happens in `"use step"` functions via the provided Ekairos runtime.
55
- * - `parse()` is the entrypoint; it calls `context.react(...)` internally.
56
- */
1
+ import { type ContextReactor } from "@ekairos/events";
2
+ import type { FileParseRunOptions } from "./file-dataset.types.js";
3
+ export type { DatasetResult, FileParseContext, FileParseContextBuilder, FileParseContextParams, FileParseRunOptions, SandboxState, } from "./file-dataset.types.js";
57
4
  export declare function createFileParseContext<Env extends {
58
5
  orgId: string;
59
6
  }>(fileId: string, opts?: {
@@ -1,15 +1,11 @@
1
- import { createContext, INPUT_TEXT_ITEM_TYPE, WEB_CHANNEL } from "@ekairos/events";
2
- import { runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep } from "../sandbox/steps.js";
3
- import { createGenerateSchemaTool } from "./generateSchema.tool.js";
4
- import { createCompleteDatasetTool, didCompleteDatasetSucceed } from "../completeDataset.tool.js";
5
- import { createExecuteCommandTool } from "../executeCommand.tool.js";
6
- import { createClearDatasetTool } from "../clearDataset.tool.js";
7
- import { buildFileDatasetPrompt } from "./prompts.js";
8
- import { generateFilePreview, ensurePreviewScriptsAvailable } from "./filepreview.js";
1
+ import { createContext, INPUT_TEXT_ITEM_TYPE, WEB_CHANNEL, } from "@ekairos/events";
9
2
  import { id } from "@instantdb/admin";
10
- import { getDatasetWorkstation } from "../datasetFiles.js";
11
- import { readInstantFileStep } from "./steps.js";
3
+ import { createClearDatasetTool } from "../clearDataset.tool.js";
4
+ import { createCompleteDatasetTool, didCompleteDatasetSucceed, getDatasetFatalFailure, } from "../completeDataset.tool.js";
12
5
  import { datasetGetByIdStep } from "../dataset/steps.js";
6
+ import { createExecuteCommandTool } from "../executeCommand.tool.js";
7
+ import { createGenerateSchemaTool } from "./generateSchema.tool.js";
8
+ import { buildFileDatasetPromptStep, generateFileParsePreviewStep, initializeFileParseSandboxStep, } from "./file-dataset.steps.js";
13
9
  async function awaitContextRun(run) {
14
10
  if (!run)
15
11
  return;
@@ -19,63 +15,6 @@ async function awaitContextRun(run) {
19
15
  }
20
16
  await run;
21
17
  }
22
- async function initializeSandbox(runtime, sandboxId, datasetId, fileId, state) {
23
- "use step";
24
- if (state.initialized) {
25
- return state.filePath;
26
- }
27
- console.log(`[FileParseContext ${datasetId}] Initializing sandbox...`);
28
- await ensurePreviewScriptsAvailable(runtime, sandboxId);
29
- console.log(`[FileParseContext ${datasetId}] Installing Python dependencies...`);
30
- const pipInstall = await runDatasetSandboxCommandStep({
31
- runtime,
32
- sandboxId,
33
- cmd: "python",
34
- args: ["-m", "pip", "install", "pandas", "openpyxl", "--quiet", "--upgrade"],
35
- });
36
- const installStderr = pipInstall.stderr;
37
- if (installStderr && (installStderr.includes("ERROR") || installStderr.includes("FAILED"))) {
38
- throw new Error(`pip install failed: ${installStderr.substring(0, 300)}`);
39
- }
40
- console.log(`[FileParseContext ${datasetId}] Fetching file from InstantDB...`);
41
- const file = await readInstantFileStep({ runtime, fileId });
42
- console.log(`[FileParseContext ${datasetId}] Creating dataset workstation...`);
43
- const workstation = getDatasetWorkstation(datasetId);
44
- await runDatasetSandboxCommandStep({
45
- runtime,
46
- sandboxId,
47
- cmd: "mkdir",
48
- args: ["-p", workstation],
49
- });
50
- const fileName = file.contentDisposition ?? "";
51
- const fileExtension = fileName.includes(".") ? fileName.substring(fileName.lastIndexOf(".")) : "";
52
- const sandboxFilePath = `${workstation}/${fileId}${fileExtension}`;
53
- await writeDatasetSandboxFilesStep({
54
- runtime,
55
- sandboxId,
56
- files: [
57
- {
58
- path: sandboxFilePath,
59
- contentBase64: file.contentBase64,
60
- },
61
- ],
62
- });
63
- console.log(`[FileParseContext ${datasetId}] ✅ Workstation created: ${workstation}`);
64
- console.log(`[FileParseContext ${datasetId}] ✅ File saved: ${sandboxFilePath}`);
65
- state.filePath = sandboxFilePath;
66
- state.initialized = true;
67
- return sandboxFilePath;
68
- }
69
- /**
70
- * FileParseContext
71
- *
72
- * Uso:
73
- * - Crear una instancia con `fileId`, `instructions` y un `sandbox`
74
- * - Llamar `getDataset()` para crear un dataset nuevo (crea un datasetId interno)
75
- * - Llamar `followUp(datasetId, feedback)` para iterar el mismo dataset con feedback
76
- *
77
- * Internamente corre un Context (`createContext("file.parse")`) que itera hasta que se ejecuta el tool `completeDataset`.
78
- */
79
18
  function createFileParseContextDefinition(params) {
80
19
  const fallbackDatasetId = params.datasetId;
81
20
  const model = params.model ?? "openai/gpt-5";
@@ -96,18 +35,31 @@ function createFileParseContextDefinition(params) {
96
35
  if (!sandboxId) {
97
36
  throw new Error("dataset_sandbox_required");
98
37
  }
99
- const sandboxFilePath = await initializeSandbox(runtime, sandboxId, datasetId, fileId, sandboxState);
38
+ const initialized = await initializeFileParseSandboxStep({
39
+ runtime,
40
+ sandboxId,
41
+ datasetId,
42
+ fileId,
43
+ state: sandboxState,
44
+ });
45
+ const sandboxFilePath = initialized.filePath;
100
46
  let filePreview = undefined;
101
47
  try {
102
- filePreview = await generateFilePreview(runtime, sandboxId, sandboxFilePath, datasetId);
48
+ filePreview = await generateFileParsePreviewStep({
49
+ runtime,
50
+ sandboxId,
51
+ sandboxFilePath,
52
+ datasetId,
53
+ });
103
54
  }
104
55
  catch {
105
- // optional
56
+ // Preview is optional; parsing can still proceed from the file path.
106
57
  }
107
58
  let schema = null;
108
59
  const datasetResult = await datasetGetByIdStep({ runtime, datasetId });
109
- if (datasetResult.ok && datasetResult.data.schema)
60
+ if (datasetResult.ok && datasetResult.data.schema) {
110
61
  schema = datasetResult.data.schema;
62
+ }
111
63
  const ctx = {
112
64
  datasetId,
113
65
  fileId,
@@ -127,13 +79,13 @@ function createFileParseContextDefinition(params) {
127
79
  fileId,
128
80
  instructions,
129
81
  sandboxId,
130
- sandboxState,
82
+ sandboxState: initialized.state,
131
83
  ctx,
132
84
  };
133
85
  })
134
86
  .narrative(async (stored) => {
135
87
  const ctx = stored?.content?.ctx;
136
- const base = buildFileDatasetPrompt(ctx);
88
+ const base = await buildFileDatasetPromptStep({ context: ctx });
137
89
  const userInstructions = String(ctx?.instructions ?? "").trim();
138
90
  if (!userInstructions)
139
91
  return base;
@@ -184,6 +136,10 @@ function createFileParseContextDefinition(params) {
184
136
  return actions;
185
137
  })
186
138
  .shouldContinue(({ reactionEvent }) => {
139
+ const fatalFailure = getDatasetFatalFailure(reactionEvent);
140
+ if (fatalFailure) {
141
+ throw new Error(fatalFailure);
142
+ }
187
143
  return !didCompleteDatasetSucceed(reactionEvent);
188
144
  });
189
145
  if (params.reactor) {
@@ -195,16 +151,6 @@ function createFileParseContextDefinition(params) {
195
151
  const context = contextBuilder.build();
196
152
  return { datasetId: fallbackDatasetId ?? "", context };
197
153
  }
198
- /**
199
- * Factory (DX-first):
200
- *
201
- * Usage:
202
- * const { datasetId } = await createFileParseContext(fileId, { instructions }).parse(runtime)
203
- *
204
- * - Uses the caller runtime; no secondary runtime is created.
205
- * - All I/O happens in `"use step"` functions via the provided Ekairos runtime.
206
- * - `parse()` is the entrypoint; it calls `context.react(...)` internally.
207
- */
208
154
  export function createFileParseContext(fileId, opts) {
209
155
  const datasetId = opts?.datasetId ?? id();
210
156
  const params = {
@@ -225,14 +171,25 @@ export function createFileParseContext(fileId, opts) {
225
171
  channel: WEB_CHANNEL,
226
172
  createdAt: new Date().toISOString(),
227
173
  content: {
228
- parts: [{ type: "text", text: options.prompt ?? "generate a dataset for this file" }],
174
+ parts: [
175
+ {
176
+ type: "text",
177
+ text: options.prompt ?? "generate a dataset for this file",
178
+ },
179
+ ],
229
180
  },
230
181
  };
231
182
  const shell = await context.react(triggerEvent, {
232
183
  runtime: runtime,
233
184
  context: { key: `dataset:${datasetId}` },
234
185
  durable: options.durable ?? false,
235
- options: { silent: true, preventClose: true, sendFinish: false, maxIterations: 20, maxModelSteps: 5 },
186
+ options: {
187
+ silent: true,
188
+ preventClose: true,
189
+ sendFinish: false,
190
+ maxIterations: 20,
191
+ maxModelSteps: 5,
192
+ },
236
193
  __initialContent: {
237
194
  datasetId,
238
195
  fileId,
@@ -244,7 +201,6 @@ export function createFileParseContext(fileId, opts) {
244
201
  await awaitContextRun(shell.run);
245
202
  return { datasetId };
246
203
  },
247
- // Optional: expose the built context for advanced callers (not required for parse DX)
248
204
  context,
249
205
  };
250
206
  }
@@ -0,0 +1,21 @@
1
+ import type { FileParseContext, SandboxState } from "./file-dataset.types.js";
2
+ import type { FilePreviewContext } from "./filepreview.types.js";
3
+ export declare function initializeFileParseSandboxStep(params: {
4
+ runtime: any;
5
+ sandboxId: string;
6
+ datasetId: string;
7
+ fileId: string;
8
+ state: SandboxState;
9
+ }): Promise<{
10
+ filePath: string;
11
+ state: SandboxState;
12
+ }>;
13
+ export declare function generateFileParsePreviewStep(params: {
14
+ runtime: any;
15
+ sandboxId: string;
16
+ sandboxFilePath: string;
17
+ datasetId: string;
18
+ }): Promise<FilePreviewContext>;
19
+ export declare function buildFileDatasetPromptStep(params: {
20
+ context: FileParseContext;
21
+ }): Promise<string>;
@@ -0,0 +1,62 @@
1
+ import { getDatasetWorkstation } from "../datasetFiles.js";
2
+ import { runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep } from "../sandbox/steps.js";
3
+ import { buildFileDatasetPrompt } from "./prompts.js";
4
+ import { generateFilePreview, ensurePreviewScriptsAvailable } from "./filepreview.js";
5
+ import { readInstantFileStep } from "./steps.js";
6
/** Maximum number of stderr characters echoed into thrown error messages. */
const SANDBOX_ERROR_PREVIEW_CHARS = 300;

/**
 * Reports whether a sandbox command result carries a non-zero numeric exit code.
 * Results without a numeric `exitCode` are not treated as failures.
 */
function didSandboxCommandFail(result) {
    return typeof result?.exitCode === "number" && result.exitCode !== 0;
}

/**
 * Extracts a path-safe extension (a dot followed by ASCII letters/digits) from
 * a file name, dropping anything that trails it such as a closing quote.
 * Returns "" when the name has no usable extension.
 */
function extractSafeFileExtension(fileName) {
    const lastDot = fileName.lastIndexOf(".");
    if (lastDot === -1) {
        return "";
    }
    const match = /^\.[A-Za-z0-9]+/.exec(fileName.substring(lastDot));
    return match ? match[0] : "";
}

/**
 * Prepares the sandbox for parsing one uploaded file.
 *
 * When `params.state.initialized` is already set, the stored file path is
 * returned unchanged. Otherwise the step:
 *   1. makes the preview scripts available,
 *   2. installs `pandas` and `openpyxl` with pip,
 *   3. reads the source file via `readInstantFileStep`,
 *   4. creates the dataset workstation directory,
 *   5. writes the file into the workstation as `<fileId><extension>`.
 *
 * @param params.runtime Runtime passed through to every sandbox/file step.
 * @param params.sandboxId Sandbox in which commands run and files are written.
 * @param params.datasetId Dataset id; selects the workstation directory and tags logs.
 * @param params.fileId Id of the file to load; also used as the sandbox file's base name.
 * @param params.state Previous sandbox state (`initialized`, `filePath`).
 * @returns The sandbox file path together with the (possibly new) sandbox state.
 * @throws Error when pip reports a failure, when `mkdir` exits non-zero, or
 *   when any underlying step throws.
 */
export async function initializeFileParseSandboxStep(params) {
    "use step";
    if (params.state?.initialized) {
        return { filePath: params.state.filePath, state: params.state };
    }
    console.log(`[FileParseContext ${params.datasetId}] Initializing sandbox...`);
    await ensurePreviewScriptsAvailable(params.runtime, params.sandboxId);
    console.log(`[FileParseContext ${params.datasetId}] Installing Python dependencies...`);
    const pipInstall = await runDatasetSandboxCommandStep({
        runtime: params.runtime,
        sandboxId: params.sandboxId,
        cmd: "python",
        args: ["-m", "pip", "install", "pandas", "openpyxl", "--quiet", "--upgrade"],
    });
    const installStderr = typeof pipInstall.stderr === "string" ? pipInstall.stderr : "";
    const pipReportedError = installStderr.includes("ERROR") || installStderr.includes("FAILED");
    // A non-zero exit is a failure even when stderr lacks the usual markers
    // (for example when the pip module itself is missing).
    if (pipReportedError || didSandboxCommandFail(pipInstall)) {
        const detail = installStderr
            ? installStderr.substring(0, SANDBOX_ERROR_PREVIEW_CHARS)
            : `exit code ${pipInstall.exitCode}`;
        throw new Error(`pip install failed: ${detail}`);
    }
    console.log(`[FileParseContext ${params.datasetId}] Fetching file from InstantDB...`);
    const file = await readInstantFileStep({ runtime: params.runtime, fileId: params.fileId });
    console.log(`[FileParseContext ${params.datasetId}] Creating dataset workstation...`);
    const workstation = getDatasetWorkstation(params.datasetId);
    const mkdirResult = await runDatasetSandboxCommandStep({
        runtime: params.runtime,
        sandboxId: params.sandboxId,
        cmd: "mkdir",
        args: ["-p", workstation],
    });
    // Fail here with a clear message instead of letting the file write fail later.
    if (didSandboxCommandFail(mkdirResult)) {
        const detail = String(mkdirResult.stderr ?? "").substring(0, SANDBOX_ERROR_PREVIEW_CHARS);
        throw new Error(`Failed to create dataset workstation ${workstation}: ${detail}`);
    }
    // NOTE(review): `contentDisposition` may be a bare file name or a full
    // header value (e.g. `attachment; filename="x.csv"`) - confirm against
    // readInstantFileStep. Either way only the alphanumeric extension is kept
    // so quotes or separators never end up in the sandbox path.
    const fileName = String(file.contentDisposition ?? "");
    const fileExtension = extractSafeFileExtension(fileName);
    const sandboxFilePath = `${workstation}/${params.fileId}${fileExtension}`;
    await writeDatasetSandboxFilesStep({
        runtime: params.runtime,
        sandboxId: params.sandboxId,
        files: [
            {
                path: sandboxFilePath,
                contentBase64: file.contentBase64,
            },
        ],
    });
    console.log(`[FileParseContext ${params.datasetId}] Workstation created: ${workstation}`);
    console.log(`[FileParseContext ${params.datasetId}] File saved: ${sandboxFilePath}`);
    // A fresh state object is returned rather than mutating `params.state`.
    const state = {
        initialized: true,
        filePath: sandboxFilePath,
    };
    return { filePath: sandboxFilePath, state };
}
55
/**
 * Step wrapper around `generateFilePreview`.
 *
 * @param params.runtime Runtime forwarded to the preview generator.
 * @param params.sandboxId Sandbox forwarded to the preview generator.
 * @param params.sandboxFilePath Path of the file to preview inside the sandbox.
 * @param params.datasetId Dataset id forwarded to the preview generator.
 * @returns Whatever `generateFilePreview` resolves to; rejections propagate.
 */
export async function generateFileParsePreviewStep(params) {
    "use step";
    const { runtime, sandboxId, sandboxFilePath, datasetId } = params;
    const preview = await generateFilePreview(runtime, sandboxId, sandboxFilePath, datasetId);
    return preview;
}
59
/**
 * Step wrapper around `buildFileDatasetPrompt`.
 *
 * @param params.context File-parse context handed to the prompt builder.
 * @returns A promise for the value returned by `buildFileDatasetPrompt`.
 */
export async function buildFileDatasetPromptStep(params) {
    "use step";
    const { context } = params;
    return buildFileDatasetPrompt(context);
}
@@ -0,0 +1,50 @@
1
+ import type { ContextReactor } from "@ekairos/events";
2
+ import type { FilePreviewContext } from "./filepreview.types.js";
3
+ export type SandboxState = {
4
+ initialized: boolean;
5
+ filePath: string;
6
+ };
7
+ export type FileParseContext = {
8
+ datasetId: string;
9
+ fileId: string;
10
+ instructions: string;
11
+ sandboxConfig: {
12
+ filePath: string;
13
+ };
14
+ analysis: any[];
15
+ schema: any | null;
16
+ plan: any | null;
17
+ executionResult: any | null;
18
+ errors: string[];
19
+ iterationCount: number;
20
+ filePreview?: FilePreviewContext;
21
+ };
22
+ export type FileParseContextParams = {
23
+ fileId?: string;
24
+ instructions?: string;
25
+ sandboxId?: string;
26
+ datasetId?: string;
27
+ model?: string;
28
+ reactor?: ContextReactor<any, any>;
29
+ };
30
+ export type FileParseRunOptions = {
31
+ prompt?: string;
32
+ durable?: boolean;
33
+ };
34
+ export type FileParseContextBuilder<Env extends {
35
+ orgId: string;
36
+ }> = {
37
+ datasetId: string;
38
+ context: any;
39
+ };
40
+ export type DatasetResult = {
41
+ id: string;
42
+ status?: string;
43
+ title?: string;
44
+ schema?: any;
45
+ analysis?: any;
46
+ calculatedTotalRows?: number;
47
+ actualGeneratedRowCount?: number;
48
+ createdAt?: number;
49
+ updatedAt?: number;
50
+ };
@@ -0,0 +1 @@
1
+ export {};
@@ -1,34 +1,5 @@
1
- export type FilePreviewContext = {
2
- totalRows: number;
3
- metadata?: {
4
- description: string;
5
- script: string;
6
- command: string;
7
- stdout: string;
8
- stderr: string;
9
- };
10
- head?: {
11
- description: string;
12
- script: string;
13
- command: string;
14
- stdout: string;
15
- stderr: string;
16
- };
17
- tail?: {
18
- description: string;
19
- script: string;
20
- command: string;
21
- stdout: string;
22
- stderr: string;
23
- };
24
- mid?: {
25
- description: string;
26
- script: string;
27
- command: string;
28
- stdout: string;
29
- stderr: string;
30
- };
31
- };
1
+ import type { FilePreviewContext } from "./filepreview.types.js";
2
+ export type { FilePreviewContext } from "./filepreview.types.js";
32
3
  interface PreviewOptions {
33
4
  headLines?: number;
34
5
  tailLines?: number;
@@ -36,4 +7,3 @@ interface PreviewOptions {
36
7
  }
37
8
  export declare function ensurePreviewScriptsAvailable(runtime: any, sandboxId: string): Promise<void>;
38
9
  export declare function generateFilePreview(runtime: any, sandboxId: string, sandboxFilePath: string, datasetId: string, options?: PreviewOptions): Promise<FilePreviewContext>;
39
- export {};
@@ -0,0 +1,31 @@
1
/**
 * Preview information for a file.
 *
 * `totalRows` is always present. Each of `metadata`, `head`, `tail` and `mid`
 * is optional and, when present, carries the same five string fields:
 * `description`, `script`, `command`, `stdout` and `stderr`.
 */
export type FilePreviewContext = {
    totalRows: number;
} & Partial<Record<"metadata" | "head" | "tail" | "mid", {
    description: string;
    script: string;
    command: string;
    stdout: string;
    stderr: string;
}>>;
@@ -0,0 +1 @@
1
+ export {};
@@ -1,2 +1,2 @@
1
- import { FileParseContext } from "./file-dataset.agent.js";
1
+ import type { FileParseContext } from "./file-dataset.types.js";
2
2
  export declare function buildFileDatasetPrompt(context: FileParseContext): string;
package/dist/index.d.ts CHANGED
@@ -3,3 +3,5 @@ export * from "./domain.js";
3
3
  export * from "./materializeDataset.tool.js";
4
4
  export * from "./schema.js";
5
5
  export * from "./service.js";
6
+ export { registerFileParseContext } from "./file/file-dataset.agent.js";
7
+ export { registerTransformDatasetContext } from "./transform/index.js";
package/dist/index.js CHANGED
@@ -3,3 +3,5 @@ export * from "./domain.js";
3
3
  export * from "./materializeDataset.tool.js";
4
4
  export * from "./schema.js";
5
5
  export * from "./service.js";
6
+ export { registerFileParseContext } from "./file/file-dataset.agent.js";
7
+ export { registerTransformDatasetContext } from "./transform/index.js";
@@ -27,6 +27,14 @@ export declare function writeDatasetSandboxFilesStep(params: {
27
27
  contentBase64: string;
28
28
  }>;
29
29
  }): Promise<void>;
30
+ export declare function writeDatasetSandboxTextFilesStep(params: {
31
+ runtime: any;
32
+ sandboxId: DatasetSandboxId;
33
+ files: Array<{
34
+ path: string;
35
+ content: string;
36
+ }>;
37
+ }): Promise<void>;
30
38
  export declare function readDatasetSandboxFileStep(params: {
31
39
  runtime: any;
32
40
  sandboxId: DatasetSandboxId;
@@ -34,6 +42,13 @@ export declare function readDatasetSandboxFileStep(params: {
34
42
  }): Promise<{
35
43
  contentBase64: string;
36
44
  }>;
45
+ export declare function readDatasetSandboxTextFileStep(params: {
46
+ runtime: any;
47
+ sandboxId: DatasetSandboxId;
48
+ path: string;
49
+ }): Promise<{
50
+ content: string;
51
+ }>;
37
52
  export declare function stopDatasetSandboxStep(params: {
38
53
  runtime: any;
39
54
  sandboxId: DatasetSandboxId;
@@ -117,6 +117,25 @@ export async function writeDatasetSandboxFilesStep(params) {
117
117
  if (!result.ok)
118
118
  throw new Error(result.error);
119
119
  }
120
+ export async function writeDatasetSandboxTextFilesStep(params) {
121
+ "use step";
122
+ if (isLocalDatasetSandboxMode()) {
123
+ for (const file of params.files) {
124
+ await fs.mkdir(path.dirname(file.path), { recursive: true });
125
+ await fs.writeFile(file.path, file.content, "utf-8");
126
+ }
127
+ return;
128
+ }
129
+ const db = await getRuntimeDb(params.runtime);
130
+ const service = new SandboxService(db);
131
+ const files = params.files.map((file) => ({
132
+ path: file.path,
133
+ contentBase64: Buffer.from(file.content, "utf-8").toString("base64"),
134
+ }));
135
+ const result = await service.writeFiles(params.sandboxId, files);
136
+ if (!result.ok)
137
+ throw new Error(result.error);
138
+ }
120
139
  export async function readDatasetSandboxFileStep(params) {
121
140
  "use step";
122
141
  if (isLocalDatasetSandboxMode()) {
@@ -130,6 +149,19 @@ export async function readDatasetSandboxFileStep(params) {
130
149
  throw new Error(result.error);
131
150
  return result.data;
132
151
  }
152
+ export async function readDatasetSandboxTextFileStep(params) {
153
+ "use step";
154
+ if (isLocalDatasetSandboxMode()) {
155
+ const content = await fs.readFile(params.path, "utf-8");
156
+ return { content };
157
+ }
158
+ const db = await getRuntimeDb(params.runtime);
159
+ const service = new SandboxService(db);
160
+ const result = await service.readFile(params.sandboxId, params.path);
161
+ if (!result.ok)
162
+ throw new Error(result.error);
163
+ return { content: Buffer.from(result.data.contentBase64, "base64").toString("utf-8") };
164
+ }
133
165
  export async function stopDatasetSandboxStep(params) {
134
166
  "use step";
135
167
  if (isLocalDatasetSandboxMode()) {
@@ -1,2 +1,2 @@
1
- export { createTransformDatasetContext, type TransformDatasetAgentParams, type TransformDatasetContext, } from "./transform-dataset.agent.js";
1
+ export { createTransformDatasetContext, registerTransformDatasetContext, type TransformDatasetAgentParams, type TransformDatasetContext, type TransformDatasetRunOptions, } from "./transform-dataset.agent.js";
2
2
  export { transformDataset, type TransformDatasetInput, type TransformDatasetResult, } from "./transformDataset.js";
@@ -1,2 +1,2 @@
1
- export { createTransformDatasetContext, } from "./transform-dataset.agent.js";
1
+ export { createTransformDatasetContext, registerTransformDatasetContext, } from "./transform-dataset.agent.js";
2
2
  export { transformDataset, } from "./transformDataset.js";
@@ -1,34 +1,2 @@
1
- export type TransformPromptContext = {
2
- datasetId: string;
3
- sourceDatasetIds: string[];
4
- outputSchema: any;
5
- sandboxConfig: {
6
- sourcePaths: Array<{
7
- datasetId: string;
8
- path: string;
9
- }>;
10
- outputPath: string;
11
- };
12
- sourcePreviews?: Array<{
13
- datasetId: string;
14
- preview: {
15
- totalRows: number;
16
- metadata?: {
17
- description: string;
18
- script: string;
19
- command: string;
20
- stdout: string;
21
- stderr: string;
22
- };
23
- head?: {
24
- description: string;
25
- script: string;
26
- command: string;
27
- stdout: string;
28
- stderr: string;
29
- };
30
- };
31
- }>;
32
- errors: string[];
33
- };
1
+ import type { TransformPromptContext } from "./transform-dataset.types.js";
34
2
  export declare function buildTransformDatasetPrompt(context: TransformPromptContext): string;
@@ -1,48 +1,6 @@
1
1
  import { type ContextReactor } from "@ekairos/events";
2
- import { TransformSourcePreviewContext } from "./filepreview.js";
3
- export type TransformDatasetContext = {
4
- datasetId: string;
5
- sourceDatasetIds: string[];
6
- outputSchema: any;
7
- sandboxConfig: {
8
- sourcePaths: Array<{
9
- datasetId: string;
10
- path: string;
11
- }>;
12
- outputPath: string;
13
- };
14
- sourcePreviews?: Array<{
15
- datasetId: string;
16
- preview: TransformSourcePreviewContext;
17
- }>;
18
- errors: string[];
19
- iterationCount: number;
20
- instructions?: string;
21
- };
22
- export type TransformDatasetAgentParams = {
23
- sourceDatasetIds?: string[];
24
- outputSchema?: any;
25
- instructions?: string;
26
- datasetId?: string;
27
- model?: string;
28
- sandboxId?: string;
29
- reactor?: ContextReactor<any, any>;
30
- };
31
- export type TransformDatasetRunOptions = {
32
- prompt?: string;
33
- durable?: boolean;
34
- };
35
- export type TransformDatasetResult = {
36
- id: string;
37
- status?: string;
38
- title?: string;
39
- schema?: any;
40
- analysis?: any;
41
- calculatedTotalRows?: number;
42
- actualGeneratedRowCount?: number;
43
- createdAt?: number;
44
- updatedAt?: number;
45
- };
2
+ import type { TransformDatasetRunOptions } from "./transform-dataset.types.js";
3
+ export type { TransformDatasetAgentParams, TransformDatasetContext, TransformDatasetResult, TransformDatasetRunOptions, TransformPromptContext, TransformSandboxState, } from "./transform-dataset.types.js";
46
4
  export declare function createTransformDatasetContext<Env extends {
47
5
  orgId: string;
48
6
  }>(params: {
@@ -1,13 +1,10 @@
1
- import { createContext, INPUT_TEXT_ITEM_TYPE, WEB_CHANNEL } from "@ekairos/events";
2
- import { createCompleteDatasetTool, didCompleteDatasetSucceed } from "../completeDataset.tool.js";
3
- import { createExecuteCommandTool } from "../executeCommand.tool.js";
4
- import { createClearDatasetTool } from "../clearDataset.tool.js";
5
- import { buildTransformDatasetPrompt } from "./prompts.js";
6
- import { getDatasetWorkstation, getDatasetOutputPath } from "../datasetFiles.js";
1
+ import { createContext, INPUT_TEXT_ITEM_TYPE, WEB_CHANNEL, } from "@ekairos/events";
7
2
  import { id } from "@instantdb/admin";
8
- import { generateSourcePreview } from "./filepreview.js";
9
- import { datasetReadOutputJsonlStep, datasetUpdateSchemaStep } from "../dataset/steps.js";
10
- import { runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep } from "../sandbox/steps.js";
3
+ import { createClearDatasetTool } from "../clearDataset.tool.js";
4
+ import { createCompleteDatasetTool, didCompleteDatasetSucceed, getDatasetFatalFailure, } from "../completeDataset.tool.js";
5
+ import { datasetUpdateSchemaStep } from "../dataset/steps.js";
6
+ import { createExecuteCommandTool } from "../executeCommand.tool.js";
7
+ import { buildTransformDatasetPromptStep, ensureTransformSourcesInSandboxStep, generateTransformSourcePreviewsStep, } from "./transform-dataset.steps.js";
11
8
  async function awaitContextRun(run) {
12
9
  if (!run)
13
10
  return;
@@ -17,28 +14,6 @@ async function awaitContextRun(run) {
17
14
  }
18
15
  await run;
19
16
  }
20
- async function ensureSourcesInSandbox(runtime, sandboxId, datasetId, sourceDatasetIds, state) {
21
- "use step";
22
- if (state.initialized) {
23
- return { sourcePaths: state.sourcePaths, outputPath: getDatasetOutputPath(datasetId) };
24
- }
25
- const workstation = getDatasetWorkstation(datasetId);
26
- await runDatasetSandboxCommandStep({ runtime, sandboxId, cmd: "mkdir", args: ["-p", workstation] });
27
- const sourcePaths = [];
28
- for (const sourceDatasetId of sourceDatasetIds) {
29
- const sourcePath = `${workstation}/source_${sourceDatasetId}.jsonl`;
30
- const source = await datasetReadOutputJsonlStep({ runtime, datasetId: sourceDatasetId });
31
- await writeDatasetSandboxFilesStep({
32
- runtime,
33
- sandboxId,
34
- files: [{ path: sourcePath, contentBase64: source.contentBase64 }],
35
- });
36
- sourcePaths.push({ datasetId: sourceDatasetId, path: sourcePath });
37
- }
38
- state.sourcePaths = sourcePaths;
39
- state.initialized = true;
40
- return { sourcePaths, outputPath: getDatasetOutputPath(datasetId) };
41
- }
42
17
  function createTransformDatasetContextDefinition(params) {
43
18
  const fallbackDatasetId = params.datasetId;
44
19
  const model = params.model ?? "openai/gpt-5";
@@ -67,18 +42,19 @@ function createTransformDatasetContextDefinition(params) {
67
42
  if (!sandboxId) {
68
43
  throw new Error("dataset_sandbox_required");
69
44
  }
70
- const { sourcePaths, outputPath } = await ensureSourcesInSandbox(runtime, sandboxId, datasetId, sourceDatasetIds, sandboxState);
71
- const sourcePreviews = [];
72
- for (const sp of sourcePaths) {
73
- try {
74
- const preview = await generateSourcePreview(runtime, sandboxId, sp.path, datasetId);
75
- sourcePreviews.push({ datasetId: sp.datasetId, preview });
76
- }
77
- catch {
78
- // optional
79
- }
80
- }
81
- // Persist output schema on the dataset record (so completeDataset validates against it)
45
+ const initialized = await ensureTransformSourcesInSandboxStep({
46
+ runtime,
47
+ sandboxId,
48
+ datasetId,
49
+ sourceDatasetIds,
50
+ state: sandboxState,
51
+ });
52
+ const sourcePreviews = await generateTransformSourcePreviewsStep({
53
+ runtime,
54
+ sandboxId,
55
+ datasetId,
56
+ sourcePaths: initialized.sourcePaths,
57
+ });
82
58
  await datasetUpdateSchemaStep({
83
59
  runtime,
84
60
  datasetId,
@@ -89,11 +65,16 @@ function createTransformDatasetContextDefinition(params) {
89
65
  datasetId,
90
66
  sourceDatasetIds,
91
67
  outputSchema,
92
- sandboxConfig: { sourcePaths, outputPath },
68
+ sandboxConfig: {
69
+ sourcePaths: initialized.sourcePaths,
70
+ outputPath: initialized.outputPath,
71
+ },
93
72
  sourcePreviews: sourcePreviews.length > 0 ? sourcePreviews : undefined,
94
73
  errors: [],
95
74
  };
96
- const basePrompt = buildTransformDatasetPrompt(promptContext);
75
+ const basePrompt = await buildTransformDatasetPromptStep({
76
+ context: promptContext,
77
+ });
97
78
  const userInstructions = String(instructions ?? "").trim();
98
79
  const system = userInstructions
99
80
  ? [
@@ -112,9 +93,12 @@ function createTransformDatasetContextDefinition(params) {
112
93
  outputSchema,
113
94
  instructions,
114
95
  sandboxId,
115
- sandboxState,
96
+ sandboxState: initialized.state,
116
97
  system,
117
- sandboxConfig: { sourcePaths, outputPath },
98
+ sandboxConfig: {
99
+ sourcePaths: initialized.sourcePaths,
100
+ outputPath: initialized.outputPath,
101
+ },
118
102
  };
119
103
  })
120
104
  .narrative(async (stored) => {
@@ -146,6 +130,10 @@ function createTransformDatasetContextDefinition(params) {
146
130
  };
147
131
  })
148
132
  .shouldContinue(({ reactionEvent }) => {
133
+ const fatalFailure = getDatasetFatalFailure(reactionEvent);
134
+ if (fatalFailure) {
135
+ throw new Error(fatalFailure);
136
+ }
149
137
  return !didCompleteDatasetSucceed(reactionEvent);
150
138
  });
151
139
  if (params.reactor) {
@@ -193,7 +181,13 @@ export function createTransformDatasetContext(params) {
193
181
  runtime: runtime,
194
182
  context: { key: `dataset:${datasetId}` },
195
183
  durable: options.durable ?? false,
196
- options: { silent: true, preventClose: true, sendFinish: false, maxIterations: 20, maxModelSteps: 5 },
184
+ options: {
185
+ silent: true,
186
+ preventClose: true,
187
+ sendFinish: false,
188
+ maxIterations: 20,
189
+ maxModelSteps: 5,
190
+ },
197
191
  __initialContent: {
198
192
  datasetId,
199
193
  sourceDatasetIds: params.sourceDatasetIds,
@@ -0,0 +1,30 @@
1
+ import type { TransformPromptContext, TransformSandboxState, TransformSourcePreviewContext } from "./transform-dataset.types.js";
2
+ export declare function ensureTransformSourcesInSandboxStep(params: {
3
+ runtime: any;
4
+ sandboxId: string;
5
+ datasetId: string;
6
+ sourceDatasetIds: string[];
7
+ state: TransformSandboxState;
8
+ }): Promise<{
9
+ sourcePaths: Array<{
10
+ datasetId: string;
11
+ path: string;
12
+ }>;
13
+ outputPath: string;
14
+ state: TransformSandboxState;
15
+ }>;
16
+ export declare function generateTransformSourcePreviewsStep(params: {
17
+ runtime: any;
18
+ sandboxId: string;
19
+ datasetId: string;
20
+ sourcePaths: Array<{
21
+ datasetId: string;
22
+ path: string;
23
+ }>;
24
+ }): Promise<Array<{
25
+ datasetId: string;
26
+ preview: TransformSourcePreviewContext;
27
+ }>>;
28
+ export declare function buildTransformDatasetPromptStep(params: {
29
+ context: TransformPromptContext;
30
+ }): Promise<string>;
@@ -0,0 +1,62 @@
1
+ import { getDatasetOutputPath, getDatasetWorkstation } from "../datasetFiles.js";
2
+ import { datasetReadOutputJsonlStep } from "../dataset/steps.js";
3
+ import { runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep } from "../sandbox/steps.js";
4
+ import { generateSourcePreview } from "./filepreview.js";
5
+ import { buildTransformDatasetPrompt } from "./prompts.js";
6
+ export async function ensureTransformSourcesInSandboxStep(params) {
7
+ "use step";
8
+ if (params.state.initialized) {
9
+ return {
10
+ sourcePaths: params.state.sourcePaths,
11
+ outputPath: getDatasetOutputPath(params.datasetId),
12
+ state: params.state,
13
+ };
14
+ }
15
+ const workstation = getDatasetWorkstation(params.datasetId);
16
+ await runDatasetSandboxCommandStep({
17
+ runtime: params.runtime,
18
+ sandboxId: params.sandboxId,
19
+ cmd: "mkdir",
20
+ args: ["-p", workstation],
21
+ });
22
+ const sourcePaths = [];
23
+ for (const sourceDatasetId of params.sourceDatasetIds) {
24
+ const sourcePath = `${workstation}/source_${sourceDatasetId}.jsonl`;
25
+ const source = await datasetReadOutputJsonlStep({
26
+ runtime: params.runtime,
27
+ datasetId: sourceDatasetId,
28
+ });
29
+ await writeDatasetSandboxFilesStep({
30
+ runtime: params.runtime,
31
+ sandboxId: params.sandboxId,
32
+ files: [{ path: sourcePath, contentBase64: source.contentBase64 }],
33
+ });
34
+ sourcePaths.push({ datasetId: sourceDatasetId, path: sourcePath });
35
+ }
36
+ return {
37
+ sourcePaths,
38
+ outputPath: getDatasetOutputPath(params.datasetId),
39
+ state: {
40
+ initialized: true,
41
+ sourcePaths,
42
+ },
43
+ };
44
+ }
45
+ export async function generateTransformSourcePreviewsStep(params) {
46
+ "use step";
47
+ const sourcePreviews = [];
48
+ for (const sourcePath of params.sourcePaths) {
49
+ try {
50
+ const preview = await generateSourcePreview(params.runtime, params.sandboxId, sourcePath.path, params.datasetId);
51
+ sourcePreviews.push({ datasetId: sourcePath.datasetId, preview });
52
+ }
53
+ catch {
54
+ // Source preview is optional; transformation can still read the JSONL files.
55
+ }
56
+ }
57
+ return sourcePreviews;
58
+ }
59
+ export async function buildTransformDatasetPromptStep(params) {
60
+ "use step";
61
+ return buildTransformDatasetPrompt(params.context);
62
+ }
@@ -0,0 +1,86 @@
1
+ import type { ContextReactor } from "@ekairos/events";
2
+ import type { TransformSourcePreviewContext } from "./filepreview.js";
3
+ export type { TransformSourcePreviewContext } from "./filepreview.js";
4
+ export type TransformSandboxState = {
5
+ initialized: boolean;
6
+ sourcePaths: Array<{
7
+ datasetId: string;
8
+ path: string;
9
+ }>;
10
+ };
11
+ export type TransformDatasetContext = {
12
+ datasetId: string;
13
+ sourceDatasetIds: string[];
14
+ outputSchema: any;
15
+ sandboxConfig: {
16
+ sourcePaths: Array<{
17
+ datasetId: string;
18
+ path: string;
19
+ }>;
20
+ outputPath: string;
21
+ };
22
+ sourcePreviews?: Array<{
23
+ datasetId: string;
24
+ preview: TransformSourcePreviewContext;
25
+ }>;
26
+ errors: string[];
27
+ iterationCount: number;
28
+ instructions?: string;
29
+ };
30
+ export type TransformDatasetAgentParams = {
31
+ sourceDatasetIds?: string[];
32
+ outputSchema?: any;
33
+ instructions?: string;
34
+ datasetId?: string;
35
+ model?: string;
36
+ sandboxId?: string;
37
+ reactor?: ContextReactor<any, any>;
38
+ };
39
+ export type TransformDatasetRunOptions = {
40
+ prompt?: string;
41
+ durable?: boolean;
42
+ };
43
+ export type TransformDatasetResult = {
44
+ id: string;
45
+ status?: string;
46
+ title?: string;
47
+ schema?: any;
48
+ analysis?: any;
49
+ calculatedTotalRows?: number;
50
+ actualGeneratedRowCount?: number;
51
+ createdAt?: number;
52
+ updatedAt?: number;
53
+ };
54
+ export type TransformPromptContext = {
55
+ datasetId: string;
56
+ sourceDatasetIds: string[];
57
+ outputSchema: any;
58
+ sandboxConfig: {
59
+ sourcePaths: Array<{
60
+ datasetId: string;
61
+ path: string;
62
+ }>;
63
+ outputPath: string;
64
+ };
65
+ sourcePreviews?: Array<{
66
+ datasetId: string;
67
+ preview: {
68
+ totalRows: number;
69
+ metadata?: {
70
+ description: string;
71
+ script: string;
72
+ command: string;
73
+ stdout: string;
74
+ stderr: string;
75
+ };
76
+ head?: {
77
+ description: string;
78
+ script: string;
79
+ command: string;
80
+ stdout: string;
81
+ stderr: string;
82
+ };
83
+ };
84
+ }>;
85
+ errors: string[];
86
+ };
@@ -0,0 +1 @@
1
+ export {};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ekairos/dataset",
3
- "version": "1.22.55-beta.development.0",
3
+ "version": "1.22.57-beta.development.0",
4
4
  "description": "Pulzar Dataset Tools",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -65,9 +65,9 @@
65
65
  "test:ai-sdk:instant": "vitest run -c vitest.codex.config.mts src/tests/materializeDataset.ai-sdk.instant.test.ts"
66
66
  },
67
67
  "dependencies": {
68
- "@ekairos/domain": "^1.22.55-beta.development.0",
69
- "@ekairos/events": "^1.22.55-beta.development.0",
70
- "@ekairos/sandbox": "^1.22.55-beta.development.0",
68
+ "@ekairos/domain": "^1.22.57-beta.development.0",
69
+ "@ekairos/events": "^1.22.57-beta.development.0",
70
+ "@ekairos/sandbox": "^1.22.57-beta.development.0",
71
71
  "@instantdb/admin": "0.22.158",
72
72
  "@instantdb/core": "0.22.142",
73
73
  "ai": "^5.0.44",