@ekairos/dataset 1.22.54-beta.development.0 → 1.22.56-beta.development.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,59 +1,6 @@
1
- import { createContext, type ContextReactor } from "@ekairos/events";
2
- import { FilePreviewContext } from "./filepreview.js";
3
- export type FileParseContext = {
4
- datasetId: string;
5
- fileId: string;
6
- instructions: string;
7
- sandboxConfig: {
8
- filePath: string;
9
- };
10
- analysis: any[];
11
- schema: any | null;
12
- plan: any | null;
13
- executionResult: any | null;
14
- errors: string[];
15
- iterationCount: number;
16
- filePreview?: FilePreviewContext;
17
- };
18
- export type FileParseContextParams = {
19
- fileId: string;
20
- instructions?: string;
21
- sandboxId?: string;
22
- datasetId?: string;
23
- model?: string;
24
- reactor?: ContextReactor<any, any>;
25
- };
26
- export type FileParseRunOptions = {
27
- prompt?: string;
28
- durable?: boolean;
29
- };
30
- export type FileParseContextBuilder<Env extends {
31
- orgId: string;
32
- }> = {
33
- datasetId: string;
34
- context: ReturnType<ReturnType<typeof createContext<Env>>["context"]> extends any ? any : any;
35
- };
36
- export type DatasetResult = {
37
- id: string;
38
- status?: string;
39
- title?: string;
40
- schema?: any;
41
- analysis?: any;
42
- calculatedTotalRows?: number;
43
- actualGeneratedRowCount?: number;
44
- createdAt?: number;
45
- updatedAt?: number;
46
- };
47
- /**
48
- * Factory (DX-first):
49
- *
50
- * Usage:
51
- * const { datasetId } = await createFileParseContext(fileId, { instructions }).parse(runtime)
52
- *
53
- * - Uses the caller runtime; no secondary runtime is created.
54
- * - All I/O happens in `"use step"` functions via the provided Ekairos runtime.
55
- * - `parse()` is the entrypoint; it calls `context.react(...)` internally.
56
- */
1
+ import { type ContextReactor } from "@ekairos/events";
2
+ import type { FileParseRunOptions } from "./file-dataset.types.js";
3
+ export type { DatasetResult, FileParseContext, FileParseContextBuilder, FileParseContextParams, FileParseRunOptions, SandboxState, } from "./file-dataset.types.js";
57
4
  export declare function createFileParseContext<Env extends {
58
5
  orgId: string;
59
6
  }>(fileId: string, opts?: {
@@ -71,3 +18,9 @@ export declare function createFileParseContext<Env extends {
71
18
  }>;
72
19
  context: any;
73
20
  };
21
+ export declare function registerFileParseContext<Env extends {
22
+ orgId: string;
23
+ }>(opts?: {
24
+ model?: string;
25
+ reactor?: ContextReactor<any, any>;
26
+ }): void;
@@ -1,15 +1,11 @@
1
- import { createContext, INPUT_TEXT_ITEM_TYPE, WEB_CHANNEL } from "@ekairos/events";
2
- import { runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep } from "../sandbox/steps.js";
3
- import { createGenerateSchemaTool } from "./generateSchema.tool.js";
4
- import { createCompleteDatasetTool, didCompleteDatasetSucceed } from "../completeDataset.tool.js";
5
- import { createExecuteCommandTool } from "../executeCommand.tool.js";
6
- import { createClearDatasetTool } from "../clearDataset.tool.js";
7
- import { buildFileDatasetPrompt } from "./prompts.js";
8
- import { generateFilePreview, ensurePreviewScriptsAvailable } from "./filepreview.js";
1
+ import { createContext, INPUT_TEXT_ITEM_TYPE, WEB_CHANNEL, } from "@ekairos/events";
9
2
  import { id } from "@instantdb/admin";
10
- import { getDatasetWorkstation } from "../datasetFiles.js";
11
- import { readInstantFileStep } from "./steps.js";
3
+ import { createClearDatasetTool } from "../clearDataset.tool.js";
4
+ import { createCompleteDatasetTool, didCompleteDatasetSucceed, } from "../completeDataset.tool.js";
12
5
  import { datasetGetByIdStep } from "../dataset/steps.js";
6
+ import { createExecuteCommandTool } from "../executeCommand.tool.js";
7
+ import { createGenerateSchemaTool } from "./generateSchema.tool.js";
8
+ import { buildFileDatasetPromptStep, generateFileParsePreviewStep, initializeFileParseSandboxStep, } from "./file-dataset.steps.js";
13
9
  async function awaitContextRun(run) {
14
10
  if (!run)
15
11
  return;
@@ -19,89 +15,55 @@ async function awaitContextRun(run) {
19
15
  }
20
16
  await run;
21
17
  }
22
- async function initializeSandbox(runtime, sandboxId, datasetId, fileId, state) {
23
- if (state.initialized) {
24
- return state.filePath;
25
- }
26
- console.log(`[FileParseContext ${datasetId}] Initializing sandbox...`);
27
- await ensurePreviewScriptsAvailable(runtime, sandboxId);
28
- console.log(`[FileParseContext ${datasetId}] Installing Python dependencies...`);
29
- const pipInstall = await runDatasetSandboxCommandStep({
30
- runtime,
31
- sandboxId,
32
- cmd: "python",
33
- args: ["-m", "pip", "install", "pandas", "openpyxl", "--quiet", "--upgrade"],
34
- });
35
- const installStderr = pipInstall.stderr;
36
- if (installStderr && (installStderr.includes("ERROR") || installStderr.includes("FAILED"))) {
37
- throw new Error(`pip install failed: ${installStderr.substring(0, 300)}`);
38
- }
39
- console.log(`[FileParseContext ${datasetId}] Fetching file from InstantDB...`);
40
- const file = await readInstantFileStep({ runtime, fileId });
41
- console.log(`[FileParseContext ${datasetId}] Creating dataset workstation...`);
42
- const workstation = getDatasetWorkstation(datasetId);
43
- await runDatasetSandboxCommandStep({
44
- runtime,
45
- sandboxId,
46
- cmd: "mkdir",
47
- args: ["-p", workstation],
48
- });
49
- const fileName = file.contentDisposition ?? "";
50
- const fileExtension = fileName.includes(".") ? fileName.substring(fileName.lastIndexOf(".")) : "";
51
- const sandboxFilePath = `${workstation}/${fileId}${fileExtension}`;
52
- await writeDatasetSandboxFilesStep({
53
- runtime,
54
- sandboxId,
55
- files: [
56
- {
57
- path: sandboxFilePath,
58
- contentBase64: file.contentBase64,
59
- },
60
- ],
61
- });
62
- console.log(`[FileParseContext ${datasetId}] ✅ Workstation created: ${workstation}`);
63
- console.log(`[FileParseContext ${datasetId}] ✅ File saved: ${sandboxFilePath}`);
64
- state.filePath = sandboxFilePath;
65
- state.initialized = true;
66
- return sandboxFilePath;
67
- }
68
- /**
69
- * FileParseContext
70
- *
71
- * Uso:
72
- * - Crear una instancia con `fileId`, `instructions` y un `sandbox`
73
- * - Llamar `getDataset()` para crear un dataset nuevo (crea un datasetId interno)
74
- * - Llamar `followUp(datasetId, feedback)` para iterar el mismo dataset con feedback
75
- *
76
- * Internamente corre un Context (`createContext("file.parse")`) que itera hasta que se ejecuta el tool `completeDataset`.
77
- */
78
18
  function createFileParseContextDefinition(params) {
79
- const datasetId = params.datasetId ?? id();
19
+ const fallbackDatasetId = params.datasetId;
80
20
  const model = params.model ?? "openai/gpt-5";
81
21
  let contextBuilder = createContext("file.parse")
82
22
  .context(async (stored, _env, runtime) => {
83
23
  const previous = stored?.content ?? {};
84
24
  const sandboxState = previous?.sandboxState ?? { initialized: false, filePath: "" };
25
+ const datasetId = previous?.datasetId ?? fallbackDatasetId ?? "";
26
+ const fileId = previous?.fileId ?? params.fileId ?? "";
27
+ const instructions = previous?.instructions ?? params.instructions ?? "";
85
28
  const sandboxId = previous?.sandboxId ?? params.sandboxId ?? "";
29
+ if (!datasetId) {
30
+ throw new Error("dataset_id_required");
31
+ }
32
+ if (!fileId) {
33
+ throw new Error("dataset_file_id_required");
34
+ }
86
35
  if (!sandboxId) {
87
36
  throw new Error("dataset_sandbox_required");
88
37
  }
89
- const sandboxFilePath = await initializeSandbox(runtime, sandboxId, datasetId, params.fileId, sandboxState);
38
+ const initialized = await initializeFileParseSandboxStep({
39
+ runtime,
40
+ sandboxId,
41
+ datasetId,
42
+ fileId,
43
+ state: sandboxState,
44
+ });
45
+ const sandboxFilePath = initialized.filePath;
90
46
  let filePreview = undefined;
91
47
  try {
92
- filePreview = await generateFilePreview(runtime, sandboxId, sandboxFilePath, datasetId);
48
+ filePreview = await generateFileParsePreviewStep({
49
+ runtime,
50
+ sandboxId,
51
+ sandboxFilePath,
52
+ datasetId,
53
+ });
93
54
  }
94
55
  catch {
95
- // optional
56
+ // Preview is optional; parsing can still proceed from the file path.
96
57
  }
97
58
  let schema = null;
98
59
  const datasetResult = await datasetGetByIdStep({ runtime, datasetId });
99
- if (datasetResult.ok && datasetResult.data.schema)
60
+ if (datasetResult.ok && datasetResult.data.schema) {
100
61
  schema = datasetResult.data.schema;
62
+ }
101
63
  const ctx = {
102
64
  datasetId,
103
- fileId: params.fileId,
104
- instructions: params.instructions ?? "",
65
+ fileId,
66
+ instructions,
105
67
  sandboxConfig: { filePath: sandboxFilePath },
106
68
  analysis: [],
107
69
  schema,
@@ -114,16 +76,16 @@ function createFileParseContextDefinition(params) {
114
76
  return {
115
77
  ...previous,
116
78
  datasetId,
117
- fileId: params.fileId,
118
- instructions: params.instructions ?? "",
79
+ fileId,
80
+ instructions,
119
81
  sandboxId,
120
- sandboxState,
82
+ sandboxState: initialized.state,
121
83
  ctx,
122
84
  };
123
85
  })
124
86
  .narrative(async (stored) => {
125
87
  const ctx = stored?.content?.ctx;
126
- const base = buildFileDatasetPrompt(ctx);
88
+ const base = await buildFileDatasetPromptStep({ context: ctx });
127
89
  const userInstructions = String(ctx?.instructions ?? "").trim();
128
90
  if (!userInstructions)
129
91
  return base;
@@ -138,27 +100,36 @@ function createFileParseContextDefinition(params) {
138
100
  })
139
101
  .actions(async (_stored, _env, runtime) => {
140
102
  const existingSchema = _stored?.content?.ctx?.schema?.schema;
103
+ const datasetId = _stored?.content?.datasetId ?? fallbackDatasetId ?? "";
104
+ const fileId = _stored?.content?.fileId ?? params.fileId ?? "";
105
+ const sandboxId = _stored?.content?.sandboxId ?? params.sandboxId ?? "";
106
+ if (!datasetId)
107
+ throw new Error("dataset_id_required");
108
+ if (!fileId)
109
+ throw new Error("dataset_file_id_required");
110
+ if (!sandboxId)
111
+ throw new Error("dataset_sandbox_required");
141
112
  const actions = {
142
113
  executeCommand: createExecuteCommandTool({
143
114
  datasetId,
144
- sandboxId: _stored?.content?.sandboxId ?? params.sandboxId ?? "",
115
+ sandboxId,
145
116
  runtime,
146
117
  }),
147
118
  completeDataset: createCompleteDatasetTool({
148
119
  datasetId,
149
- sandboxId: _stored?.content?.sandboxId ?? params.sandboxId ?? "",
120
+ sandboxId,
150
121
  runtime,
151
122
  }),
152
123
  clearDataset: createClearDatasetTool({
153
124
  datasetId,
154
- sandboxId: _stored?.content?.sandboxId ?? params.sandboxId ?? "",
125
+ sandboxId,
155
126
  runtime,
156
127
  }),
157
128
  };
158
129
  if (!existingSchema) {
159
130
  actions.generateSchema = createGenerateSchemaTool({
160
131
  datasetId,
161
- fileId: params.fileId,
132
+ fileId,
162
133
  runtime,
163
134
  });
164
135
  }
@@ -174,28 +145,19 @@ function createFileParseContextDefinition(params) {
174
145
  contextBuilder = contextBuilder.model(model);
175
146
  }
176
147
  const context = contextBuilder.build();
177
- return { datasetId, context };
148
+ return { datasetId: fallbackDatasetId ?? "", context };
178
149
  }
179
- /**
180
- * Factory (DX-first):
181
- *
182
- * Usage:
183
- * const { datasetId } = await createFileParseContext(fileId, { instructions }).parse(runtime)
184
- *
185
- * - Uses the caller runtime; no secondary runtime is created.
186
- * - All I/O happens in `"use step"` functions via the provided Ekairos runtime.
187
- * - `parse()` is the entrypoint; it calls `context.react(...)` internally.
188
- */
189
150
  export function createFileParseContext(fileId, opts) {
151
+ const datasetId = opts?.datasetId ?? id();
190
152
  const params = {
191
153
  fileId,
192
154
  instructions: opts?.instructions,
193
155
  sandboxId: opts?.sandboxId,
194
- datasetId: opts?.datasetId,
156
+ datasetId,
195
157
  model: opts?.model,
196
158
  reactor: opts?.reactor,
197
159
  };
198
- const { datasetId, context } = createFileParseContextDefinition(params);
160
+ const { context } = createFileParseContextDefinition(params);
199
161
  return {
200
162
  datasetId,
201
163
  async parse(runtime, options = {}) {
@@ -205,19 +167,43 @@ export function createFileParseContext(fileId, opts) {
205
167
  channel: WEB_CHANNEL,
206
168
  createdAt: new Date().toISOString(),
207
169
  content: {
208
- parts: [{ type: "text", text: options.prompt ?? "generate a dataset for this file" }],
170
+ parts: [
171
+ {
172
+ type: "text",
173
+ text: options.prompt ?? "generate a dataset for this file",
174
+ },
175
+ ],
209
176
  },
210
177
  };
211
178
  const shell = await context.react(triggerEvent, {
212
179
  runtime: runtime,
213
180
  context: { key: `dataset:${datasetId}` },
214
181
  durable: options.durable ?? false,
215
- options: { silent: true, preventClose: true, sendFinish: false, maxIterations: 20, maxModelSteps: 5 },
182
+ options: {
183
+ silent: true,
184
+ preventClose: true,
185
+ sendFinish: false,
186
+ maxIterations: 20,
187
+ maxModelSteps: 5,
188
+ },
189
+ __initialContent: {
190
+ datasetId,
191
+ fileId,
192
+ instructions: opts?.instructions ?? "",
193
+ sandboxId: opts?.sandboxId ?? "",
194
+ sandboxState: { initialized: false, filePath: "" },
195
+ },
216
196
  });
217
197
  await awaitContextRun(shell.run);
218
198
  return { datasetId };
219
199
  },
220
- // Optional: expose the built context for advanced callers (not required for parse DX)
221
200
  context,
222
201
  };
223
202
  }
203
+ export function registerFileParseContext(opts) {
204
+ createFileParseContextDefinition({
205
+ model: opts?.model,
206
+ reactor: opts?.reactor,
207
+ }).context;
208
+ }
209
+ registerFileParseContext();
@@ -0,0 +1,21 @@
1
+ import type { FileParseContext, SandboxState } from "./file-dataset.types.js";
2
+ import type { FilePreviewContext } from "./filepreview.types.js";
3
+ export declare function initializeFileParseSandboxStep(params: {
4
+ runtime: any;
5
+ sandboxId: string;
6
+ datasetId: string;
7
+ fileId: string;
8
+ state: SandboxState;
9
+ }): Promise<{
10
+ filePath: string;
11
+ state: SandboxState;
12
+ }>;
13
+ export declare function generateFileParsePreviewStep(params: {
14
+ runtime: any;
15
+ sandboxId: string;
16
+ sandboxFilePath: string;
17
+ datasetId: string;
18
+ }): Promise<FilePreviewContext>;
19
+ export declare function buildFileDatasetPromptStep(params: {
20
+ context: FileParseContext;
21
+ }): Promise<string>;
@@ -0,0 +1,62 @@
1
+ import { getDatasetWorkstation } from "../datasetFiles.js";
2
+ import { runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep } from "../sandbox/steps.js";
3
+ import { buildFileDatasetPrompt } from "./prompts.js";
4
+ import { generateFilePreview, ensurePreviewScriptsAvailable } from "./filepreview.js";
5
+ import { readInstantFileStep } from "./steps.js";
6
+ export async function initializeFileParseSandboxStep(params) {
7
+ "use step";
8
+ if (params.state.initialized) {
9
+ return { filePath: params.state.filePath, state: params.state };
10
+ }
11
+ console.log(`[FileParseContext ${params.datasetId}] Initializing sandbox...`);
12
+ await ensurePreviewScriptsAvailable(params.runtime, params.sandboxId);
13
+ console.log(`[FileParseContext ${params.datasetId}] Installing Python dependencies...`);
14
+ const pipInstall = await runDatasetSandboxCommandStep({
15
+ runtime: params.runtime,
16
+ sandboxId: params.sandboxId,
17
+ cmd: "python",
18
+ args: ["-m", "pip", "install", "pandas", "openpyxl", "--quiet", "--upgrade"],
19
+ });
20
+ const installStderr = pipInstall.stderr;
21
+ if (installStderr && (installStderr.includes("ERROR") || installStderr.includes("FAILED"))) {
22
+ throw new Error(`pip install failed: ${installStderr.substring(0, 300)}`);
23
+ }
24
+ console.log(`[FileParseContext ${params.datasetId}] Fetching file from InstantDB...`);
25
+ const file = await readInstantFileStep({ runtime: params.runtime, fileId: params.fileId });
26
+ console.log(`[FileParseContext ${params.datasetId}] Creating dataset workstation...`);
27
+ const workstation = getDatasetWorkstation(params.datasetId);
28
+ await runDatasetSandboxCommandStep({
29
+ runtime: params.runtime,
30
+ sandboxId: params.sandboxId,
31
+ cmd: "mkdir",
32
+ args: ["-p", workstation],
33
+ });
34
+ const fileName = file.contentDisposition ?? "";
35
+ const fileExtension = fileName.includes(".") ? fileName.substring(fileName.lastIndexOf(".")) : "";
36
+ const sandboxFilePath = `${workstation}/${params.fileId}${fileExtension}`;
37
+ await writeDatasetSandboxFilesStep({
38
+ runtime: params.runtime,
39
+ sandboxId: params.sandboxId,
40
+ files: [
41
+ {
42
+ path: sandboxFilePath,
43
+ contentBase64: file.contentBase64,
44
+ },
45
+ ],
46
+ });
47
+ console.log(`[FileParseContext ${params.datasetId}] Workstation created: ${workstation}`);
48
+ console.log(`[FileParseContext ${params.datasetId}] File saved: ${sandboxFilePath}`);
49
+ const state = {
50
+ initialized: true,
51
+ filePath: sandboxFilePath,
52
+ };
53
+ return { filePath: sandboxFilePath, state };
54
+ }
55
+ export async function generateFileParsePreviewStep(params) {
56
+ "use step";
57
+ return await generateFilePreview(params.runtime, params.sandboxId, params.sandboxFilePath, params.datasetId);
58
+ }
59
+ export async function buildFileDatasetPromptStep(params) {
60
+ "use step";
61
+ return buildFileDatasetPrompt(params.context);
62
+ }
@@ -0,0 +1,50 @@
1
+ import type { ContextReactor } from "@ekairos/events";
2
+ import type { FilePreviewContext } from "./filepreview.types.js";
3
+ export type SandboxState = {
4
+ initialized: boolean;
5
+ filePath: string;
6
+ };
7
+ export type FileParseContext = {
8
+ datasetId: string;
9
+ fileId: string;
10
+ instructions: string;
11
+ sandboxConfig: {
12
+ filePath: string;
13
+ };
14
+ analysis: any[];
15
+ schema: any | null;
16
+ plan: any | null;
17
+ executionResult: any | null;
18
+ errors: string[];
19
+ iterationCount: number;
20
+ filePreview?: FilePreviewContext;
21
+ };
22
+ export type FileParseContextParams = {
23
+ fileId?: string;
24
+ instructions?: string;
25
+ sandboxId?: string;
26
+ datasetId?: string;
27
+ model?: string;
28
+ reactor?: ContextReactor<any, any>;
29
+ };
30
+ export type FileParseRunOptions = {
31
+ prompt?: string;
32
+ durable?: boolean;
33
+ };
34
+ export type FileParseContextBuilder<Env extends {
35
+ orgId: string;
36
+ }> = {
37
+ datasetId: string;
38
+ context: any;
39
+ };
40
+ export type DatasetResult = {
41
+ id: string;
42
+ status?: string;
43
+ title?: string;
44
+ schema?: any;
45
+ analysis?: any;
46
+ calculatedTotalRows?: number;
47
+ actualGeneratedRowCount?: number;
48
+ createdAt?: number;
49
+ updatedAt?: number;
50
+ };
@@ -0,0 +1 @@
1
+ export {};
@@ -1,34 +1,5 @@
1
- export type FilePreviewContext = {
2
- totalRows: number;
3
- metadata?: {
4
- description: string;
5
- script: string;
6
- command: string;
7
- stdout: string;
8
- stderr: string;
9
- };
10
- head?: {
11
- description: string;
12
- script: string;
13
- command: string;
14
- stdout: string;
15
- stderr: string;
16
- };
17
- tail?: {
18
- description: string;
19
- script: string;
20
- command: string;
21
- stdout: string;
22
- stderr: string;
23
- };
24
- mid?: {
25
- description: string;
26
- script: string;
27
- command: string;
28
- stdout: string;
29
- stderr: string;
30
- };
31
- };
1
+ import type { FilePreviewContext } from "./filepreview.types.js";
2
+ export type { FilePreviewContext } from "./filepreview.types.js";
32
3
  interface PreviewOptions {
33
4
  headLines?: number;
34
5
  tailLines?: number;
@@ -36,4 +7,3 @@ interface PreviewOptions {
36
7
  }
37
8
  export declare function ensurePreviewScriptsAvailable(runtime: any, sandboxId: string): Promise<void>;
38
9
  export declare function generateFilePreview(runtime: any, sandboxId: string, sandboxFilePath: string, datasetId: string, options?: PreviewOptions): Promise<FilePreviewContext>;
39
- export {};
@@ -0,0 +1,31 @@
1
+ export type FilePreviewContext = {
2
+ totalRows: number;
3
+ metadata?: {
4
+ description: string;
5
+ script: string;
6
+ command: string;
7
+ stdout: string;
8
+ stderr: string;
9
+ };
10
+ head?: {
11
+ description: string;
12
+ script: string;
13
+ command: string;
14
+ stdout: string;
15
+ stderr: string;
16
+ };
17
+ tail?: {
18
+ description: string;
19
+ script: string;
20
+ command: string;
21
+ stdout: string;
22
+ stderr: string;
23
+ };
24
+ mid?: {
25
+ description: string;
26
+ script: string;
27
+ command: string;
28
+ stdout: string;
29
+ stderr: string;
30
+ };
31
+ };
@@ -0,0 +1 @@
1
+ export {};
@@ -1,2 +1,2 @@
1
- import { FileParseContext } from "./file-dataset.agent.js";
1
+ import type { FileParseContext } from "./file-dataset.types.js";
2
2
  export declare function buildFileDatasetPrompt(context: FileParseContext): string;
package/dist/index.d.ts CHANGED
@@ -1,3 +1,4 @@
1
+ import "./builder/materialize.js";
1
2
  export * from "./dataset.js";
2
3
  export * from "./domain.js";
3
4
  export * from "./materializeDataset.tool.js";
package/dist/index.js CHANGED
@@ -1,3 +1,4 @@
1
+ import "./builder/materialize.js";
1
2
  export * from "./dataset.js";
2
3
  export * from "./domain.js";
3
4
  export * from "./materializeDataset.tool.js";
@@ -1,2 +1,2 @@
1
- export { createTransformDatasetContext, type TransformDatasetAgentParams, type TransformDatasetContext, } from "./transform-dataset.agent.js";
1
+ export { createTransformDatasetContext, registerTransformDatasetContext, type TransformDatasetAgentParams, type TransformDatasetContext, type TransformDatasetRunOptions, } from "./transform-dataset.agent.js";
2
2
  export { transformDataset, type TransformDatasetInput, type TransformDatasetResult, } from "./transformDataset.js";
@@ -1,2 +1,2 @@
1
- export { createTransformDatasetContext, } from "./transform-dataset.agent.js";
1
+ export { createTransformDatasetContext, registerTransformDatasetContext, } from "./transform-dataset.agent.js";
2
2
  export { transformDataset, } from "./transformDataset.js";
@@ -1,34 +1,2 @@
1
- export type TransformPromptContext = {
2
- datasetId: string;
3
- sourceDatasetIds: string[];
4
- outputSchema: any;
5
- sandboxConfig: {
6
- sourcePaths: Array<{
7
- datasetId: string;
8
- path: string;
9
- }>;
10
- outputPath: string;
11
- };
12
- sourcePreviews?: Array<{
13
- datasetId: string;
14
- preview: {
15
- totalRows: number;
16
- metadata?: {
17
- description: string;
18
- script: string;
19
- command: string;
20
- stdout: string;
21
- stderr: string;
22
- };
23
- head?: {
24
- description: string;
25
- script: string;
26
- command: string;
27
- stdout: string;
28
- stderr: string;
29
- };
30
- };
31
- }>;
32
- errors: string[];
33
- };
1
+ import type { TransformPromptContext } from "./transform-dataset.types.js";
34
2
  export declare function buildTransformDatasetPrompt(context: TransformPromptContext): string;
@@ -1,48 +1,6 @@
1
1
  import { type ContextReactor } from "@ekairos/events";
2
- import { TransformSourcePreviewContext } from "./filepreview.js";
3
- export type TransformDatasetContext = {
4
- datasetId: string;
5
- sourceDatasetIds: string[];
6
- outputSchema: any;
7
- sandboxConfig: {
8
- sourcePaths: Array<{
9
- datasetId: string;
10
- path: string;
11
- }>;
12
- outputPath: string;
13
- };
14
- sourcePreviews?: Array<{
15
- datasetId: string;
16
- preview: TransformSourcePreviewContext;
17
- }>;
18
- errors: string[];
19
- iterationCount: number;
20
- instructions?: string;
21
- };
22
- export type TransformDatasetAgentParams = {
23
- sourceDatasetIds: string[];
24
- outputSchema: any;
25
- instructions?: string;
26
- datasetId?: string;
27
- model?: string;
28
- sandboxId?: string;
29
- reactor?: ContextReactor<any, any>;
30
- };
31
- export type TransformDatasetRunOptions = {
32
- prompt?: string;
33
- durable?: boolean;
34
- };
35
- export type TransformDatasetResult = {
36
- id: string;
37
- status?: string;
38
- title?: string;
39
- schema?: any;
40
- analysis?: any;
41
- calculatedTotalRows?: number;
42
- actualGeneratedRowCount?: number;
43
- createdAt?: number;
44
- updatedAt?: number;
45
- };
2
+ import type { TransformDatasetRunOptions } from "./transform-dataset.types.js";
3
+ export type { TransformDatasetAgentParams, TransformDatasetContext, TransformDatasetResult, TransformDatasetRunOptions, TransformPromptContext, TransformSandboxState, } from "./transform-dataset.types.js";
46
4
  export declare function createTransformDatasetContext<Env extends {
47
5
  orgId: string;
48
6
  }>(params: {
@@ -62,3 +20,9 @@ export declare function createTransformDatasetContext<Env extends {
62
20
  }>;
63
21
  context: any;
64
22
  };
23
+ export declare function registerTransformDatasetContext<Env extends {
24
+ orgId: string;
25
+ }>(opts?: {
26
+ model?: string;
27
+ reactor?: ContextReactor<any, any>;
28
+ }): void;
@@ -1,13 +1,10 @@
1
- import { createContext, INPUT_TEXT_ITEM_TYPE, WEB_CHANNEL } from "@ekairos/events";
2
- import { createCompleteDatasetTool, didCompleteDatasetSucceed } from "../completeDataset.tool.js";
3
- import { createExecuteCommandTool } from "../executeCommand.tool.js";
4
- import { createClearDatasetTool } from "../clearDataset.tool.js";
5
- import { buildTransformDatasetPrompt } from "./prompts.js";
6
- import { getDatasetWorkstation, getDatasetOutputPath } from "../datasetFiles.js";
1
+ import { createContext, INPUT_TEXT_ITEM_TYPE, WEB_CHANNEL, } from "@ekairos/events";
7
2
  import { id } from "@instantdb/admin";
8
- import { generateSourcePreview } from "./filepreview.js";
9
- import { datasetReadOutputJsonlStep, datasetUpdateSchemaStep } from "../dataset/steps.js";
10
- import { runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep } from "../sandbox/steps.js";
3
+ import { createClearDatasetTool } from "../clearDataset.tool.js";
4
+ import { createCompleteDatasetTool, didCompleteDatasetSucceed, } from "../completeDataset.tool.js";
5
+ import { datasetUpdateSchemaStep } from "../dataset/steps.js";
6
+ import { createExecuteCommandTool } from "../executeCommand.tool.js";
7
+ import { buildTransformDatasetPromptStep, ensureTransformSourcesInSandboxStep, generateTransformSourcePreviewsStep, } from "./transform-dataset.steps.js";
11
8
  async function awaitContextRun(run) {
12
9
  if (!run)
13
10
  return;
@@ -17,66 +14,68 @@ async function awaitContextRun(run) {
17
14
  }
18
15
  await run;
19
16
  }
20
- async function ensureSourcesInSandbox(runtime, sandboxId, datasetId, sourceDatasetIds, state) {
21
- if (state.initialized) {
22
- return { sourcePaths: state.sourcePaths, outputPath: getDatasetOutputPath(datasetId) };
23
- }
24
- const workstation = getDatasetWorkstation(datasetId);
25
- await runDatasetSandboxCommandStep({ runtime, sandboxId, cmd: "mkdir", args: ["-p", workstation] });
26
- const sourcePaths = [];
27
- for (const sourceDatasetId of sourceDatasetIds) {
28
- const sourcePath = `${workstation}/source_${sourceDatasetId}.jsonl`;
29
- const source = await datasetReadOutputJsonlStep({ runtime, datasetId: sourceDatasetId });
30
- await writeDatasetSandboxFilesStep({
31
- runtime,
32
- sandboxId,
33
- files: [{ path: sourcePath, contentBase64: source.contentBase64 }],
34
- });
35
- sourcePaths.push({ datasetId: sourceDatasetId, path: sourcePath });
36
- }
37
- state.sourcePaths = sourcePaths;
38
- state.initialized = true;
39
- return { sourcePaths, outputPath: getDatasetOutputPath(datasetId) };
40
- }
41
17
  function createTransformDatasetContextDefinition(params) {
42
- const datasetId = params.datasetId ?? id();
18
+ const fallbackDatasetId = params.datasetId;
43
19
  const model = params.model ?? "openai/gpt-5";
44
20
  let contextBuilder = createContext("dataset.transform")
45
21
  .context(async (stored, _env, runtime) => {
46
22
  const previous = stored?.content ?? {};
47
23
  const sandboxState = previous?.sandboxState ?? { initialized: false, sourcePaths: [] };
24
+ const datasetId = previous?.datasetId ?? fallbackDatasetId ?? "";
25
+ const sourceDatasetIds = Array.isArray(previous?.sourceDatasetIds)
26
+ ? previous.sourceDatasetIds
27
+ : Array.isArray(params.sourceDatasetIds)
28
+ ? params.sourceDatasetIds
29
+ : [];
30
+ const outputSchema = previous?.outputSchema ?? params.outputSchema;
31
+ const instructions = previous?.instructions ?? params.instructions;
48
32
  const sandboxId = previous?.sandboxId ?? params.sandboxId ?? "";
33
+ if (!datasetId) {
34
+ throw new Error("dataset_id_required");
35
+ }
36
+ if (sourceDatasetIds.length === 0) {
37
+ throw new Error("dataset_transform_sources_required");
38
+ }
39
+ if (!outputSchema) {
40
+ throw new Error("dataset_transform_schema_required");
41
+ }
49
42
  if (!sandboxId) {
50
43
  throw new Error("dataset_sandbox_required");
51
44
  }
52
- const { sourcePaths, outputPath } = await ensureSourcesInSandbox(runtime, sandboxId, datasetId, params.sourceDatasetIds, sandboxState);
53
- const sourcePreviews = [];
54
- for (const sp of sourcePaths) {
55
- try {
56
- const preview = await generateSourcePreview(runtime, sandboxId, sp.path, datasetId);
57
- sourcePreviews.push({ datasetId: sp.datasetId, preview });
58
- }
59
- catch {
60
- // optional
61
- }
62
- }
63
- // Persist output schema on the dataset record (so completeDataset validates against it)
45
+ const initialized = await ensureTransformSourcesInSandboxStep({
46
+ runtime,
47
+ sandboxId,
48
+ datasetId,
49
+ sourceDatasetIds,
50
+ state: sandboxState,
51
+ });
52
+ const sourcePreviews = await generateTransformSourcePreviewsStep({
53
+ runtime,
54
+ sandboxId,
55
+ datasetId,
56
+ sourcePaths: initialized.sourcePaths,
57
+ });
64
58
  await datasetUpdateSchemaStep({
65
59
  runtime,
66
60
  datasetId,
67
- schema: params.outputSchema,
61
+ schema: outputSchema,
68
62
  status: "schema_complete",
69
63
  });
70
64
  const promptContext = {
71
65
  datasetId,
72
- sourceDatasetIds: params.sourceDatasetIds,
73
- outputSchema: params.outputSchema,
74
- sandboxConfig: { sourcePaths, outputPath },
66
+ sourceDatasetIds,
67
+ outputSchema,
68
+ sandboxConfig: {
69
+ sourcePaths: initialized.sourcePaths,
70
+ outputPath: initialized.outputPath,
71
+ },
75
72
  sourcePreviews: sourcePreviews.length > 0 ? sourcePreviews : undefined,
76
73
  errors: [],
77
74
  };
78
- const basePrompt = buildTransformDatasetPrompt(promptContext);
79
- const userInstructions = String(params.instructions ?? "").trim();
75
+ const basePrompt = await buildTransformDatasetPromptStep({
76
+ context: promptContext,
77
+ });
78
+ const userInstructions = String(instructions ?? "").trim();
80
79
  const system = userInstructions
81
80
  ? [
82
81
  "## USER INSTRUCTIONS",
@@ -90,17 +89,28 @@ function createTransformDatasetContextDefinition(params) {
90
89
  return {
91
90
  ...previous,
92
91
  datasetId,
92
+ sourceDatasetIds,
93
+ outputSchema,
94
+ instructions,
93
95
  sandboxId,
94
- sandboxState,
96
+ sandboxState: initialized.state,
95
97
  system,
96
- sandboxConfig: { sourcePaths, outputPath },
98
+ sandboxConfig: {
99
+ sourcePaths: initialized.sourcePaths,
100
+ outputPath: initialized.outputPath,
101
+ },
97
102
  };
98
103
  })
99
104
  .narrative(async (stored) => {
100
105
  return String(stored?.content?.system ?? "");
101
106
  })
102
107
  .actions(async (stored, _env, runtime) => {
108
+ const datasetId = stored?.content?.datasetId ?? fallbackDatasetId ?? "";
103
109
  const sandboxId = stored?.content?.sandboxId ?? params.sandboxId ?? "";
110
+ if (!datasetId)
111
+ throw new Error("dataset_id_required");
112
+ if (!sandboxId)
113
+ throw new Error("dataset_sandbox_required");
104
114
  return {
105
115
  executeCommand: createExecuteCommandTool({
106
116
  datasetId,
@@ -129,14 +139,15 @@ function createTransformDatasetContextDefinition(params) {
129
139
  contextBuilder = contextBuilder.model(model);
130
140
  }
131
141
  const context = contextBuilder.build();
132
- return { datasetId, context };
142
+ return { datasetId: fallbackDatasetId ?? "", context };
133
143
  }
134
144
  export function createTransformDatasetContext(params) {
135
- const { datasetId, context } = createTransformDatasetContextDefinition({
145
+ const datasetId = params.datasetId ?? id();
146
+ const { context } = createTransformDatasetContextDefinition({
136
147
  sourceDatasetIds: params.sourceDatasetIds,
137
148
  outputSchema: params.outputSchema,
138
149
  instructions: params.instructions,
139
- datasetId: params.datasetId,
150
+ datasetId,
140
151
  model: params.model,
141
152
  sandboxId: params.sandboxId,
142
153
  reactor: params.reactor,
@@ -166,7 +177,21 @@ export function createTransformDatasetContext(params) {
166
177
  runtime: runtime,
167
178
  context: { key: `dataset:${datasetId}` },
168
179
  durable: options.durable ?? false,
169
- options: { silent: true, preventClose: true, sendFinish: false, maxIterations: 20, maxModelSteps: 5 },
180
+ options: {
181
+ silent: true,
182
+ preventClose: true,
183
+ sendFinish: false,
184
+ maxIterations: 20,
185
+ maxModelSteps: 5,
186
+ },
187
+ __initialContent: {
188
+ datasetId,
189
+ sourceDatasetIds: params.sourceDatasetIds,
190
+ outputSchema: params.outputSchema,
191
+ instructions: params.instructions,
192
+ sandboxId: params.sandboxId ?? "",
193
+ sandboxState: { initialized: false, sourcePaths: [] },
194
+ },
170
195
  });
171
196
  await awaitContextRun(shell.run);
172
197
  return { datasetId };
@@ -174,3 +199,10 @@ export function createTransformDatasetContext(params) {
174
199
  context,
175
200
  };
176
201
  }
202
+ export function registerTransformDatasetContext(opts) {
203
+ createTransformDatasetContextDefinition({
204
+ model: opts?.model,
205
+ reactor: opts?.reactor,
206
+ }).context;
207
+ }
208
+ registerTransformDatasetContext();
@@ -0,0 +1,30 @@
1
+ import type { TransformPromptContext, TransformSandboxState, TransformSourcePreviewContext } from "./transform-dataset.types.js";
2
+ export declare function ensureTransformSourcesInSandboxStep(params: {
3
+ runtime: any;
4
+ sandboxId: string;
5
+ datasetId: string;
6
+ sourceDatasetIds: string[];
7
+ state: TransformSandboxState;
8
+ }): Promise<{
9
+ sourcePaths: Array<{
10
+ datasetId: string;
11
+ path: string;
12
+ }>;
13
+ outputPath: string;
14
+ state: TransformSandboxState;
15
+ }>;
16
+ export declare function generateTransformSourcePreviewsStep(params: {
17
+ runtime: any;
18
+ sandboxId: string;
19
+ datasetId: string;
20
+ sourcePaths: Array<{
21
+ datasetId: string;
22
+ path: string;
23
+ }>;
24
+ }): Promise<Array<{
25
+ datasetId: string;
26
+ preview: TransformSourcePreviewContext;
27
+ }>>;
28
+ export declare function buildTransformDatasetPromptStep(params: {
29
+ context: TransformPromptContext;
30
+ }): Promise<string>;
@@ -0,0 +1,62 @@
1
+ import { getDatasetOutputPath, getDatasetWorkstation } from "../datasetFiles.js";
2
+ import { datasetReadOutputJsonlStep } from "../dataset/steps.js";
3
+ import { runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep } from "../sandbox/steps.js";
4
+ import { generateSourcePreview } from "./filepreview.js";
5
+ import { buildTransformDatasetPrompt } from "./prompts.js";
6
+ export async function ensureTransformSourcesInSandboxStep(params) {
7
+ "use step";
8
+ if (params.state.initialized) {
9
+ return {
10
+ sourcePaths: params.state.sourcePaths,
11
+ outputPath: getDatasetOutputPath(params.datasetId),
12
+ state: params.state,
13
+ };
14
+ }
15
+ const workstation = getDatasetWorkstation(params.datasetId);
16
+ await runDatasetSandboxCommandStep({
17
+ runtime: params.runtime,
18
+ sandboxId: params.sandboxId,
19
+ cmd: "mkdir",
20
+ args: ["-p", workstation],
21
+ });
22
+ const sourcePaths = [];
23
+ for (const sourceDatasetId of params.sourceDatasetIds) {
24
+ const sourcePath = `${workstation}/source_${sourceDatasetId}.jsonl`;
25
+ const source = await datasetReadOutputJsonlStep({
26
+ runtime: params.runtime,
27
+ datasetId: sourceDatasetId,
28
+ });
29
+ await writeDatasetSandboxFilesStep({
30
+ runtime: params.runtime,
31
+ sandboxId: params.sandboxId,
32
+ files: [{ path: sourcePath, contentBase64: source.contentBase64 }],
33
+ });
34
+ sourcePaths.push({ datasetId: sourceDatasetId, path: sourcePath });
35
+ }
36
+ return {
37
+ sourcePaths,
38
+ outputPath: getDatasetOutputPath(params.datasetId),
39
+ state: {
40
+ initialized: true,
41
+ sourcePaths,
42
+ },
43
+ };
44
+ }
45
+ export async function generateTransformSourcePreviewsStep(params) {
46
+ "use step";
47
+ const sourcePreviews = [];
48
+ for (const sourcePath of params.sourcePaths) {
49
+ try {
50
+ const preview = await generateSourcePreview(params.runtime, params.sandboxId, sourcePath.path, params.datasetId);
51
+ sourcePreviews.push({ datasetId: sourcePath.datasetId, preview });
52
+ }
53
+ catch {
54
+ // Source preview is optional; transformation can still read the JSONL files.
55
+ }
56
+ }
57
+ return sourcePreviews;
58
+ }
59
+ export async function buildTransformDatasetPromptStep(params) {
60
+ "use step";
61
+ return buildTransformDatasetPrompt(params.context);
62
+ }
@@ -0,0 +1,86 @@
1
+ import type { ContextReactor } from "@ekairos/events";
2
+ import type { TransformSourcePreviewContext } from "./filepreview.js";
3
+ export type { TransformSourcePreviewContext } from "./filepreview.js";
4
+ export type TransformSandboxState = {
5
+ initialized: boolean;
6
+ sourcePaths: Array<{
7
+ datasetId: string;
8
+ path: string;
9
+ }>;
10
+ };
11
+ export type TransformDatasetContext = {
12
+ datasetId: string;
13
+ sourceDatasetIds: string[];
14
+ outputSchema: any;
15
+ sandboxConfig: {
16
+ sourcePaths: Array<{
17
+ datasetId: string;
18
+ path: string;
19
+ }>;
20
+ outputPath: string;
21
+ };
22
+ sourcePreviews?: Array<{
23
+ datasetId: string;
24
+ preview: TransformSourcePreviewContext;
25
+ }>;
26
+ errors: string[];
27
+ iterationCount: number;
28
+ instructions?: string;
29
+ };
30
+ export type TransformDatasetAgentParams = {
31
+ sourceDatasetIds?: string[];
32
+ outputSchema?: any;
33
+ instructions?: string;
34
+ datasetId?: string;
35
+ model?: string;
36
+ sandboxId?: string;
37
+ reactor?: ContextReactor<any, any>;
38
+ };
39
+ export type TransformDatasetRunOptions = {
40
+ prompt?: string;
41
+ durable?: boolean;
42
+ };
43
+ export type TransformDatasetResult = {
44
+ id: string;
45
+ status?: string;
46
+ title?: string;
47
+ schema?: any;
48
+ analysis?: any;
49
+ calculatedTotalRows?: number;
50
+ actualGeneratedRowCount?: number;
51
+ createdAt?: number;
52
+ updatedAt?: number;
53
+ };
54
+ export type TransformPromptContext = {
55
+ datasetId: string;
56
+ sourceDatasetIds: string[];
57
+ outputSchema: any;
58
+ sandboxConfig: {
59
+ sourcePaths: Array<{
60
+ datasetId: string;
61
+ path: string;
62
+ }>;
63
+ outputPath: string;
64
+ };
65
+ sourcePreviews?: Array<{
66
+ datasetId: string;
67
+ preview: {
68
+ totalRows: number;
69
+ metadata?: {
70
+ description: string;
71
+ script: string;
72
+ command: string;
73
+ stdout: string;
74
+ stderr: string;
75
+ };
76
+ head?: {
77
+ description: string;
78
+ script: string;
79
+ command: string;
80
+ stdout: string;
81
+ stderr: string;
82
+ };
83
+ };
84
+ }>;
85
+ errors: string[];
86
+ };
@@ -0,0 +1 @@
1
+ export {};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ekairos/dataset",
3
- "version": "1.22.54-beta.development.0",
3
+ "version": "1.22.56-beta.development.0",
4
4
  "description": "Pulzar Dataset Tools",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -65,9 +65,9 @@
65
65
  "test:ai-sdk:instant": "vitest run -c vitest.codex.config.mts src/tests/materializeDataset.ai-sdk.instant.test.ts"
66
66
  },
67
67
  "dependencies": {
68
- "@ekairos/domain": "^1.22.54-beta.development.0",
69
- "@ekairos/events": "^1.22.54-beta.development.0",
70
- "@ekairos/sandbox": "^1.22.54-beta.development.0",
68
+ "@ekairos/domain": "^1.22.56-beta.development.0",
69
+ "@ekairos/events": "^1.22.56-beta.development.0",
70
+ "@ekairos/sandbox": "^1.22.56-beta.development.0",
71
71
  "@instantdb/admin": "0.22.158",
72
72
  "@instantdb/core": "0.22.142",
73
73
  "ai": "^5.0.44",