@ekairos/dataset 1.22.78-beta.development.0 → 1.22.80-beta.development.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,19 +1,6 @@
1
1
  import { tool } from "ai";
2
2
  import { z } from "zod";
3
- import { readDatasetSandboxFileStep, readDatasetSandboxTextFileStep, runDatasetSandboxCommandStep } from "./sandbox/steps.js";
4
- import Ajv from "ajv";
5
- import { getDatasetOutputPath, } from "./datasetFiles.js";
6
- import { datasetGetByIdStep, datasetUpdateStatusStep, datasetUploadOutputFileStep } from "./dataset/steps.js";
7
- let ajvInstance = null;
8
- function getAjv() {
9
- if (!ajvInstance) {
10
- ajvInstance = new Ajv({
11
- allErrors: true,
12
- strict: false,
13
- });
14
- }
15
- return ajvInstance;
16
- }
3
+ import { persistDatasetStep } from "./completeDataset.steps.js";
17
4
  export function createCompleteDatasetTool({ datasetId, sandboxId, runtime }) {
18
5
  return tool({
19
6
  description: "Mark the dataset as completed. Use only when output.jsonl has been successfully generated and is ready for validation.",
@@ -25,143 +12,12 @@ export function createCompleteDatasetTool({ datasetId, sandboxId, runtime }) {
25
12
  console.log(`[Dataset ${datasetId}] Tool: completeDataset`);
26
13
  console.log(`[Dataset ${datasetId}] Summary: ${summary}`);
27
14
  console.log(`[Dataset ${datasetId}] ========================================`);
28
- const outputPath = getDatasetOutputPath(datasetId);
29
- try {
30
- await ensureFileExists(runtime, sandboxId, outputPath);
31
- }
32
- catch (error) {
33
- const message = error instanceof Error ? error.message : String(error);
34
- console.error(`[Dataset ${datasetId}] Missing output file:`, message);
35
- return {
36
- success: false,
37
- status: "missing_output",
38
- validRows: 0,
39
- rowRecordCount: 0,
40
- validation: [],
41
- error: message,
42
- message,
43
- };
44
- }
45
- console.log(`[Dataset ${datasetId}] Validating dataset rows against schema`);
46
- const datasetResult = await datasetGetByIdStep({ runtime, datasetId });
47
- if (!datasetResult.ok) {
48
- console.error(`[Dataset ${datasetId}] ${datasetResult.error}`);
49
- return {
50
- success: false,
51
- status: "dataset_not_found",
52
- validRows: 0,
53
- rowRecordCount: 0,
54
- validation: [],
55
- error: datasetResult.error,
56
- message: datasetResult.error,
57
- };
58
- }
59
- const datasetRecord = datasetResult.data;
60
- if (!datasetRecord.schema) {
61
- console.error(`[Dataset ${datasetId}] Schema not found in database`);
62
- return {
63
- success: false,
64
- status: "schema_missing",
65
- validRows: 0,
66
- rowRecordCount: 0,
67
- validation: [],
68
- error: "Schema not found in database. Please generate schema first.",
69
- message: "Schema not found in database. Please generate schema first.",
70
- };
71
- }
72
- const schemaJson = datasetRecord.schema.schema;
73
- let validator;
74
- try {
75
- validator = getAjv().compile(schemaJson);
76
- }
77
- catch (error) {
78
- const message = error instanceof Error ? error.message : String(error);
79
- console.error(`[Dataset ${datasetId}] Failed to compile schema:`, message);
80
- return {
81
- success: false,
82
- status: "schema_invalid",
83
- validRows: 0,
84
- rowRecordCount: 0,
85
- validation: [],
86
- error: `Failed to compile schema: ${message}`,
87
- message: `Failed to compile schema: ${message}`,
88
- };
89
- }
90
- const validationResult = await validateJsonlRows({
91
- runtime,
92
- sandboxId,
93
- outputPath,
94
- validator,
95
- datasetId,
96
- });
97
- if (!validationResult.success) {
98
- return validationResult;
99
- }
100
- const totalValidRows = validationResult.validRowCount ?? 0;
101
- const rowRecordCount = validationResult.rowRecordCount ?? totalValidRows;
102
- console.log(`[Dataset ${datasetId}] Reading file content for upload`);
103
- const fileRead = await readDatasetSandboxFileStep({ runtime, sandboxId, path: outputPath });
104
- if (!fileRead.contentBase64) {
105
- console.error(`[Dataset ${datasetId}] Empty file content`);
106
- return {
107
- success: false,
108
- status: "empty_output",
109
- validRows: 0,
110
- rowRecordCount: 0,
111
- validation: [],
112
- error: "Empty file content",
113
- message: "Empty file content",
114
- };
115
- }
116
- console.log(`[Dataset ${datasetId}] Uploading file to InstantDB storage`);
117
- const uploadResult = await datasetUploadOutputFileStep({
118
- runtime,
119
- datasetId,
120
- contentBase64: fileRead.contentBase64,
121
- });
122
- if (!uploadResult.ok) {
123
- console.error(`[Dataset ${datasetId}] File upload failed: ${uploadResult.error}`);
124
- return {
125
- success: false,
126
- status: "upload_failed",
127
- validRows: totalValidRows,
128
- rowRecordCount,
129
- validation: validationResult.validation,
130
- error: uploadResult.error,
131
- message: uploadResult.error,
132
- };
133
- }
134
- console.log(`[Dataset ${datasetId}] File uploaded successfully: ${uploadResult.data.fileId}`);
135
- const statusResult = await datasetUpdateStatusStep({
15
+ return await persistDatasetStep({
136
16
  runtime,
137
17
  datasetId,
138
- status: "completed",
139
- calculatedTotalRows: totalValidRows,
140
- actualGeneratedRowCount: totalValidRows,
18
+ sandboxId,
19
+ summary,
141
20
  });
142
- if (!statusResult.ok) {
143
- console.error(`[Dataset ${datasetId}] Failed to update status: ${statusResult.error}`);
144
- return {
145
- success: false,
146
- status: "status_update_failed",
147
- validRows: totalValidRows,
148
- rowRecordCount,
149
- validation: validationResult.validation,
150
- error: statusResult.error,
151
- message: statusResult.error,
152
- };
153
- }
154
- console.log(`[Dataset ${datasetId}] Dataset marked as COMPLETED (${totalValidRows} valid rows)`);
155
- console.log(`[Dataset ${datasetId}] ========================================`);
156
- return {
157
- success: true,
158
- status: "completed",
159
- validRows: totalValidRows,
160
- rowRecordCount,
161
- fileId: uploadResult.data.fileId,
162
- storagePath: uploadResult.data.storagePath,
163
- message: "Dataset creation completed and uploaded to storage",
164
- };
165
21
  },
166
22
  });
167
23
  }
@@ -204,117 +60,3 @@ export function getDatasetFatalFailure(event) {
204
60
  }
205
61
  return null;
206
62
  }
207
- async function ensureFileExists(runtime, sandboxId, path) {
208
- const result = await runDatasetSandboxCommandStep({
209
- runtime,
210
- sandboxId,
211
- cmd: "test",
212
- args: ["-f", path],
213
- });
214
- if (result.exitCode !== 0) {
215
- throw new Error(`Required file not found: ${path}`);
216
- }
217
- }
218
- async function validateJsonlRows({ runtime, sandboxId, outputPath, validator, datasetId }) {
219
- const validation = [];
220
- let validRowCount = 0;
221
- let rowRecordCount = 0;
222
- console.log(`[Dataset ${datasetId}] Reading and validating JSONL file from sandbox`);
223
- const fileRead = await readDatasetSandboxTextFileStep({ runtime, sandboxId, path: outputPath });
224
- if (!fileRead.content) {
225
- console.log(`[Dataset ${datasetId}] Empty output file`);
226
- return {
227
- success: false,
228
- status: "empty_output",
229
- validation,
230
- validRowCount: 0,
231
- rowRecordCount: 0,
232
- error: "output.jsonl is empty",
233
- message: "output.jsonl is empty",
234
- };
235
- }
236
- const lines = fileRead.content.split("\n");
237
- console.log(`[Dataset ${datasetId}] Validating ${lines.length} lines`);
238
- for (let index = 0; index < lines.length; index++) {
239
- const line = lines[index];
240
- const trimmed = line.trim();
241
- if (trimmed.length === 0) {
242
- continue;
243
- }
244
- let record;
245
- try {
246
- record = JSON.parse(trimmed);
247
- }
248
- catch (error) {
249
- const message = error instanceof Error ? error.message : String(error);
250
- validation.push({
251
- index,
252
- valid: false,
253
- errors: [`Invalid JSON: ${message}`],
254
- });
255
- continue;
256
- }
257
- if (record.type !== "row") {
258
- validation.push({
259
- index,
260
- valid: false,
261
- errors: ["Every non-empty output line must be a JSON object with type 'row'"],
262
- });
263
- continue;
264
- }
265
- rowRecordCount++;
266
- const data = record.data;
267
- if (data === undefined || data === null) {
268
- validation.push({
269
- index,
270
- valid: false,
271
- errors: ["Missing 'data' field"],
272
- });
273
- continue;
274
- }
275
- const valid = validator(data);
276
- if (!valid) {
277
- const errors = Array.isArray(validator.errors)
278
- ? validator.errors.map((err) => err.message || "Unknown validation error")
279
- : ["Unknown validation error"];
280
- validation.push({
281
- index,
282
- valid: false,
283
- errors,
284
- dataKeys: data && typeof data === "object" && !Array.isArray(data) ? Object.keys(data) : [],
285
- });
286
- continue;
287
- }
288
- validation.push({
289
- index,
290
- valid: true,
291
- });
292
- validRowCount++;
293
- }
294
- console.log(`[Dataset ${datasetId}] Validation completed: ${validRowCount} valid rows`);
295
- const invalidRows = validation.filter((entry) => !entry.valid);
296
- if (rowRecordCount === 0 || validRowCount === 0 || invalidRows.length > 0) {
297
- const message = rowRecordCount === 0
298
- ? "output.jsonl does not contain any type='row' records"
299
- : validRowCount === 0
300
- ? "No dataset rows matched the stored schema"
301
- : `${invalidRows.length} dataset row(s) failed schema validation`;
302
- console.error(`[Dataset ${datasetId}] Validation failed: ${message}`);
303
- return {
304
- success: false,
305
- status: "validation_failed",
306
- validation,
307
- validRowCount,
308
- rowRecordCount,
309
- error: message,
310
- message,
311
- };
312
- }
313
- return {
314
- success: true,
315
- status: "completed",
316
- validation,
317
- validRowCount,
318
- rowRecordCount,
319
- };
320
- }
@@ -1,3 +1,4 @@
1
+ export declare function getDatasetRuntimeDb(runtime: any): Promise<any>;
1
2
  export declare function getDatasetServiceDb(runtime: any): Promise<any>;
2
3
  export declare function datasetGetByIdStep(params: {
3
4
  runtime: any;
@@ -1,7 +1,7 @@
1
1
  import { DatasetService } from "../service.js";
2
2
  import { datasetDomain } from "../schema.js";
3
3
  import { inferDatasetSchema } from "../builder/schemaInference.js";
4
- async function getRuntimeDb(runtime) {
4
+ export async function getDatasetRuntimeDb(runtime) {
5
5
  if (!runtime) {
6
6
  throw new Error("Dataset step requires runtime.");
7
7
  }
@@ -15,17 +15,17 @@ async function getRuntimeDb(runtime) {
15
15
  }
16
16
  export async function getDatasetServiceDb(runtime) {
17
17
  "use step";
18
- return await getRuntimeDb(runtime);
18
+ return await getDatasetRuntimeDb(runtime);
19
19
  }
20
20
  export async function datasetGetByIdStep(params) {
21
21
  "use step";
22
- const db = await getRuntimeDb(params.runtime);
22
+ const db = await getDatasetRuntimeDb(params.runtime);
23
23
  const service = new DatasetService(db);
24
24
  return await service.getDatasetById(params.datasetId);
25
25
  }
26
26
  export async function datasetReadOutputJsonlStep(params) {
27
27
  "use step";
28
- const db = await getRuntimeDb(params.runtime);
28
+ const db = await getDatasetRuntimeDb(params.runtime);
29
29
  for (let attempt = 1; attempt <= 20; attempt++) {
30
30
  const query = await db.query({
31
31
  dataset_datasets: {
@@ -46,7 +46,7 @@ export async function datasetReadOutputJsonlStep(params) {
46
46
  }
47
47
  export async function datasetUpdateSchemaStep(params) {
48
48
  "use step";
49
- const db = await getRuntimeDb(params.runtime);
49
+ const db = await getDatasetRuntimeDb(params.runtime);
50
50
  const service = new DatasetService(db);
51
51
  return await service.updateDatasetSchema({
52
52
  datasetId: params.datasetId,
@@ -56,7 +56,7 @@ export async function datasetUpdateSchemaStep(params) {
56
56
  }
57
57
  export async function datasetUploadOutputFileStep(params) {
58
58
  "use step";
59
- const db = await getRuntimeDb(params.runtime);
59
+ const db = await getDatasetRuntimeDb(params.runtime);
60
60
  const service = new DatasetService(db);
61
61
  return await service.uploadDatasetOutputFile({
62
62
  datasetId: params.datasetId,
@@ -65,7 +65,7 @@ export async function datasetUploadOutputFileStep(params) {
65
65
  }
66
66
  export async function datasetUpdateStatusStep(params) {
67
67
  "use step";
68
- const db = await getRuntimeDb(params.runtime);
68
+ const db = await getDatasetRuntimeDb(params.runtime);
69
69
  const service = new DatasetService(db);
70
70
  return await service.updateDatasetStatus({
71
71
  datasetId: params.datasetId,
@@ -76,13 +76,13 @@ export async function datasetUpdateStatusStep(params) {
76
76
  }
77
77
  export async function datasetClearStep(params) {
78
78
  "use step";
79
- const db = await getRuntimeDb(params.runtime);
79
+ const db = await getDatasetRuntimeDb(params.runtime);
80
80
  const service = new DatasetService(db);
81
81
  return await service.clearDataset(params.datasetId);
82
82
  }
83
83
  export async function datasetPreviewRowsStep(params) {
84
84
  "use step";
85
- const db = await getRuntimeDb(params.runtime);
85
+ const db = await getDatasetRuntimeDb(params.runtime);
86
86
  const service = new DatasetService(db);
87
87
  const rowsResult = await service.previewRows(params.datasetId, params.limit ?? 20);
88
88
  if (!rowsResult.ok) {
@@ -92,7 +92,7 @@ export async function datasetPreviewRowsStep(params) {
92
92
  }
93
93
  export async function datasetReadRowsStep(params) {
94
94
  "use step";
95
- const db = await getRuntimeDb(params.runtime);
95
+ const db = await getDatasetRuntimeDb(params.runtime);
96
96
  const service = new DatasetService(db);
97
97
  const rowsResult = await service.readRows({
98
98
  datasetId: params.datasetId,
@@ -106,7 +106,7 @@ export async function datasetReadRowsStep(params) {
106
106
  }
107
107
  export async function datasetReadOneStep(params) {
108
108
  "use step";
109
- const db = await getRuntimeDb(params.runtime);
109
+ const db = await getDatasetRuntimeDb(params.runtime);
110
110
  const service = new DatasetService(db);
111
111
  const firstResult = await service.readOne(params.datasetId);
112
112
  if (!firstResult.ok) {
@@ -116,7 +116,7 @@ export async function datasetReadOneStep(params) {
116
116
  }
117
117
  export async function datasetInferAndUpdateSchemaStep(params) {
118
118
  "use step";
119
- const db = await getRuntimeDb(params.runtime);
119
+ const db = await getDatasetRuntimeDb(params.runtime);
120
120
  const service = new DatasetService(db);
121
121
  const readResult = await service.readRows({
122
122
  datasetId: params.datasetId,
package/dist/dataset.js CHANGED
@@ -1,8 +1,8 @@
1
1
  import { buildObjectOutputInstructions } from "./builder/instructions.js";
2
2
  import { createDatasetId } from "./id.js";
3
- import { materializeDerivedDataset, materializeSingleFileLikeSource, } from "./builder/materialize.js";
3
+ import { completeDatasetStep, materializeDerivedDataset, materializeSingleFileLikeSource, } from "./builder/materialize.js";
4
4
  import { materializeQuerySource } from "./builder/materializeQuery.js";
5
- import { finalizeBuildResult } from "./builder/persistence.js";
5
+ import { createDatasetBuildResult, finalizeBuildResult, } from "./builder/persistence.js";
6
6
  export function dataset(runtime, options = {}) {
7
7
  const datasetId = normalizeDatasetId(options.datasetId);
8
8
  const typedRuntime = runtime;
@@ -132,13 +132,25 @@ export function dataset(runtime, options = {}) {
132
132
  throw new Error("dataset_reactor_required");
133
133
  }
134
134
  await materializeSingleFileLikeSource(effectiveState, onlySource, targetDatasetId);
135
- return finalizeOutputResult(await finalizeBuildResult(effectiveState.runtime, targetDatasetId, effectiveState.first), effectiveState.output);
135
+ const completed = await completeDatasetStep({
136
+ runtime: effectiveState.runtime,
137
+ datasetId: targetDatasetId,
138
+ schema: effectiveState.outputSchema,
139
+ first: effectiveState.first,
140
+ });
141
+ return finalizeOutputResult(createDatasetBuildResult(effectiveState.runtime, completed), effectiveState.output);
136
142
  }
137
143
  if (!effectiveState.reactor) {
138
144
  throw new Error("dataset_reactor_required");
139
145
  }
140
146
  await materializeDerivedDataset(effectiveState, targetDatasetId);
141
- return finalizeOutputResult(await finalizeBuildResult(effectiveState.runtime, targetDatasetId, effectiveState.first), effectiveState.output);
147
+ const completed = await completeDatasetStep({
148
+ runtime: effectiveState.runtime,
149
+ datasetId: targetDatasetId,
150
+ schema: effectiveState.outputSchema,
151
+ first: effectiveState.first,
152
+ });
153
+ return finalizeOutputResult(createDatasetBuildResult(effectiveState.runtime, completed), effectiveState.output);
142
154
  },
143
155
  };
144
156
  return api;
@@ -1,4 +1,9 @@
1
1
  export declare const DATASET_OUTPUT_FILE_NAME = "output.jsonl";
2
2
  export declare function getDatasetWorkdirBase(): string;
3
3
  export declare function getDatasetWorkstation(datasetId: string): string;
4
+ export declare function getDatasetSourcesDir(datasetId: string): string;
5
+ export declare function getDatasetScriptsDir(datasetId: string): string;
6
+ export declare function getDatasetArtifactsDir(datasetId: string): string;
7
+ export declare function getDatasetLogsDir(datasetId: string): string;
8
+ export declare function getDatasetStandardDirs(datasetId: string): string[];
4
9
  export declare function getDatasetOutputPath(datasetId: string): string;
@@ -21,6 +21,27 @@ export function getDatasetWorkdirBase() {
21
21
  export function getDatasetWorkstation(datasetId) {
22
22
  return `${getDatasetWorkdirBase()}/${datasetId}`;
23
23
  }
24
+ export function getDatasetSourcesDir(datasetId) {
25
+ return `${getDatasetWorkstation(datasetId)}/sources`;
26
+ }
27
+ export function getDatasetScriptsDir(datasetId) {
28
+ return `${getDatasetWorkstation(datasetId)}/scripts`;
29
+ }
30
+ export function getDatasetArtifactsDir(datasetId) {
31
+ return `${getDatasetWorkstation(datasetId)}/artifacts`;
32
+ }
33
+ export function getDatasetLogsDir(datasetId) {
34
+ return `${getDatasetWorkstation(datasetId)}/logs`;
35
+ }
36
+ export function getDatasetStandardDirs(datasetId) {
37
+ return [
38
+ getDatasetWorkstation(datasetId),
39
+ getDatasetSourcesDir(datasetId),
40
+ getDatasetScriptsDir(datasetId),
41
+ getDatasetArtifactsDir(datasetId),
42
+ getDatasetLogsDir(datasetId),
43
+ ];
44
+ }
24
45
  export function getDatasetOutputPath(datasetId) {
25
46
  return `${getDatasetWorkstation(datasetId)}/${DATASET_OUTPUT_FILE_NAME}`;
26
47
  }
@@ -1,7 +1,7 @@
1
1
  import { tool } from "ai";
2
2
  import { z } from "zod";
3
3
  import { runDatasetSandboxCommandStep, writeDatasetSandboxTextFilesStep } from "./sandbox/steps.js";
4
- import { getDatasetWorkstation } from "./datasetFiles.js";
4
+ import { getDatasetScriptsDir } from "./datasetFiles.js";
5
5
  // To keep responses predictable for big data scenarios, we cap stdout/stderr.
6
6
  // The tool's return payload exposes stdout (capped) plus the on-disk script path.
7
7
  const MAX_STDOUT_CHARS = 20000;
@@ -30,10 +30,9 @@ export function createExecuteCommandTool({ datasetId, sandboxId, runtime }) {
30
30
  scriptName: z.string().describe("Name for the script file in snake_case (e.g., 'inspect_file', 'parse_csv', 'generate_dataset'). A deterministic suffix will be appended automatically."),
31
31
  }),
32
32
  execute: async ({ pythonCode, scriptName }) => {
33
- const workstation = getDatasetWorkstation(datasetId);
34
33
  const normalizedScriptName = normalizeScriptName(scriptName);
35
34
  const scriptHash = stableScriptHash(`${normalizedScriptName}\0${pythonCode}`);
36
- const scriptFile = `${workstation}/${normalizedScriptName}-${scriptHash}.py`;
35
+ const scriptFile = `${getDatasetScriptsDir(datasetId)}/${normalizedScriptName}-${scriptHash}.py`;
37
36
  console.log(`[Dataset ${datasetId}] ========================================`);
38
37
  console.log(`[Dataset ${datasetId}] Tool: executeCommand`);
39
38
  console.log(`[Dataset ${datasetId}] Script: ${normalizedScriptName}`);
@@ -1,5 +1,5 @@
1
1
  import { type ContextReactor } from "@ekairos/events";
2
- import type { FileParseRunOptions } from "./file-dataset.types.js";
2
+ import type { FileParseContext, FileParseRunOptions, SandboxState } from "./file-dataset.types.js";
3
3
  export type { DatasetResult, FileParseContext, FileParseContextBuilder, FileParseContextParams, FileParseRunOptions, SandboxState, } from "./file-dataset.types.js";
4
4
  export declare function createFileParseContext<Env extends {
5
5
  orgId: string;
@@ -9,6 +9,9 @@ export declare function createFileParseContext<Env extends {
9
9
  datasetId?: string;
10
10
  model?: string;
11
11
  reactor?: ContextReactor<any, any>;
12
+ sandboxState?: SandboxState;
13
+ filePreview?: FileParseContext["filePreview"];
14
+ schema?: any | null;
12
15
  }): {
13
16
  datasetId: string;
14
17
  parse(runtime: {
@@ -21,7 +21,8 @@ function createFileParseContextDefinition(params) {
21
21
  let contextBuilder = createContext("file.parse")
22
22
  .context(async (stored, _env, runtime) => {
23
23
  const previous = stored?.content ?? {};
24
- const sandboxState = previous?.sandboxState ?? { initialized: false, filePath: "" };
24
+ const sandboxState = previous?.sandboxState ??
25
+ params.sandboxState ?? { initialized: false, filePath: "" };
25
26
  const datasetId = previous?.datasetId ?? fallbackDatasetId ?? "";
26
27
  const fileId = previous?.fileId ?? params.fileId ?? "";
27
28
  const instructions = previous?.instructions ?? params.instructions ?? "";
@@ -35,27 +36,31 @@ function createFileParseContextDefinition(params) {
35
36
  if (!sandboxId) {
36
37
  throw new Error("dataset_sandbox_required");
37
38
  }
38
- const initialized = await initializeFileParseSandboxStep({
39
- runtime,
40
- sandboxId,
41
- datasetId,
42
- fileId,
43
- state: sandboxState,
44
- });
45
- const sandboxFilePath = initialized.filePath;
46
- let filePreview = undefined;
47
- try {
48
- filePreview = await generateFileParsePreviewStep({
39
+ const initialized = sandboxState.initialized && sandboxState.filePath
40
+ ? { filePath: sandboxState.filePath, state: sandboxState }
41
+ : await initializeFileParseSandboxStep({
49
42
  runtime,
50
43
  sandboxId,
51
- sandboxFilePath,
52
44
  datasetId,
45
+ fileId,
46
+ state: sandboxState,
53
47
  });
48
+ const sandboxFilePath = initialized.filePath;
49
+ let filePreview = previous?.filePreview ?? previous?.ctx?.filePreview ?? params.filePreview;
50
+ if (!filePreview) {
51
+ try {
52
+ filePreview = await generateFileParsePreviewStep({
53
+ runtime,
54
+ sandboxId,
55
+ sandboxFilePath,
56
+ datasetId,
57
+ });
58
+ }
59
+ catch {
60
+ // Preview is optional; parsing can still proceed from the file path.
61
+ }
54
62
  }
55
- catch {
56
- // Preview is optional; parsing can still proceed from the file path.
57
- }
58
- let schema = null;
63
+ let schema = previous?.ctx?.schema ?? previous?.schema ?? params.schema ?? null;
59
64
  const datasetResult = await datasetGetByIdStep({ runtime, datasetId });
60
65
  if (datasetResult.ok && datasetResult.data.schema) {
61
66
  schema = datasetResult.data.schema;
@@ -80,6 +85,7 @@ function createFileParseContextDefinition(params) {
80
85
  instructions,
81
86
  sandboxId,
82
87
  sandboxState: initialized.state,
88
+ filePreview,
83
89
  ctx,
84
90
  };
85
91
  })
@@ -160,6 +166,9 @@ export function createFileParseContext(fileId, opts) {
160
166
  datasetId,
161
167
  model: opts?.model,
162
168
  reactor: opts?.reactor,
169
+ sandboxState: opts?.sandboxState,
170
+ filePreview: opts?.filePreview,
171
+ schema: opts?.schema,
163
172
  };
164
173
  const { context } = createFileParseContextDefinition(params);
165
174
  return {
@@ -191,11 +200,14 @@ export function createFileParseContext(fileId, opts) {
191
200
  maxModelSteps: 5,
192
201
  },
193
202
  __initialContent: {
203
+ ...(options.initialContent ?? {}),
194
204
  datasetId,
195
205
  fileId,
196
206
  instructions: opts?.instructions ?? "",
197
207
  sandboxId: opts?.sandboxId ?? "",
198
- sandboxState: { initialized: false, filePath: "" },
208
+ sandboxState: opts?.sandboxState ?? { initialized: false, filePath: "" },
209
+ filePreview: opts?.filePreview,
210
+ schema: opts?.schema,
199
211
  },
200
212
  });
201
213
  await awaitContextRun(shell.run);
@@ -1,4 +1,4 @@
1
- import { getDatasetWorkstation } from "../datasetFiles.js";
1
+ import { getDatasetSourcesDir, getDatasetStandardDirs, getDatasetWorkstation, } from "../datasetFiles.js";
2
2
  import { runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep } from "../sandbox/steps.js";
3
3
  import { buildFileDatasetPrompt } from "./prompts.js";
4
4
  import { generateFilePreview, ensurePreviewScriptsAvailable } from "./filepreview.js";
@@ -29,11 +29,11 @@ export async function initializeFileParseSandboxStep(params) {
29
29
  runtime: params.runtime,
30
30
  sandboxId: params.sandboxId,
31
31
  cmd: "mkdir",
32
- args: ["-p", workstation],
32
+ args: ["-p", ...getDatasetStandardDirs(params.datasetId)],
33
33
  });
34
34
  const fileName = file.contentDisposition ?? "";
35
35
  const fileExtension = fileName.includes(".") ? fileName.substring(fileName.lastIndexOf(".")) : "";
36
- const sandboxFilePath = `${workstation}/${params.fileId}${fileExtension}`;
36
+ const sandboxFilePath = `${getDatasetSourcesDir(params.datasetId)}/${params.fileId}${fileExtension}`;
37
37
  await writeDatasetSandboxFilesStep({
38
38
  runtime: params.runtime,
39
39
  sandboxId: params.sandboxId,
@@ -26,10 +26,14 @@ export type FileParseContextParams = {
26
26
  datasetId?: string;
27
27
  model?: string;
28
28
  reactor?: ContextReactor<any, any>;
29
+ sandboxState?: SandboxState;
30
+ filePreview?: FilePreviewContext;
31
+ schema?: any | null;
29
32
  };
30
33
  export type FileParseRunOptions = {
31
34
  prompt?: string;
32
35
  durable?: boolean;
36
+ initialContent?: Record<string, any>;
33
37
  };
34
38
  export type FileParseContextBuilder<Env extends {
35
39
  orgId: string;