@ekairos/dataset 1.22.82-beta.development.0 → 1.22.84-beta.development.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. package/dist/builder/agentMaterializers.d.ts +2 -2
  2. package/dist/builder/context.d.ts +7 -0
  3. package/dist/builder/context.js +192 -0
  4. package/dist/builder/instructions.d.ts +3 -3
  5. package/dist/builder/instructions.js +10 -10
  6. package/dist/builder/materialize.d.ts +12 -11
  7. package/dist/builder/materialize.js +122 -121
  8. package/dist/builder/materializeQuery.d.ts +3 -2
  9. package/dist/builder/materializeQuery.js +10 -19
  10. package/dist/builder/persistence.d.ts +4 -5
  11. package/dist/builder/persistence.js +20 -19
  12. package/dist/builder/types.d.ts +31 -24
  13. package/dist/completeDataset.steps.d.ts +9 -8
  14. package/dist/completeDataset.steps.js +18 -11
  15. package/dist/completeDataset.tool.d.ts +9 -8
  16. package/dist/completeDataset.tool.js +2 -1
  17. package/dist/contextWorkspace.d.ts +72 -0
  18. package/dist/contextWorkspace.js +218 -0
  19. package/dist/dataset.d.ts +1 -1
  20. package/dist/dataset.js +42 -29
  21. package/dist/datasetFiles.d.ts +1 -1
  22. package/dist/datasetFiles.js +3 -3
  23. package/dist/executeCommand.tool.d.ts +1 -43
  24. package/dist/executeCommand.tool.js +10 -3
  25. package/dist/file/file-dataset.agent.d.ts +2 -0
  26. package/dist/file/file-dataset.agent.js +51 -16
  27. package/dist/file/file-dataset.steps.d.ts +6 -0
  28. package/dist/file/file-dataset.steps.js +18 -21
  29. package/dist/file/file-dataset.types.d.ts +10 -0
  30. package/dist/file/prompts.js +16 -14
  31. package/dist/index.d.ts +1 -0
  32. package/dist/index.js +1 -0
  33. package/dist/materializeDataset.tool.d.ts +34 -26
  34. package/dist/materializeDataset.tool.js +40 -29
  35. package/dist/schema.d.ts +12 -2
  36. package/dist/schema.js +6 -3
  37. package/dist/service.d.ts +2 -2
  38. package/dist/service.js +6 -3
  39. package/dist/transform/filepreview.d.ts +2 -2
  40. package/dist/transform/filepreview.js +3 -3
  41. package/dist/transform/prompts.js +25 -25
  42. package/dist/transform/transform-dataset.agent.d.ts +4 -4
  43. package/dist/transform/transform-dataset.agent.js +29 -30
  44. package/dist/transform/transform-dataset.steps.d.ts +7 -7
  45. package/dist/transform/transform-dataset.steps.js +20 -20
  46. package/dist/transform/transform-dataset.types.d.ts +13 -13
  47. package/dist/transform/transformDataset.js +4 -4
  48. package/package.json +4 -4
  49. /package/dist/builder/{sourceRows.d.ts → rows.d.ts} +0 -0
  50. /package/dist/builder/{sourceRows.js → rows.js} +0 -0
package/dist/dataset.js CHANGED
@@ -1,7 +1,8 @@
1
1
  import { buildObjectOutputInstructions } from "./builder/instructions.js";
2
+ import { resolveDatasetResourceContext } from "./builder/context.js";
2
3
  import { createDatasetId } from "./id.js";
3
- import { completeDatasetStep, materializeDerivedDataset, materializeSingleFileLikeSource, } from "./builder/materialize.js";
4
- import { materializeQuerySource } from "./builder/materializeQuery.js";
4
+ import { completeDatasetStep, materializeDerivedDataset, materializeSingleFileLikeResource, } from "./builder/materialize.js";
5
+ import { materializeQueryResource } from "./builder/materializeQuery.js";
5
6
  import { createDatasetBuildResult, finalizeBuildResult, } from "./builder/persistence.js";
6
7
  export function dataset(runtime, options = {}) {
7
8
  const datasetId = normalizeDatasetId(options.datasetId);
@@ -9,7 +10,7 @@ export function dataset(runtime, options = {}) {
9
10
  const state = {
10
11
  runtime: typedRuntime,
11
12
  env: typedRuntime.env,
12
- sources: [],
13
+ resources: [],
13
14
  output: "rows",
14
15
  inferSchema: false,
15
16
  durable: options.durable,
@@ -17,38 +18,46 @@ export function dataset(runtime, options = {}) {
17
18
  };
18
19
  const api = {
19
20
  datasetId,
20
- fromFile(source) {
21
- state.sources.push({ kind: "file", ...source });
21
+ fromFile(resource) {
22
+ state.resources.push({ kind: "file", ...resource });
22
23
  return api;
23
24
  },
24
- fromText(source) {
25
- state.sources.push({ kind: "text", ...source });
25
+ fromText(resource) {
26
+ state.resources.push({ kind: "text", ...resource });
26
27
  return api;
27
28
  },
28
- fromDataset(source) {
29
- state.sources.push({ kind: "dataset", ...source });
29
+ fromDataset(resource) {
30
+ state.resources.push({ kind: "dataset", ...resource });
30
31
  return api;
31
32
  },
32
- from(...sources) {
33
- for (const source of sources) {
34
- if ("kind" in source) {
35
- state.sources.push(source);
33
+ fromContext(context) {
34
+ state.resources.push({ kind: "context", ...context });
35
+ return api;
36
+ },
37
+ from(...resources) {
38
+ for (const resource of resources) {
39
+ if ("kind" in resource) {
40
+ state.resources.push(resource);
41
+ continue;
42
+ }
43
+ if ("fileId" in resource) {
44
+ state.resources.push({ kind: "file", ...resource });
36
45
  continue;
37
46
  }
38
- if ("fileId" in source) {
39
- state.sources.push({ kind: "file", ...source });
47
+ if ("datasetId" in resource) {
48
+ state.resources.push({ kind: "dataset", ...resource });
40
49
  continue;
41
50
  }
42
- if ("datasetId" in source) {
43
- state.sources.push({ kind: "dataset", ...source });
51
+ if ("id" in resource || "key" in resource) {
52
+ state.resources.push({ kind: "context", ...resource });
44
53
  continue;
45
54
  }
46
- state.sources.push({ kind: "text", ...source });
55
+ state.resources.push({ kind: "text", ...resource });
47
56
  }
48
57
  return api;
49
58
  },
50
- fromQuery(domain, source) {
51
- state.sources.push({ kind: "query", domain, ...source });
59
+ fromQuery(domain, resource) {
60
+ state.resources.push({ kind: "query", domain, ...resource });
52
61
  return api;
53
62
  },
54
63
  title(title) {
@@ -96,8 +105,8 @@ export function dataset(runtime, options = {}) {
96
105
  return api;
97
106
  },
98
107
  async build(options) {
99
- if (state.sources.length === 0) {
100
- throw new Error("dataset_sources_required");
108
+ if (state.resources.length === 0) {
109
+ throw new Error("dataset_resources_required");
101
110
  }
102
111
  const targetDatasetId = options?.datasetId
103
112
  ? normalizeDatasetId(options.datasetId)
@@ -106,6 +115,9 @@ export function dataset(runtime, options = {}) {
106
115
  ...state,
107
116
  durable: options?.durable ?? state.durable,
108
117
  };
118
+ const context = await resolveDatasetResourceContext(typedRuntime, targetDatasetId, stateWithBuildOptions.resources);
119
+ stateWithBuildOptions.resources = context.resources;
120
+ stateWithBuildOptions.contextId = context.contextId;
109
121
  const effectiveState = stateWithBuildOptions.output === "object"
110
122
  ? {
111
123
  ...stateWithBuildOptions,
@@ -113,25 +125,26 @@ export function dataset(runtime, options = {}) {
113
125
  instructions: buildObjectOutputInstructions(stateWithBuildOptions.instructions),
114
126
  }
115
127
  : stateWithBuildOptions;
116
- const onlySource = effectiveState.sources[0];
117
- const isSingleSource = effectiveState.sources.length === 1;
128
+ const onlyResource = effectiveState.resources[0];
129
+ const isSingleResource = effectiveState.resources.length === 1;
118
130
  const hasInstructions = Boolean(String(effectiveState.instructions ?? "").trim());
119
- if (isSingleSource && onlySource.kind === "query" && !hasInstructions) {
120
- await materializeQuerySource(effectiveState.runtime, onlySource, {
131
+ if (isSingleResource && onlyResource.kind === "query" && !hasInstructions) {
132
+ await materializeQueryResource(effectiveState.runtime, onlyResource, {
121
133
  datasetId: targetDatasetId,
122
134
  sandboxId: effectiveState.sandboxId,
123
135
  schema: effectiveState.outputSchema,
124
- title: effectiveState.title ?? onlySource.title,
136
+ title: effectiveState.title ?? onlyResource.title,
125
137
  instructions: effectiveState.instructions,
126
138
  first: effectiveState.first,
139
+ contextId: effectiveState.contextId ?? "",
127
140
  });
128
141
  return finalizeOutputResult(await finalizeBuildResult(effectiveState.runtime, targetDatasetId, effectiveState.first), effectiveState.output);
129
142
  }
130
- if (isSingleSource && (onlySource.kind === "file" || onlySource.kind === "text")) {
143
+ if (isSingleResource && (onlyResource.kind === "file" || onlyResource.kind === "text")) {
131
144
  if (!effectiveState.reactor) {
132
145
  throw new Error("dataset_reactor_required");
133
146
  }
134
- await materializeSingleFileLikeSource(effectiveState, onlySource, targetDatasetId);
147
+ await materializeSingleFileLikeResource(effectiveState, onlyResource, targetDatasetId);
135
148
  const completed = await completeDatasetStep({
136
149
  runtime: effectiveState.runtime,
137
150
  datasetId: targetDatasetId,
package/dist/datasetFiles.d.ts CHANGED
@@ -1,7 +1,7 @@
1
1
  export declare const DATASET_OUTPUT_FILE_NAME = "output.jsonl";
2
2
  export declare function getDatasetWorkdirBase(): string;
3
3
  export declare function getDatasetWorkstation(datasetId: string): string;
4
- export declare function getDatasetSourcesDir(datasetId: string): string;
4
+ export declare function getDatasetResourcesDir(datasetId: string): string;
5
5
  export declare function getDatasetScriptsDir(datasetId: string): string;
6
6
  export declare function getDatasetArtifactsDir(datasetId: string): string;
7
7
  export declare function getDatasetLogsDir(datasetId: string): string;
package/dist/datasetFiles.js CHANGED
@@ -9,8 +9,8 @@ export function getDatasetWorkdirBase() {
9
9
  export function getDatasetWorkstation(datasetId) {
10
10
  return `${getDatasetWorkdirBase()}/${datasetId}`;
11
11
  }
12
- export function getDatasetSourcesDir(datasetId) {
13
- return `${getDatasetWorkstation(datasetId)}/sources`;
12
+ export function getDatasetResourcesDir(datasetId) {
13
+ return `${getDatasetWorkstation(datasetId)}/resources`;
14
14
  }
15
15
  export function getDatasetScriptsDir(datasetId) {
16
16
  return `${getDatasetWorkstation(datasetId)}/scripts`;
@@ -24,7 +24,7 @@ export function getDatasetLogsDir(datasetId) {
24
24
  export function getDatasetStandardDirs(datasetId) {
25
25
  return [
26
26
  getDatasetWorkstation(datasetId),
27
- getDatasetSourcesDir(datasetId),
27
+ getDatasetResourcesDir(datasetId),
28
28
  getDatasetScriptsDir(datasetId),
29
29
  getDatasetArtifactsDir(datasetId),
30
30
  getDatasetLogsDir(datasetId),
package/dist/executeCommand.tool.d.ts CHANGED
@@ -6,47 +6,5 @@ interface ExecuteCommandToolParams {
6
6
  export declare function createExecuteCommandTool({ datasetId, sandboxId, runtime }: ExecuteCommandToolParams): import("ai").Tool<{
7
7
  pythonCode: string;
8
8
  scriptName: string;
9
- }, {
10
- success: boolean;
11
- fatal: boolean;
12
- status: string;
13
- error: string;
14
- stdout: string;
15
- stderr: string;
16
- exitCode: number;
17
- scriptPath: string;
18
- stdoutTruncated: boolean;
19
- stderrTruncated: boolean;
20
- stdoutOriginalLength: number;
21
- stderrOriginalLength: number;
22
- message?: undefined;
23
- } | {
24
- success: boolean;
25
- exitCode: number;
26
- stdout: string;
27
- stderr: string;
28
- scriptPath: string;
29
- error: string;
30
- stdoutTruncated: boolean;
31
- stderrTruncated: boolean;
32
- stdoutOriginalLength: number;
33
- stderrOriginalLength: number;
34
- fatal?: undefined;
35
- status?: undefined;
36
- message?: undefined;
37
- } | {
38
- success: boolean;
39
- exitCode: number;
40
- stdout: string;
41
- stderr: string;
42
- scriptPath: string;
43
- message: string;
44
- stdoutTruncated: boolean;
45
- stderrTruncated: boolean;
46
- stdoutOriginalLength: number;
47
- stderrOriginalLength: number;
48
- fatal?: undefined;
49
- status?: undefined;
50
- error?: undefined;
51
- }>;
9
+ }, unknown>;
52
10
  export {};
package/dist/executeCommand.tool.js CHANGED
@@ -2,6 +2,7 @@ import { tool } from "ai";
2
2
  import { z } from "zod";
3
3
  import { runDatasetSandboxCommandStep, writeDatasetSandboxTextFilesStep } from "./sandbox/steps.js";
4
4
  import { getDatasetScriptsDir } from "./datasetFiles.js";
5
+ import { getContextExecutionWorkspaceDirs } from "./contextWorkspace.js";
5
6
  // To keep responses predictable for big data scenarios, we cap stdout/stderr.
6
7
  // The tool's return payload exposes stdout (capped) plus the on-disk script path.
7
8
  const MAX_STDOUT_CHARS = 20000;
@@ -29,10 +30,16 @@ export function createExecuteCommandTool({ datasetId, sandboxId, runtime }) {
29
30
  pythonCode: z.string().describe("Python code to execute. Saved to a file before running. MANDATORY: Use print() to report progress and final results. Keep prints concise; avoid dumping rows/JSON. For large outputs, write to files in the workstation directory and print only file paths and brief summaries."),
30
31
  scriptName: z.string().describe("Name for the script file in snake_case (e.g., 'inspect_file', 'parse_csv', 'generate_dataset'). A deterministic suffix will be appended automatically."),
31
32
  }),
32
- execute: async ({ pythonCode, scriptName }) => {
33
+ execute: (async ({ pythonCode, scriptName }, actionContext) => {
33
34
  const normalizedScriptName = normalizeScriptName(scriptName);
34
35
  const scriptHash = stableScriptHash(`${normalizedScriptName}\0${pythonCode}`);
35
- const scriptFile = `${getDatasetScriptsDir(datasetId)}/${normalizedScriptName}-${scriptHash}.py`;
36
+ const scriptsDir = actionContext?.contextId && actionContext.executionId
37
+ ? getContextExecutionWorkspaceDirs({
38
+ contextId: actionContext.contextId,
39
+ executionId: actionContext.executionId,
40
+ }).scriptsDir
41
+ : getDatasetScriptsDir(datasetId);
42
+ const scriptFile = `${scriptsDir}/${normalizedScriptName}-${scriptHash}.py`;
36
43
  console.log(`[Dataset ${datasetId}] ========================================`);
37
44
  console.log(`[Dataset ${datasetId}] Tool: executeCommand`);
38
45
  console.log(`[Dataset ${datasetId}] Script: ${normalizedScriptName}`);
@@ -162,6 +169,6 @@ export function createExecuteCommandTool({ datasetId, sandboxId, runtime }) {
162
169
  stderrOriginalLength: 0,
163
170
  };
164
171
  }
165
- },
172
+ }),
166
173
  });
167
174
  }
package/dist/file/file-dataset.agent.d.ts CHANGED
@@ -12,6 +12,8 @@ export declare function createFileParseContext<Env extends {
12
12
  sandboxState?: SandboxState;
13
13
  filePreview?: FileParseContext["filePreview"];
14
14
  schema?: any | null;
15
+ filename?: string;
16
+ mediaType?: string;
15
17
  }): {
16
18
  datasetId: string;
17
19
  parse(runtime: {
package/dist/file/file-dataset.agent.js CHANGED
@@ -4,7 +4,7 @@ import { createCompleteDatasetTool, didCompleteDatasetSucceed, getDatasetFatalFa
4
4
  import { datasetGetByIdStep } from "../dataset/steps.js";
5
5
  import { createExecuteCommandTool } from "../executeCommand.tool.js";
6
6
  import { createGenerateSchemaTool } from "./generateSchema.tool.js";
7
- import { buildFileDatasetPromptStep, generateFileParsePreviewStep, initializeFileParseSandboxStep, } from "./file-dataset.steps.js";
7
+ import { buildFileDatasetPromptStep, initializeFileParseSandboxStep, } from "./file-dataset.steps.js";
8
8
  import { createDatasetId } from "../id.js";
9
9
  async function awaitContextRun(run) {
10
10
  if (!run)
@@ -27,6 +27,15 @@ function createFileParseContextDefinition(params) {
27
27
  const fileId = previous?.fileId ?? params.fileId ?? "";
28
28
  const instructions = previous?.instructions ?? params.instructions ?? "";
29
29
  const sandboxId = previous?.sandboxId ?? params.sandboxId ?? "";
30
+ const contextRun = runtime?.__ekairosContextRun ?? {};
31
+ const contextId = String(contextRun.contextId ?? stored?.id ?? "").trim();
32
+ const executionId = String(contextRun.executionId ?? previous?.executionId ?? "").trim();
33
+ const sourceEventId = String(previous?.sourceEventId ?? params.sourceEventId ?? "").trim();
34
+ const sourcePartIndex = typeof previous?.sourcePartIndex === "number"
35
+ ? previous.sourcePartIndex
36
+ : typeof params.sourcePartIndex === "number"
37
+ ? params.sourcePartIndex
38
+ : 0;
30
39
  if (!datasetId) {
31
40
  throw new Error("dataset_id_required");
32
41
  }
@@ -36,30 +45,29 @@ function createFileParseContextDefinition(params) {
36
45
  if (!sandboxId) {
37
46
  throw new Error("dataset_sandbox_required");
38
47
  }
48
+ if (!contextId) {
49
+ throw new Error("dataset_context_id_required");
50
+ }
51
+ if (!executionId) {
52
+ throw new Error("dataset_execution_id_required");
53
+ }
39
54
  const initialized = sandboxState.initialized && sandboxState.filePath
40
55
  ? { filePath: sandboxState.filePath, state: sandboxState }
41
56
  : await initializeFileParseSandboxStep({
42
57
  runtime,
43
58
  sandboxId,
59
+ contextId,
60
+ executionId,
44
61
  datasetId,
45
62
  fileId,
63
+ sourceEventId,
64
+ sourcePartIndex,
65
+ filename: previous?.filename ?? params.filename,
66
+ mediaType: previous?.mediaType ?? params.mediaType,
46
67
  state: sandboxState,
47
68
  });
48
69
  const sandboxFilePath = initialized.filePath;
49
70
  let filePreview = previous?.filePreview ?? previous?.ctx?.filePreview ?? params.filePreview;
50
- if (!filePreview) {
51
- try {
52
- filePreview = await generateFileParsePreviewStep({
53
- runtime,
54
- sandboxId,
55
- sandboxFilePath,
56
- datasetId,
57
- });
58
- }
59
- catch {
60
- // Preview is optional; parsing can still proceed from the file path.
61
- }
62
- }
63
71
  let schema = previous?.ctx?.schema ?? previous?.schema ?? params.schema ?? null;
64
72
  const datasetResult = await datasetGetByIdStep({ runtime, datasetId });
65
73
  if (datasetResult.ok && datasetResult.data.schema) {
@@ -69,7 +77,12 @@ function createFileParseContextDefinition(params) {
69
77
  datasetId,
70
78
  fileId,
71
79
  instructions,
72
- sandboxConfig: { filePath: sandboxFilePath },
80
+ sandboxConfig: {
81
+ filePath: sandboxFilePath,
82
+ outputPath: initialized.state.outputPath,
83
+ scriptsDir: initialized.state.scriptsDir,
84
+ manifestPath: initialized.state.manifestPath,
85
+ },
73
86
  analysis: [],
74
87
  schema,
75
88
  plan: null,
@@ -84,6 +97,11 @@ function createFileParseContextDefinition(params) {
84
97
  fileId,
85
98
  instructions,
86
99
  sandboxId,
100
+ executionId,
101
+ sourceEventId,
102
+ sourcePartIndex,
103
+ filename: previous?.filename ?? params.filename,
104
+ mediaType: previous?.mediaType ?? params.mediaType,
87
105
  sandboxState: initialized.state,
88
106
  filePreview,
89
107
  ctx,
@@ -109,6 +127,7 @@ function createFileParseContextDefinition(params) {
109
127
  const datasetId = _stored?.content?.datasetId ?? fallbackDatasetId ?? "";
110
128
  const fileId = _stored?.content?.fileId ?? params.fileId ?? "";
111
129
  const sandboxId = _stored?.content?.sandboxId ?? params.sandboxId ?? "";
130
+ const outputPath = _stored?.content?.ctx?.sandboxConfig?.outputPath;
112
131
  if (!datasetId)
113
132
  throw new Error("dataset_id_required");
114
133
  if (!fileId)
@@ -125,6 +144,7 @@ function createFileParseContextDefinition(params) {
125
144
  datasetId,
126
145
  sandboxId,
127
146
  runtime,
147
+ outputPath,
128
148
  }),
129
149
  clearDataset: createClearDatasetTool({
130
150
  datasetId,
@@ -169,6 +189,8 @@ export function createFileParseContext(fileId, opts) {
169
189
  sandboxState: opts?.sandboxState,
170
190
  filePreview: opts?.filePreview,
171
191
  schema: opts?.schema,
192
+ filename: opts?.filename,
193
+ mediaType: opts?.mediaType,
172
194
  };
173
195
  const { context } = createFileParseContextDefinition(params);
174
196
  return {
@@ -185,15 +207,24 @@ export function createFileParseContext(fileId, opts) {
185
207
  type: "text",
186
208
  text: options.prompt ?? "generate a dataset for this file",
187
209
  },
210
+ {
211
+ type: "file",
212
+ fileId,
213
+ filename: opts?.filename ?? "resource-file",
214
+ mediaType: opts?.mediaType ?? "application/octet-stream",
215
+ },
188
216
  ],
189
217
  },
190
218
  };
219
+ params.sourceEventId = triggerEvent.id;
220
+ params.sourcePartIndex = 1;
221
+ params.filename = opts?.filename ?? "resource-file";
222
+ params.mediaType = opts?.mediaType ?? "application/octet-stream";
191
223
  const shell = await context.react(triggerEvent, {
192
224
  runtime: runtime,
193
225
  context: { key: `dataset:${datasetId}` },
194
226
  durable: options.durable ?? false,
195
227
  options: {
196
- silent: true,
197
228
  preventClose: true,
198
229
  sendFinish: false,
199
230
  maxIterations: 20,
@@ -203,6 +234,10 @@ export function createFileParseContext(fileId, opts) {
203
234
  ...(options.initialContent ?? {}),
204
235
  datasetId,
205
236
  fileId,
237
+ sourceEventId: triggerEvent.id,
238
+ sourcePartIndex: 1,
239
+ filename: opts?.filename ?? "resource-file",
240
+ mediaType: opts?.mediaType ?? "application/octet-stream",
206
241
  instructions: opts?.instructions ?? "",
207
242
  sandboxId: opts?.sandboxId ?? "",
208
243
  sandboxState: opts?.sandboxState ?? { initialized: false, filePath: "" },
package/dist/file/file-dataset.steps.d.ts CHANGED
@@ -3,8 +3,14 @@ import type { FilePreviewContext } from "./filepreview.types.js";
3
3
  export declare function initializeFileParseSandboxStep(params: {
4
4
  runtime: any;
5
5
  sandboxId: string;
6
+ contextId: string;
7
+ executionId: string;
6
8
  datasetId: string;
7
9
  fileId: string;
10
+ sourceEventId?: string;
11
+ sourcePartIndex?: number;
12
+ filename?: string;
13
+ mediaType?: string;
8
14
  state: SandboxState;
9
15
  }): Promise<{
10
16
  filePath: string;
package/dist/file/file-dataset.steps.js CHANGED
@@ -1,42 +1,39 @@
1
- import { getDatasetSourcesDir, getDatasetStandardDirs, getDatasetWorkstation, } from "../datasetFiles.js";
2
- import { runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep } from "../sandbox/steps.js";
1
+ import { DATASET_OUTPUT_FILE_NAME } from "../datasetFiles.js";
2
+ import { prepareContextExecutionWorkspaceStep } from "../contextWorkspace.js";
3
3
  import { buildFileDatasetPrompt } from "./prompts.js";
4
4
  import { generateFilePreview } from "./filepreview.js";
5
- import { readInstantFileStep } from "./steps.js";
6
5
  export async function initializeFileParseSandboxStep(params) {
7
6
  "use step";
8
7
  if (params.state.initialized) {
9
8
  return { filePath: params.state.filePath, state: params.state };
10
9
  }
11
- console.log(`[FileParseContext ${params.datasetId}] Preparing source file in sandbox...`);
12
- console.log(`[FileParseContext ${params.datasetId}] Fetching file from InstantDB...`);
13
- const file = await readInstantFileStep({ runtime: params.runtime, fileId: params.fileId });
14
- console.log(`[FileParseContext ${params.datasetId}] Creating dataset workstation...`);
15
- const workstation = getDatasetWorkstation(params.datasetId);
16
- await runDatasetSandboxCommandStep({
17
- runtime: params.runtime,
18
- sandboxId: params.sandboxId,
19
- cmd: "mkdir",
20
- args: ["-p", ...getDatasetStandardDirs(params.datasetId)],
21
- });
22
- const fileName = file.contentDisposition ?? "";
23
- const fileExtension = fileName.includes(".") ? fileName.substring(fileName.lastIndexOf(".")) : "";
24
- const sandboxFilePath = `${getDatasetSourcesDir(params.datasetId)}/${params.fileId}${fileExtension}`;
25
- await writeDatasetSandboxFilesStep({
10
+ console.log(`[FileParseContext ${params.datasetId}] Preparing context execution workspace...`);
11
+ const workspace = await prepareContextExecutionWorkspaceStep({
26
12
  runtime: params.runtime,
27
13
  sandboxId: params.sandboxId,
14
+ contextId: params.contextId,
15
+ executionId: params.executionId,
28
16
  files: [
29
17
  {
30
- path: sandboxFilePath,
31
- contentBase64: file.contentBase64,
18
+ fileId: params.fileId,
19
+ filename: params.filename,
20
+ mediaType: params.mediaType,
21
+ sourceEventId: params.sourceEventId,
22
+ sourcePartIndex: params.sourcePartIndex,
32
23
  },
33
24
  ],
34
25
  });
35
- console.log(`[FileParseContext ${params.datasetId}] Workstation created: ${workstation}`);
26
+ const sandboxFilePath = workspace.files[0]?.path ?? "";
27
+ if (!sandboxFilePath)
28
+ throw new Error("dataset_workspace_file_missing");
29
+ console.log(`[FileParseContext ${params.datasetId}] Context workspace created: ${workspace.root}`);
36
30
  console.log(`[FileParseContext ${params.datasetId}] File saved: ${sandboxFilePath}`);
37
31
  const state = {
38
32
  initialized: true,
39
33
  filePath: sandboxFilePath,
34
+ outputPath: `${workspace.outputDir}/${DATASET_OUTPUT_FILE_NAME}`,
35
+ scriptsDir: workspace.scriptsDir,
36
+ manifestPath: workspace.manifestPath,
40
37
  };
41
38
  return { filePath: sandboxFilePath, state };
42
39
  }
package/dist/file/file-dataset.types.d.ts CHANGED
@@ -3,6 +3,9 @@ import type { FilePreviewContext } from "./filepreview.types.js";
3
3
  export type SandboxState = {
4
4
  initialized: boolean;
5
5
  filePath: string;
6
+ outputPath?: string;
7
+ scriptsDir?: string;
8
+ manifestPath?: string;
6
9
  };
7
10
  export type FileParseContext = {
8
11
  datasetId: string;
@@ -10,6 +13,9 @@ export type FileParseContext = {
10
13
  instructions: string;
11
14
  sandboxConfig: {
12
15
  filePath: string;
16
+ outputPath?: string;
17
+ scriptsDir?: string;
18
+ manifestPath?: string;
13
19
  };
14
20
  analysis: any[];
15
21
  schema: any | null;
@@ -29,6 +35,10 @@ export type FileParseContextParams = {
29
35
  sandboxState?: SandboxState;
30
36
  filePreview?: FilePreviewContext;
31
37
  schema?: any | null;
38
+ sourceEventId?: string;
39
+ sourcePartIndex?: number;
40
+ filename?: string;
41
+ mediaType?: string;
32
42
  };
33
43
  export type FileParseRunOptions = {
34
44
  prompt?: string;
package/dist/file/prompts.js CHANGED
@@ -11,13 +11,13 @@ function buildRole() {
11
11
  function buildGoal() {
12
12
  let xml = create()
13
13
  .ele("Goal")
14
- .txt("Convert the source file into a validated JSONL dataset (output.jsonl) where each line is a JSON object conforming to a generated schema. The schema describes ONE data record structure. Extract ONLY data records; exclude any header sections, metadata, or summary information from the file.")
14
+ .txt("Convert the input file into a validated JSONL dataset (output.jsonl) where each line is a JSON object conforming to a generated schema. The schema describes ONE data record structure. Extract ONLY data records; exclude any header sections, metadata, or summary information from the file.")
15
15
  .up();
16
16
  return xml.end({ prettyPrint: true, headless: true });
17
17
  }
18
- function buildSourceInfo(context) {
18
+ function buildResourceInfo(context) {
19
19
  let xml = create()
20
- .ele("Source")
20
+ .ele("FileResource")
21
21
  .ele("Type").txt("file").up()
22
22
  .ele("FileId").txt(context.fileId).up()
23
23
  .ele("DatasetId").txt(context.datasetId).up()
@@ -90,7 +90,7 @@ function buildErrorsSection(errors) {
90
90
  }
91
91
  let xml = create()
92
92
  .ele("PreviousErrors")
93
- .ele("Instruction").txt("Treat these as repair feedback from the previous validation attempt. Rewrite output.jsonl from the schema contract; do not patch source column names into schema keys piecemeal.").up();
93
+ .ele("Instruction").txt("Treat these as repair feedback from the previous validation attempt. Rewrite output.jsonl from the schema contract; do not patch input column names into schema keys piecemeal.").up();
94
94
  for (const error of errors) {
95
95
  xml = xml.ele("Error").txt(error).up();
96
96
  }
@@ -100,8 +100,8 @@ function buildErrorsSection(errors) {
100
100
  function buildContextSection(context) {
101
101
  let xml = create()
102
102
  .ele("Context");
103
- const sourceXml = buildSourceInfo(context);
104
- xml = xml.import(sourceXml.first());
103
+ const resourceXml = buildResourceInfo(context);
104
+ xml = xml.import(resourceXml.first());
105
105
  if (context.filePreview) {
106
106
  const previewXml = buildFilePreviewSection(context.filePreview);
107
107
  xml = xml.import(previewXml.first());
@@ -195,9 +195,9 @@ function buildSchemaSection(context) {
195
195
  xml = xml
196
196
  .ele("SchemaContract")
197
197
  .ele("Purpose").txt("Compact output contract derived from JSON Schema. Use this before writing output.jsonl.").up()
198
- .ele("Rule").txt("Use only schema property keys in data objects. Source headers are input labels, not output keys.").up()
198
+ .ele("Rule").txt("Use only schema property keys in data objects. Input headers are input labels, not output keys.").up()
199
199
  .ele("Rule").txt("Required paths are required everywhere, including nested objects and array items.").up()
200
- .ele("Rule").txt("Enum fields must use exactly one of the listed literal values. Normalize source labels to the closest valid enum literal; never emit a value outside the enum.").up();
200
+ .ele("Rule").txt("Enum fields must use exactly one of the listed literal values. Normalize input labels to the closest valid enum literal; never emit a value outside the enum.").up();
201
201
  xml = appendLimitedList(xml, "RequiredPaths", "Path", contract.requiredPaths, 120);
202
202
  xml = appendLimitedList(xml, "PropertyPaths", "Path", contract.propertyPaths, 160);
203
203
  let enumsXml = xml.ele("EnumConstraints");
@@ -223,8 +223,10 @@ function buildSchemaSection(context) {
223
223
  return xml.end({ prettyPrint: true, headless: true });
224
224
  }
225
225
  function buildInstructions(context) {
226
- const datasetWorkstation = getDatasetWorkstation(context.datasetId);
227
- const outputPath = getDatasetOutputPath(context.datasetId);
226
+ const datasetWorkstation = context.sandboxConfig.scriptsDir
227
+ ? context.sandboxConfig.scriptsDir.replace(/\/scripts$/, "")
228
+ : getDatasetWorkstation(context.datasetId);
229
+ const outputPath = context.sandboxConfig.outputPath ?? getDatasetOutputPath(context.datasetId);
228
230
  const hasProvidedSchema = Boolean(context.schema?.schema);
229
231
  const currentTask = hasProvidedSchema
230
232
  ? "Review FilePreview section, use the provided schema as the output contract, then parse the file and generate the dataset"
@@ -243,10 +245,10 @@ function buildInstructions(context) {
243
245
  .ele("Requirements")
244
246
  .ele("Requirement").txt("Every output row must conform exactly to the provided schema").up()
245
247
  .ele("Requirement").txt("Every data object MUST use the exact property names from the provided JSON Schema required/properties keys").up()
246
- .ele("Requirement").txt("Build a schema-first mapping from source columns to schema fields before writing output.jsonl. Do not use raw source headers as JSON keys unless they are exactly schema keys").up()
248
+ .ele("Requirement").txt("Build a schema-first mapping from input columns to schema fields before writing output.jsonl. Do not use raw input headers as JSON keys unless they are exactly schema keys").up()
247
249
  .ele("Requirement").txt("For nested required fields, populate the required child keys inside each nested object or array item; top-level validity is not enough").up()
248
250
  .ele("Requirement").txt("For enum fields, emit exactly one allowed enum literal from SchemaContract; normalize labels or abbreviations into allowed literals").up()
249
- .ele("Requirement").txt("Do not translate, localize, rename, camelize differently, or infer alternative field names. Field names are a technical contract; only field values may preserve the source language").up()
251
+ .ele("Requirement").txt("Do not translate, localize, rename, camelize differently, or infer alternative field names. Field names are a technical contract; only field values may preserve the input language").up()
250
252
  .ele("Requirement").txt("Do not call generateSchema when a schema is already provided").up()
251
253
  .up()
252
254
  .up();
@@ -284,8 +286,8 @@ function buildInstructions(context) {
284
286
  .up()
285
287
  .ele("Rules")
286
288
  .ele("Rule").txt("Schema defines ONE DATA RECORD structure (not array, not header)").up()
287
- .ele("Rule").txt("Schema property names are authoritative. Never translate or rename keys such as itemName, quantity, or unit into the source language").up()
288
- .ele("Rule").txt("Original/source language applies to extracted values only, not to JSON object keys").up()
289
+ .ele("Rule").txt("Schema property names are authoritative. Never translate or rename keys such as itemName, quantity, or unit into the input language").up()
290
+ .ele("Rule").txt("Original/input language applies to extracted values only, not to JSON object keys").up()
289
291
  .ele("Rule").txt("Datasets contain ONLY data records; exclude all header sections and file metadata").up()
290
292
  .ele("Rule").txt("JSONL format: each line = separate JSON object representing one data record").up()
291
293
  .ele("Rule").txt("FilePreview shows raw file content - use Script to understand data extraction").up()
package/dist/index.d.ts CHANGED
@@ -1,4 +1,5 @@
1
1
  export * from "./dataset.js";
2
+ export * from "./contextWorkspace.js";
2
3
  export * from "./domain.js";
3
4
  export * from "./materializeDataset.tool.js";
4
5
  export * from "./schema.js";
package/dist/index.js CHANGED
@@ -1,4 +1,5 @@
1
1
  export * from "./dataset.js";
2
+ export * from "./contextWorkspace.js";
2
3
  export * from "./domain.js";
3
4
  export * from "./materializeDataset.tool.js";
4
5
  export * from "./schema.js";