@ekairos/dataset 1.22.82-beta.development.0 → 1.22.84-beta.development.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/builder/agentMaterializers.d.ts +2 -2
- package/dist/builder/context.d.ts +7 -0
- package/dist/builder/context.js +192 -0
- package/dist/builder/instructions.d.ts +3 -3
- package/dist/builder/instructions.js +10 -10
- package/dist/builder/materialize.d.ts +12 -11
- package/dist/builder/materialize.js +122 -121
- package/dist/builder/materializeQuery.d.ts +3 -2
- package/dist/builder/materializeQuery.js +10 -19
- package/dist/builder/persistence.d.ts +4 -5
- package/dist/builder/persistence.js +20 -19
- package/dist/builder/types.d.ts +31 -24
- package/dist/completeDataset.steps.d.ts +9 -8
- package/dist/completeDataset.steps.js +18 -11
- package/dist/completeDataset.tool.d.ts +9 -8
- package/dist/completeDataset.tool.js +2 -1
- package/dist/contextWorkspace.d.ts +72 -0
- package/dist/contextWorkspace.js +218 -0
- package/dist/dataset.d.ts +1 -1
- package/dist/dataset.js +42 -29
- package/dist/datasetFiles.d.ts +1 -1
- package/dist/datasetFiles.js +3 -3
- package/dist/executeCommand.tool.d.ts +1 -43
- package/dist/executeCommand.tool.js +10 -3
- package/dist/file/file-dataset.agent.d.ts +2 -0
- package/dist/file/file-dataset.agent.js +51 -16
- package/dist/file/file-dataset.steps.d.ts +6 -0
- package/dist/file/file-dataset.steps.js +18 -21
- package/dist/file/file-dataset.types.d.ts +10 -0
- package/dist/file/prompts.js +16 -14
- package/dist/index.d.ts +1 -0
- package/dist/index.js +1 -0
- package/dist/materializeDataset.tool.d.ts +34 -26
- package/dist/materializeDataset.tool.js +40 -29
- package/dist/schema.d.ts +12 -2
- package/dist/schema.js +6 -3
- package/dist/service.d.ts +2 -2
- package/dist/service.js +6 -3
- package/dist/transform/filepreview.d.ts +2 -2
- package/dist/transform/filepreview.js +3 -3
- package/dist/transform/prompts.js +25 -25
- package/dist/transform/transform-dataset.agent.d.ts +4 -4
- package/dist/transform/transform-dataset.agent.js +29 -30
- package/dist/transform/transform-dataset.steps.d.ts +7 -7
- package/dist/transform/transform-dataset.steps.js +20 -20
- package/dist/transform/transform-dataset.types.d.ts +13 -13
- package/dist/transform/transformDataset.js +4 -4
- package/package.json +4 -4
- /package/dist/builder/{sourceRows.d.ts → rows.d.ts} +0 -0
- /package/dist/builder/{sourceRows.js → rows.js} +0 -0
package/dist/dataset.js
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
import { buildObjectOutputInstructions } from "./builder/instructions.js";
|
|
2
|
+
import { resolveDatasetResourceContext } from "./builder/context.js";
|
|
2
3
|
import { createDatasetId } from "./id.js";
|
|
3
|
-
import { completeDatasetStep, materializeDerivedDataset,
|
|
4
|
-
import {
|
|
4
|
+
import { completeDatasetStep, materializeDerivedDataset, materializeSingleFileLikeResource, } from "./builder/materialize.js";
|
|
5
|
+
import { materializeQueryResource } from "./builder/materializeQuery.js";
|
|
5
6
|
import { createDatasetBuildResult, finalizeBuildResult, } from "./builder/persistence.js";
|
|
6
7
|
export function dataset(runtime, options = {}) {
|
|
7
8
|
const datasetId = normalizeDatasetId(options.datasetId);
|
|
@@ -9,7 +10,7 @@ export function dataset(runtime, options = {}) {
|
|
|
9
10
|
const state = {
|
|
10
11
|
runtime: typedRuntime,
|
|
11
12
|
env: typedRuntime.env,
|
|
12
|
-
|
|
13
|
+
resources: [],
|
|
13
14
|
output: "rows",
|
|
14
15
|
inferSchema: false,
|
|
15
16
|
durable: options.durable,
|
|
@@ -17,38 +18,46 @@ export function dataset(runtime, options = {}) {
|
|
|
17
18
|
};
|
|
18
19
|
const api = {
|
|
19
20
|
datasetId,
|
|
20
|
-
fromFile(
|
|
21
|
-
state.
|
|
21
|
+
fromFile(resource) {
|
|
22
|
+
state.resources.push({ kind: "file", ...resource });
|
|
22
23
|
return api;
|
|
23
24
|
},
|
|
24
|
-
fromText(
|
|
25
|
-
state.
|
|
25
|
+
fromText(resource) {
|
|
26
|
+
state.resources.push({ kind: "text", ...resource });
|
|
26
27
|
return api;
|
|
27
28
|
},
|
|
28
|
-
fromDataset(
|
|
29
|
-
state.
|
|
29
|
+
fromDataset(resource) {
|
|
30
|
+
state.resources.push({ kind: "dataset", ...resource });
|
|
30
31
|
return api;
|
|
31
32
|
},
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
33
|
+
fromContext(context) {
|
|
34
|
+
state.resources.push({ kind: "context", ...context });
|
|
35
|
+
return api;
|
|
36
|
+
},
|
|
37
|
+
from(...resources) {
|
|
38
|
+
for (const resource of resources) {
|
|
39
|
+
if ("kind" in resource) {
|
|
40
|
+
state.resources.push(resource);
|
|
41
|
+
continue;
|
|
42
|
+
}
|
|
43
|
+
if ("fileId" in resource) {
|
|
44
|
+
state.resources.push({ kind: "file", ...resource });
|
|
36
45
|
continue;
|
|
37
46
|
}
|
|
38
|
-
if ("
|
|
39
|
-
state.
|
|
47
|
+
if ("datasetId" in resource) {
|
|
48
|
+
state.resources.push({ kind: "dataset", ...resource });
|
|
40
49
|
continue;
|
|
41
50
|
}
|
|
42
|
-
if ("
|
|
43
|
-
state.
|
|
51
|
+
if ("id" in resource || "key" in resource) {
|
|
52
|
+
state.resources.push({ kind: "context", ...resource });
|
|
44
53
|
continue;
|
|
45
54
|
}
|
|
46
|
-
state.
|
|
55
|
+
state.resources.push({ kind: "text", ...resource });
|
|
47
56
|
}
|
|
48
57
|
return api;
|
|
49
58
|
},
|
|
50
|
-
fromQuery(domain,
|
|
51
|
-
state.
|
|
59
|
+
fromQuery(domain, resource) {
|
|
60
|
+
state.resources.push({ kind: "query", domain, ...resource });
|
|
52
61
|
return api;
|
|
53
62
|
},
|
|
54
63
|
title(title) {
|
|
@@ -96,8 +105,8 @@ export function dataset(runtime, options = {}) {
|
|
|
96
105
|
return api;
|
|
97
106
|
},
|
|
98
107
|
async build(options) {
|
|
99
|
-
if (state.
|
|
100
|
-
throw new Error("
|
|
108
|
+
if (state.resources.length === 0) {
|
|
109
|
+
throw new Error("dataset_resources_required");
|
|
101
110
|
}
|
|
102
111
|
const targetDatasetId = options?.datasetId
|
|
103
112
|
? normalizeDatasetId(options.datasetId)
|
|
@@ -106,6 +115,9 @@ export function dataset(runtime, options = {}) {
|
|
|
106
115
|
...state,
|
|
107
116
|
durable: options?.durable ?? state.durable,
|
|
108
117
|
};
|
|
118
|
+
const context = await resolveDatasetResourceContext(typedRuntime, targetDatasetId, stateWithBuildOptions.resources);
|
|
119
|
+
stateWithBuildOptions.resources = context.resources;
|
|
120
|
+
stateWithBuildOptions.contextId = context.contextId;
|
|
109
121
|
const effectiveState = stateWithBuildOptions.output === "object"
|
|
110
122
|
? {
|
|
111
123
|
...stateWithBuildOptions,
|
|
@@ -113,25 +125,26 @@ export function dataset(runtime, options = {}) {
|
|
|
113
125
|
instructions: buildObjectOutputInstructions(stateWithBuildOptions.instructions),
|
|
114
126
|
}
|
|
115
127
|
: stateWithBuildOptions;
|
|
116
|
-
const
|
|
117
|
-
const
|
|
128
|
+
const onlyResource = effectiveState.resources[0];
|
|
129
|
+
const isSingleResource = effectiveState.resources.length === 1;
|
|
118
130
|
const hasInstructions = Boolean(String(effectiveState.instructions ?? "").trim());
|
|
119
|
-
if (
|
|
120
|
-
await
|
|
131
|
+
if (isSingleResource && onlyResource.kind === "query" && !hasInstructions) {
|
|
132
|
+
await materializeQueryResource(effectiveState.runtime, onlyResource, {
|
|
121
133
|
datasetId: targetDatasetId,
|
|
122
134
|
sandboxId: effectiveState.sandboxId,
|
|
123
135
|
schema: effectiveState.outputSchema,
|
|
124
|
-
title: effectiveState.title ??
|
|
136
|
+
title: effectiveState.title ?? onlyResource.title,
|
|
125
137
|
instructions: effectiveState.instructions,
|
|
126
138
|
first: effectiveState.first,
|
|
139
|
+
contextId: effectiveState.contextId ?? "",
|
|
127
140
|
});
|
|
128
141
|
return finalizeOutputResult(await finalizeBuildResult(effectiveState.runtime, targetDatasetId, effectiveState.first), effectiveState.output);
|
|
129
142
|
}
|
|
130
|
-
if (
|
|
143
|
+
if (isSingleResource && (onlyResource.kind === "file" || onlyResource.kind === "text")) {
|
|
131
144
|
if (!effectiveState.reactor) {
|
|
132
145
|
throw new Error("dataset_reactor_required");
|
|
133
146
|
}
|
|
134
|
-
await
|
|
147
|
+
await materializeSingleFileLikeResource(effectiveState, onlyResource, targetDatasetId);
|
|
135
148
|
const completed = await completeDatasetStep({
|
|
136
149
|
runtime: effectiveState.runtime,
|
|
137
150
|
datasetId: targetDatasetId,
|
package/dist/datasetFiles.d.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
export declare const DATASET_OUTPUT_FILE_NAME = "output.jsonl";
|
|
2
2
|
export declare function getDatasetWorkdirBase(): string;
|
|
3
3
|
export declare function getDatasetWorkstation(datasetId: string): string;
|
|
4
|
-
export declare function
|
|
4
|
+
export declare function getDatasetResourcesDir(datasetId: string): string;
|
|
5
5
|
export declare function getDatasetScriptsDir(datasetId: string): string;
|
|
6
6
|
export declare function getDatasetArtifactsDir(datasetId: string): string;
|
|
7
7
|
export declare function getDatasetLogsDir(datasetId: string): string;
|
package/dist/datasetFiles.js
CHANGED
|
@@ -9,8 +9,8 @@ export function getDatasetWorkdirBase() {
|
|
|
9
9
|
export function getDatasetWorkstation(datasetId) {
|
|
10
10
|
return `${getDatasetWorkdirBase()}/${datasetId}`;
|
|
11
11
|
}
|
|
12
|
-
export function
|
|
13
|
-
return `${getDatasetWorkstation(datasetId)}/
|
|
12
|
+
export function getDatasetResourcesDir(datasetId) {
|
|
13
|
+
return `${getDatasetWorkstation(datasetId)}/resources`;
|
|
14
14
|
}
|
|
15
15
|
export function getDatasetScriptsDir(datasetId) {
|
|
16
16
|
return `${getDatasetWorkstation(datasetId)}/scripts`;
|
|
@@ -24,7 +24,7 @@ export function getDatasetLogsDir(datasetId) {
|
|
|
24
24
|
export function getDatasetStandardDirs(datasetId) {
|
|
25
25
|
return [
|
|
26
26
|
getDatasetWorkstation(datasetId),
|
|
27
|
-
|
|
27
|
+
getDatasetResourcesDir(datasetId),
|
|
28
28
|
getDatasetScriptsDir(datasetId),
|
|
29
29
|
getDatasetArtifactsDir(datasetId),
|
|
30
30
|
getDatasetLogsDir(datasetId),
|
|
@@ -6,47 +6,5 @@ interface ExecuteCommandToolParams {
|
|
|
6
6
|
export declare function createExecuteCommandTool({ datasetId, sandboxId, runtime }: ExecuteCommandToolParams): import("ai").Tool<{
|
|
7
7
|
pythonCode: string;
|
|
8
8
|
scriptName: string;
|
|
9
|
-
},
|
|
10
|
-
success: boolean;
|
|
11
|
-
fatal: boolean;
|
|
12
|
-
status: string;
|
|
13
|
-
error: string;
|
|
14
|
-
stdout: string;
|
|
15
|
-
stderr: string;
|
|
16
|
-
exitCode: number;
|
|
17
|
-
scriptPath: string;
|
|
18
|
-
stdoutTruncated: boolean;
|
|
19
|
-
stderrTruncated: boolean;
|
|
20
|
-
stdoutOriginalLength: number;
|
|
21
|
-
stderrOriginalLength: number;
|
|
22
|
-
message?: undefined;
|
|
23
|
-
} | {
|
|
24
|
-
success: boolean;
|
|
25
|
-
exitCode: number;
|
|
26
|
-
stdout: string;
|
|
27
|
-
stderr: string;
|
|
28
|
-
scriptPath: string;
|
|
29
|
-
error: string;
|
|
30
|
-
stdoutTruncated: boolean;
|
|
31
|
-
stderrTruncated: boolean;
|
|
32
|
-
stdoutOriginalLength: number;
|
|
33
|
-
stderrOriginalLength: number;
|
|
34
|
-
fatal?: undefined;
|
|
35
|
-
status?: undefined;
|
|
36
|
-
message?: undefined;
|
|
37
|
-
} | {
|
|
38
|
-
success: boolean;
|
|
39
|
-
exitCode: number;
|
|
40
|
-
stdout: string;
|
|
41
|
-
stderr: string;
|
|
42
|
-
scriptPath: string;
|
|
43
|
-
message: string;
|
|
44
|
-
stdoutTruncated: boolean;
|
|
45
|
-
stderrTruncated: boolean;
|
|
46
|
-
stdoutOriginalLength: number;
|
|
47
|
-
stderrOriginalLength: number;
|
|
48
|
-
fatal?: undefined;
|
|
49
|
-
status?: undefined;
|
|
50
|
-
error?: undefined;
|
|
51
|
-
}>;
|
|
9
|
+
}, unknown>;
|
|
52
10
|
export {};
|
|
@@ -2,6 +2,7 @@ import { tool } from "ai";
|
|
|
2
2
|
import { z } from "zod";
|
|
3
3
|
import { runDatasetSandboxCommandStep, writeDatasetSandboxTextFilesStep } from "./sandbox/steps.js";
|
|
4
4
|
import { getDatasetScriptsDir } from "./datasetFiles.js";
|
|
5
|
+
import { getContextExecutionWorkspaceDirs } from "./contextWorkspace.js";
|
|
5
6
|
// To keep responses predictable for big data scenarios, we cap stdout/stderr.
|
|
6
7
|
// The tool's return payload exposes stdout (capped) plus the on-disk script path.
|
|
7
8
|
const MAX_STDOUT_CHARS = 20000;
|
|
@@ -29,10 +30,16 @@ export function createExecuteCommandTool({ datasetId, sandboxId, runtime }) {
|
|
|
29
30
|
pythonCode: z.string().describe("Python code to execute. Saved to a file before running. MANDATORY: Use print() to report progress and final results. Keep prints concise; avoid dumping rows/JSON. For large outputs, write to files in the workstation directory and print only file paths and brief summaries."),
|
|
30
31
|
scriptName: z.string().describe("Name for the script file in snake_case (e.g., 'inspect_file', 'parse_csv', 'generate_dataset'). A deterministic suffix will be appended automatically."),
|
|
31
32
|
}),
|
|
32
|
-
execute: async ({ pythonCode, scriptName }) => {
|
|
33
|
+
execute: (async ({ pythonCode, scriptName }, actionContext) => {
|
|
33
34
|
const normalizedScriptName = normalizeScriptName(scriptName);
|
|
34
35
|
const scriptHash = stableScriptHash(`${normalizedScriptName}\0${pythonCode}`);
|
|
35
|
-
const
|
|
36
|
+
const scriptsDir = actionContext?.contextId && actionContext.executionId
|
|
37
|
+
? getContextExecutionWorkspaceDirs({
|
|
38
|
+
contextId: actionContext.contextId,
|
|
39
|
+
executionId: actionContext.executionId,
|
|
40
|
+
}).scriptsDir
|
|
41
|
+
: getDatasetScriptsDir(datasetId);
|
|
42
|
+
const scriptFile = `${scriptsDir}/${normalizedScriptName}-${scriptHash}.py`;
|
|
36
43
|
console.log(`[Dataset ${datasetId}] ========================================`);
|
|
37
44
|
console.log(`[Dataset ${datasetId}] Tool: executeCommand`);
|
|
38
45
|
console.log(`[Dataset ${datasetId}] Script: ${normalizedScriptName}`);
|
|
@@ -162,6 +169,6 @@ export function createExecuteCommandTool({ datasetId, sandboxId, runtime }) {
|
|
|
162
169
|
stderrOriginalLength: 0,
|
|
163
170
|
};
|
|
164
171
|
}
|
|
165
|
-
},
|
|
172
|
+
}),
|
|
166
173
|
});
|
|
167
174
|
}
|
|
@@ -12,6 +12,8 @@ export declare function createFileParseContext<Env extends {
|
|
|
12
12
|
sandboxState?: SandboxState;
|
|
13
13
|
filePreview?: FileParseContext["filePreview"];
|
|
14
14
|
schema?: any | null;
|
|
15
|
+
filename?: string;
|
|
16
|
+
mediaType?: string;
|
|
15
17
|
}): {
|
|
16
18
|
datasetId: string;
|
|
17
19
|
parse(runtime: {
|
|
@@ -4,7 +4,7 @@ import { createCompleteDatasetTool, didCompleteDatasetSucceed, getDatasetFatalFa
|
|
|
4
4
|
import { datasetGetByIdStep } from "../dataset/steps.js";
|
|
5
5
|
import { createExecuteCommandTool } from "../executeCommand.tool.js";
|
|
6
6
|
import { createGenerateSchemaTool } from "./generateSchema.tool.js";
|
|
7
|
-
import { buildFileDatasetPromptStep,
|
|
7
|
+
import { buildFileDatasetPromptStep, initializeFileParseSandboxStep, } from "./file-dataset.steps.js";
|
|
8
8
|
import { createDatasetId } from "../id.js";
|
|
9
9
|
async function awaitContextRun(run) {
|
|
10
10
|
if (!run)
|
|
@@ -27,6 +27,15 @@ function createFileParseContextDefinition(params) {
|
|
|
27
27
|
const fileId = previous?.fileId ?? params.fileId ?? "";
|
|
28
28
|
const instructions = previous?.instructions ?? params.instructions ?? "";
|
|
29
29
|
const sandboxId = previous?.sandboxId ?? params.sandboxId ?? "";
|
|
30
|
+
const contextRun = runtime?.__ekairosContextRun ?? {};
|
|
31
|
+
const contextId = String(contextRun.contextId ?? stored?.id ?? "").trim();
|
|
32
|
+
const executionId = String(contextRun.executionId ?? previous?.executionId ?? "").trim();
|
|
33
|
+
const sourceEventId = String(previous?.sourceEventId ?? params.sourceEventId ?? "").trim();
|
|
34
|
+
const sourcePartIndex = typeof previous?.sourcePartIndex === "number"
|
|
35
|
+
? previous.sourcePartIndex
|
|
36
|
+
: typeof params.sourcePartIndex === "number"
|
|
37
|
+
? params.sourcePartIndex
|
|
38
|
+
: 0;
|
|
30
39
|
if (!datasetId) {
|
|
31
40
|
throw new Error("dataset_id_required");
|
|
32
41
|
}
|
|
@@ -36,30 +45,29 @@ function createFileParseContextDefinition(params) {
|
|
|
36
45
|
if (!sandboxId) {
|
|
37
46
|
throw new Error("dataset_sandbox_required");
|
|
38
47
|
}
|
|
48
|
+
if (!contextId) {
|
|
49
|
+
throw new Error("dataset_context_id_required");
|
|
50
|
+
}
|
|
51
|
+
if (!executionId) {
|
|
52
|
+
throw new Error("dataset_execution_id_required");
|
|
53
|
+
}
|
|
39
54
|
const initialized = sandboxState.initialized && sandboxState.filePath
|
|
40
55
|
? { filePath: sandboxState.filePath, state: sandboxState }
|
|
41
56
|
: await initializeFileParseSandboxStep({
|
|
42
57
|
runtime,
|
|
43
58
|
sandboxId,
|
|
59
|
+
contextId,
|
|
60
|
+
executionId,
|
|
44
61
|
datasetId,
|
|
45
62
|
fileId,
|
|
63
|
+
sourceEventId,
|
|
64
|
+
sourcePartIndex,
|
|
65
|
+
filename: previous?.filename ?? params.filename,
|
|
66
|
+
mediaType: previous?.mediaType ?? params.mediaType,
|
|
46
67
|
state: sandboxState,
|
|
47
68
|
});
|
|
48
69
|
const sandboxFilePath = initialized.filePath;
|
|
49
70
|
let filePreview = previous?.filePreview ?? previous?.ctx?.filePreview ?? params.filePreview;
|
|
50
|
-
if (!filePreview) {
|
|
51
|
-
try {
|
|
52
|
-
filePreview = await generateFileParsePreviewStep({
|
|
53
|
-
runtime,
|
|
54
|
-
sandboxId,
|
|
55
|
-
sandboxFilePath,
|
|
56
|
-
datasetId,
|
|
57
|
-
});
|
|
58
|
-
}
|
|
59
|
-
catch {
|
|
60
|
-
// Preview is optional; parsing can still proceed from the file path.
|
|
61
|
-
}
|
|
62
|
-
}
|
|
63
71
|
let schema = previous?.ctx?.schema ?? previous?.schema ?? params.schema ?? null;
|
|
64
72
|
const datasetResult = await datasetGetByIdStep({ runtime, datasetId });
|
|
65
73
|
if (datasetResult.ok && datasetResult.data.schema) {
|
|
@@ -69,7 +77,12 @@ function createFileParseContextDefinition(params) {
|
|
|
69
77
|
datasetId,
|
|
70
78
|
fileId,
|
|
71
79
|
instructions,
|
|
72
|
-
sandboxConfig: {
|
|
80
|
+
sandboxConfig: {
|
|
81
|
+
filePath: sandboxFilePath,
|
|
82
|
+
outputPath: initialized.state.outputPath,
|
|
83
|
+
scriptsDir: initialized.state.scriptsDir,
|
|
84
|
+
manifestPath: initialized.state.manifestPath,
|
|
85
|
+
},
|
|
73
86
|
analysis: [],
|
|
74
87
|
schema,
|
|
75
88
|
plan: null,
|
|
@@ -84,6 +97,11 @@ function createFileParseContextDefinition(params) {
|
|
|
84
97
|
fileId,
|
|
85
98
|
instructions,
|
|
86
99
|
sandboxId,
|
|
100
|
+
executionId,
|
|
101
|
+
sourceEventId,
|
|
102
|
+
sourcePartIndex,
|
|
103
|
+
filename: previous?.filename ?? params.filename,
|
|
104
|
+
mediaType: previous?.mediaType ?? params.mediaType,
|
|
87
105
|
sandboxState: initialized.state,
|
|
88
106
|
filePreview,
|
|
89
107
|
ctx,
|
|
@@ -109,6 +127,7 @@ function createFileParseContextDefinition(params) {
|
|
|
109
127
|
const datasetId = _stored?.content?.datasetId ?? fallbackDatasetId ?? "";
|
|
110
128
|
const fileId = _stored?.content?.fileId ?? params.fileId ?? "";
|
|
111
129
|
const sandboxId = _stored?.content?.sandboxId ?? params.sandboxId ?? "";
|
|
130
|
+
const outputPath = _stored?.content?.ctx?.sandboxConfig?.outputPath;
|
|
112
131
|
if (!datasetId)
|
|
113
132
|
throw new Error("dataset_id_required");
|
|
114
133
|
if (!fileId)
|
|
@@ -125,6 +144,7 @@ function createFileParseContextDefinition(params) {
|
|
|
125
144
|
datasetId,
|
|
126
145
|
sandboxId,
|
|
127
146
|
runtime,
|
|
147
|
+
outputPath,
|
|
128
148
|
}),
|
|
129
149
|
clearDataset: createClearDatasetTool({
|
|
130
150
|
datasetId,
|
|
@@ -169,6 +189,8 @@ export function createFileParseContext(fileId, opts) {
|
|
|
169
189
|
sandboxState: opts?.sandboxState,
|
|
170
190
|
filePreview: opts?.filePreview,
|
|
171
191
|
schema: opts?.schema,
|
|
192
|
+
filename: opts?.filename,
|
|
193
|
+
mediaType: opts?.mediaType,
|
|
172
194
|
};
|
|
173
195
|
const { context } = createFileParseContextDefinition(params);
|
|
174
196
|
return {
|
|
@@ -185,15 +207,24 @@ export function createFileParseContext(fileId, opts) {
|
|
|
185
207
|
type: "text",
|
|
186
208
|
text: options.prompt ?? "generate a dataset for this file",
|
|
187
209
|
},
|
|
210
|
+
{
|
|
211
|
+
type: "file",
|
|
212
|
+
fileId,
|
|
213
|
+
filename: opts?.filename ?? "resource-file",
|
|
214
|
+
mediaType: opts?.mediaType ?? "application/octet-stream",
|
|
215
|
+
},
|
|
188
216
|
],
|
|
189
217
|
},
|
|
190
218
|
};
|
|
219
|
+
params.sourceEventId = triggerEvent.id;
|
|
220
|
+
params.sourcePartIndex = 1;
|
|
221
|
+
params.filename = opts?.filename ?? "resource-file";
|
|
222
|
+
params.mediaType = opts?.mediaType ?? "application/octet-stream";
|
|
191
223
|
const shell = await context.react(triggerEvent, {
|
|
192
224
|
runtime: runtime,
|
|
193
225
|
context: { key: `dataset:${datasetId}` },
|
|
194
226
|
durable: options.durable ?? false,
|
|
195
227
|
options: {
|
|
196
|
-
silent: true,
|
|
197
228
|
preventClose: true,
|
|
198
229
|
sendFinish: false,
|
|
199
230
|
maxIterations: 20,
|
|
@@ -203,6 +234,10 @@ export function createFileParseContext(fileId, opts) {
|
|
|
203
234
|
...(options.initialContent ?? {}),
|
|
204
235
|
datasetId,
|
|
205
236
|
fileId,
|
|
237
|
+
sourceEventId: triggerEvent.id,
|
|
238
|
+
sourcePartIndex: 1,
|
|
239
|
+
filename: opts?.filename ?? "resource-file",
|
|
240
|
+
mediaType: opts?.mediaType ?? "application/octet-stream",
|
|
206
241
|
instructions: opts?.instructions ?? "",
|
|
207
242
|
sandboxId: opts?.sandboxId ?? "",
|
|
208
243
|
sandboxState: opts?.sandboxState ?? { initialized: false, filePath: "" },
|
|
@@ -3,8 +3,14 @@ import type { FilePreviewContext } from "./filepreview.types.js";
|
|
|
3
3
|
export declare function initializeFileParseSandboxStep(params: {
|
|
4
4
|
runtime: any;
|
|
5
5
|
sandboxId: string;
|
|
6
|
+
contextId: string;
|
|
7
|
+
executionId: string;
|
|
6
8
|
datasetId: string;
|
|
7
9
|
fileId: string;
|
|
10
|
+
sourceEventId?: string;
|
|
11
|
+
sourcePartIndex?: number;
|
|
12
|
+
filename?: string;
|
|
13
|
+
mediaType?: string;
|
|
8
14
|
state: SandboxState;
|
|
9
15
|
}): Promise<{
|
|
10
16
|
filePath: string;
|
|
@@ -1,42 +1,39 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import {
|
|
1
|
+
import { DATASET_OUTPUT_FILE_NAME } from "../datasetFiles.js";
|
|
2
|
+
import { prepareContextExecutionWorkspaceStep } from "../contextWorkspace.js";
|
|
3
3
|
import { buildFileDatasetPrompt } from "./prompts.js";
|
|
4
4
|
import { generateFilePreview } from "./filepreview.js";
|
|
5
|
-
import { readInstantFileStep } from "./steps.js";
|
|
6
5
|
export async function initializeFileParseSandboxStep(params) {
|
|
7
6
|
"use step";
|
|
8
7
|
if (params.state.initialized) {
|
|
9
8
|
return { filePath: params.state.filePath, state: params.state };
|
|
10
9
|
}
|
|
11
|
-
console.log(`[FileParseContext ${params.datasetId}] Preparing
|
|
12
|
-
|
|
13
|
-
const file = await readInstantFileStep({ runtime: params.runtime, fileId: params.fileId });
|
|
14
|
-
console.log(`[FileParseContext ${params.datasetId}] Creating dataset workstation...`);
|
|
15
|
-
const workstation = getDatasetWorkstation(params.datasetId);
|
|
16
|
-
await runDatasetSandboxCommandStep({
|
|
17
|
-
runtime: params.runtime,
|
|
18
|
-
sandboxId: params.sandboxId,
|
|
19
|
-
cmd: "mkdir",
|
|
20
|
-
args: ["-p", ...getDatasetStandardDirs(params.datasetId)],
|
|
21
|
-
});
|
|
22
|
-
const fileName = file.contentDisposition ?? "";
|
|
23
|
-
const fileExtension = fileName.includes(".") ? fileName.substring(fileName.lastIndexOf(".")) : "";
|
|
24
|
-
const sandboxFilePath = `${getDatasetSourcesDir(params.datasetId)}/${params.fileId}${fileExtension}`;
|
|
25
|
-
await writeDatasetSandboxFilesStep({
|
|
10
|
+
console.log(`[FileParseContext ${params.datasetId}] Preparing context execution workspace...`);
|
|
11
|
+
const workspace = await prepareContextExecutionWorkspaceStep({
|
|
26
12
|
runtime: params.runtime,
|
|
27
13
|
sandboxId: params.sandboxId,
|
|
14
|
+
contextId: params.contextId,
|
|
15
|
+
executionId: params.executionId,
|
|
28
16
|
files: [
|
|
29
17
|
{
|
|
30
|
-
|
|
31
|
-
|
|
18
|
+
fileId: params.fileId,
|
|
19
|
+
filename: params.filename,
|
|
20
|
+
mediaType: params.mediaType,
|
|
21
|
+
sourceEventId: params.sourceEventId,
|
|
22
|
+
sourcePartIndex: params.sourcePartIndex,
|
|
32
23
|
},
|
|
33
24
|
],
|
|
34
25
|
});
|
|
35
|
-
|
|
26
|
+
const sandboxFilePath = workspace.files[0]?.path ?? "";
|
|
27
|
+
if (!sandboxFilePath)
|
|
28
|
+
throw new Error("dataset_workspace_file_missing");
|
|
29
|
+
console.log(`[FileParseContext ${params.datasetId}] Context workspace created: ${workspace.root}`);
|
|
36
30
|
console.log(`[FileParseContext ${params.datasetId}] File saved: ${sandboxFilePath}`);
|
|
37
31
|
const state = {
|
|
38
32
|
initialized: true,
|
|
39
33
|
filePath: sandboxFilePath,
|
|
34
|
+
outputPath: `${workspace.outputDir}/${DATASET_OUTPUT_FILE_NAME}`,
|
|
35
|
+
scriptsDir: workspace.scriptsDir,
|
|
36
|
+
manifestPath: workspace.manifestPath,
|
|
40
37
|
};
|
|
41
38
|
return { filePath: sandboxFilePath, state };
|
|
42
39
|
}
|
|
@@ -3,6 +3,9 @@ import type { FilePreviewContext } from "./filepreview.types.js";
|
|
|
3
3
|
export type SandboxState = {
|
|
4
4
|
initialized: boolean;
|
|
5
5
|
filePath: string;
|
|
6
|
+
outputPath?: string;
|
|
7
|
+
scriptsDir?: string;
|
|
8
|
+
manifestPath?: string;
|
|
6
9
|
};
|
|
7
10
|
export type FileParseContext = {
|
|
8
11
|
datasetId: string;
|
|
@@ -10,6 +13,9 @@ export type FileParseContext = {
|
|
|
10
13
|
instructions: string;
|
|
11
14
|
sandboxConfig: {
|
|
12
15
|
filePath: string;
|
|
16
|
+
outputPath?: string;
|
|
17
|
+
scriptsDir?: string;
|
|
18
|
+
manifestPath?: string;
|
|
13
19
|
};
|
|
14
20
|
analysis: any[];
|
|
15
21
|
schema: any | null;
|
|
@@ -29,6 +35,10 @@ export type FileParseContextParams = {
|
|
|
29
35
|
sandboxState?: SandboxState;
|
|
30
36
|
filePreview?: FilePreviewContext;
|
|
31
37
|
schema?: any | null;
|
|
38
|
+
sourceEventId?: string;
|
|
39
|
+
sourcePartIndex?: number;
|
|
40
|
+
filename?: string;
|
|
41
|
+
mediaType?: string;
|
|
32
42
|
};
|
|
33
43
|
export type FileParseRunOptions = {
|
|
34
44
|
prompt?: string;
|
package/dist/file/prompts.js
CHANGED
|
@@ -11,13 +11,13 @@ function buildRole() {
|
|
|
11
11
|
function buildGoal() {
|
|
12
12
|
let xml = create()
|
|
13
13
|
.ele("Goal")
|
|
14
|
-
.txt("Convert the
|
|
14
|
+
.txt("Convert the input file into a validated JSONL dataset (output.jsonl) where each line is a JSON object conforming to a generated schema. The schema describes ONE data record structure. Extract ONLY data records; exclude any header sections, metadata, or summary information from the file.")
|
|
15
15
|
.up();
|
|
16
16
|
return xml.end({ prettyPrint: true, headless: true });
|
|
17
17
|
}
|
|
18
|
-
function
|
|
18
|
+
function buildResourceInfo(context) {
|
|
19
19
|
let xml = create()
|
|
20
|
-
.ele("
|
|
20
|
+
.ele("FileResource")
|
|
21
21
|
.ele("Type").txt("file").up()
|
|
22
22
|
.ele("FileId").txt(context.fileId).up()
|
|
23
23
|
.ele("DatasetId").txt(context.datasetId).up()
|
|
@@ -90,7 +90,7 @@ function buildErrorsSection(errors) {
|
|
|
90
90
|
}
|
|
91
91
|
let xml = create()
|
|
92
92
|
.ele("PreviousErrors")
|
|
93
|
-
.ele("Instruction").txt("Treat these as repair feedback from the previous validation attempt. Rewrite output.jsonl from the schema contract; do not patch
|
|
93
|
+
.ele("Instruction").txt("Treat these as repair feedback from the previous validation attempt. Rewrite output.jsonl from the schema contract; do not patch input column names into schema keys piecemeal.").up();
|
|
94
94
|
for (const error of errors) {
|
|
95
95
|
xml = xml.ele("Error").txt(error).up();
|
|
96
96
|
}
|
|
@@ -100,8 +100,8 @@ function buildErrorsSection(errors) {
|
|
|
100
100
|
function buildContextSection(context) {
|
|
101
101
|
let xml = create()
|
|
102
102
|
.ele("Context");
|
|
103
|
-
const
|
|
104
|
-
xml = xml.import(
|
|
103
|
+
const resourceXml = buildResourceInfo(context);
|
|
104
|
+
xml = xml.import(resourceXml.first());
|
|
105
105
|
if (context.filePreview) {
|
|
106
106
|
const previewXml = buildFilePreviewSection(context.filePreview);
|
|
107
107
|
xml = xml.import(previewXml.first());
|
|
@@ -195,9 +195,9 @@ function buildSchemaSection(context) {
|
|
|
195
195
|
xml = xml
|
|
196
196
|
.ele("SchemaContract")
|
|
197
197
|
.ele("Purpose").txt("Compact output contract derived from JSON Schema. Use this before writing output.jsonl.").up()
|
|
198
|
-
.ele("Rule").txt("Use only schema property keys in data objects.
|
|
198
|
+
.ele("Rule").txt("Use only schema property keys in data objects. Input headers are input labels, not output keys.").up()
|
|
199
199
|
.ele("Rule").txt("Required paths are required everywhere, including nested objects and array items.").up()
|
|
200
|
-
.ele("Rule").txt("Enum fields must use exactly one of the listed literal values. Normalize
|
|
200
|
+
.ele("Rule").txt("Enum fields must use exactly one of the listed literal values. Normalize input labels to the closest valid enum literal; never emit a value outside the enum.").up();
|
|
201
201
|
xml = appendLimitedList(xml, "RequiredPaths", "Path", contract.requiredPaths, 120);
|
|
202
202
|
xml = appendLimitedList(xml, "PropertyPaths", "Path", contract.propertyPaths, 160);
|
|
203
203
|
let enumsXml = xml.ele("EnumConstraints");
|
|
@@ -223,8 +223,10 @@ function buildSchemaSection(context) {
|
|
|
223
223
|
return xml.end({ prettyPrint: true, headless: true });
|
|
224
224
|
}
|
|
225
225
|
function buildInstructions(context) {
|
|
226
|
-
const datasetWorkstation =
|
|
227
|
-
|
|
226
|
+
const datasetWorkstation = context.sandboxConfig.scriptsDir
|
|
227
|
+
? context.sandboxConfig.scriptsDir.replace(/\/scripts$/, "")
|
|
228
|
+
: getDatasetWorkstation(context.datasetId);
|
|
229
|
+
const outputPath = context.sandboxConfig.outputPath ?? getDatasetOutputPath(context.datasetId);
|
|
228
230
|
const hasProvidedSchema = Boolean(context.schema?.schema);
|
|
229
231
|
const currentTask = hasProvidedSchema
|
|
230
232
|
? "Review FilePreview section, use the provided schema as the output contract, then parse the file and generate the dataset"
|
|
@@ -243,10 +245,10 @@ function buildInstructions(context) {
|
|
|
243
245
|
.ele("Requirements")
|
|
244
246
|
.ele("Requirement").txt("Every output row must conform exactly to the provided schema").up()
|
|
245
247
|
.ele("Requirement").txt("Every data object MUST use the exact property names from the provided JSON Schema required/properties keys").up()
|
|
246
|
-
.ele("Requirement").txt("Build a schema-first mapping from
|
|
248
|
+
.ele("Requirement").txt("Build a schema-first mapping from input columns to schema fields before writing output.jsonl. Do not use raw input headers as JSON keys unless they are exactly schema keys").up()
|
|
247
249
|
.ele("Requirement").txt("For nested required fields, populate the required child keys inside each nested object or array item; top-level validity is not enough").up()
|
|
248
250
|
.ele("Requirement").txt("For enum fields, emit exactly one allowed enum literal from SchemaContract; normalize labels or abbreviations into allowed literals").up()
|
|
249
|
-
.ele("Requirement").txt("Do not translate, localize, rename, camelize differently, or infer alternative field names. Field names are a technical contract; only field values may preserve the
|
|
251
|
+
.ele("Requirement").txt("Do not translate, localize, rename, camelize differently, or infer alternative field names. Field names are a technical contract; only field values may preserve the input language").up()
|
|
250
252
|
.ele("Requirement").txt("Do not call generateSchema when a schema is already provided").up()
|
|
251
253
|
.up()
|
|
252
254
|
.up();
|
|
@@ -284,8 +286,8 @@ function buildInstructions(context) {
|
|
|
284
286
|
.up()
|
|
285
287
|
.ele("Rules")
|
|
286
288
|
.ele("Rule").txt("Schema defines ONE DATA RECORD structure (not array, not header)").up()
|
|
287
|
-
.ele("Rule").txt("Schema property names are authoritative. Never translate or rename keys such as itemName, quantity, or unit into the
|
|
288
|
-
.ele("Rule").txt("Original/
|
|
289
|
+
.ele("Rule").txt("Schema property names are authoritative. Never translate or rename keys such as itemName, quantity, or unit into the input language").up()
|
|
290
|
+
.ele("Rule").txt("Original/input language applies to extracted values only, not to JSON object keys").up()
|
|
289
291
|
.ele("Rule").txt("Datasets contain ONLY data records; exclude all header sections and file metadata").up()
|
|
290
292
|
.ele("Rule").txt("JSONL format: each line = separate JSON object representing one data record").up()
|
|
291
293
|
.ele("Rule").txt("FilePreview shows raw file content - use Script to understand data extraction").up()
|
package/dist/index.d.ts
CHANGED