@ekairos/dataset 1.22.83-beta.development.0 → 1.22.85-beta.development.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/builder/agentMaterializers.d.ts +2 -2
- package/dist/builder/context.d.ts +7 -0
- package/dist/builder/context.js +192 -0
- package/dist/builder/instructions.d.ts +3 -3
- package/dist/builder/instructions.js +10 -10
- package/dist/builder/materialize.d.ts +10 -11
- package/dist/builder/materialize.js +116 -113
- package/dist/builder/materializeQuery.d.ts +3 -2
- package/dist/builder/materializeQuery.js +10 -19
- package/dist/builder/persistence.d.ts +4 -5
- package/dist/builder/persistence.js +20 -19
- package/dist/builder/types.d.ts +29 -24
- package/dist/completeDataset.steps.js +1 -1
- package/dist/dataset.d.ts +1 -1
- package/dist/dataset.js +42 -29
- package/dist/datasetFiles.d.ts +1 -1
- package/dist/datasetFiles.js +3 -3
- package/dist/file/file-dataset.agent.js +3 -4
- package/dist/file/prompts.js +12 -12
- package/dist/materializeDataset.tool.d.ts +34 -26
- package/dist/materializeDataset.tool.js +40 -29
- package/dist/schema.d.ts +12 -2
- package/dist/schema.js +6 -3
- package/dist/service.d.ts +1 -2
- package/dist/service.js +5 -2
- package/dist/transform/filepreview.d.ts +2 -2
- package/dist/transform/filepreview.js +3 -3
- package/dist/transform/prompts.js +25 -25
- package/dist/transform/transform-dataset.agent.d.ts +4 -4
- package/dist/transform/transform-dataset.agent.js +29 -30
- package/dist/transform/transform-dataset.steps.d.ts +7 -7
- package/dist/transform/transform-dataset.steps.js +20 -20
- package/dist/transform/transform-dataset.types.d.ts +13 -13
- package/dist/transform/transformDataset.js +4 -4
- package/package.json +4 -4
- /package/dist/builder/{sourceRows.d.ts → rows.d.ts} +0 -0
- /package/dist/builder/{sourceRows.js → rows.js} +0 -0
package/dist/schema.d.ts
CHANGED
|
@@ -8,8 +8,6 @@ declare const entities: {
|
|
|
8
8
|
updatedAt: import("@instantdb/core").DataAttrDef<number, false, false, false>;
|
|
9
9
|
organizationId: import("@instantdb/core").DataAttrDef<string, false, true, false>;
|
|
10
10
|
title: import("@instantdb/core").DataAttrDef<string, false, false, false>;
|
|
11
|
-
sources: import("@instantdb/core").DataAttrDef<any, false, false, false>;
|
|
12
|
-
sourceKinds: import("@instantdb/core").DataAttrDef<any, false, false, false>;
|
|
13
11
|
instructions: import("@instantdb/core").DataAttrDef<string, false, false, false>;
|
|
14
12
|
analysis: import("@instantdb/core").DataAttrDef<any, false, false, false>;
|
|
15
13
|
schema: import("@instantdb/core").DataAttrDef<any, false, false, false>;
|
|
@@ -47,6 +45,18 @@ declare const links: {
|
|
|
47
45
|
readonly label: "datasets";
|
|
48
46
|
};
|
|
49
47
|
};
|
|
48
|
+
readonly dataset_datasetsContext: {
|
|
49
|
+
readonly forward: {
|
|
50
|
+
readonly on: "dataset_datasets";
|
|
51
|
+
readonly has: "one";
|
|
52
|
+
readonly label: "context";
|
|
53
|
+
};
|
|
54
|
+
readonly reverse: {
|
|
55
|
+
readonly on: "event_contexts";
|
|
56
|
+
readonly has: "many";
|
|
57
|
+
readonly label: "datasets";
|
|
58
|
+
};
|
|
59
|
+
};
|
|
50
60
|
};
|
|
51
61
|
declare const rooms: {};
|
|
52
62
|
export declare const datasetDomain: DomainSchemaResult<typeof entities, typeof links, typeof rooms, {}, "dataset", "dataset">;
|
package/dist/schema.js
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { i } from "@instantdb/core";
|
|
2
2
|
import { domain } from "@ekairos/domain";
|
|
3
|
+
import { eventsDomain } from "@ekairos/events";
|
|
3
4
|
const entities = {
|
|
4
5
|
dataset_datasets: i.entity({
|
|
5
6
|
datasetId: i.string().unique().indexed(),
|
|
@@ -9,8 +10,6 @@ const entities = {
|
|
|
9
10
|
updatedAt: i.number().optional(),
|
|
10
11
|
organizationId: i.string().optional().indexed(),
|
|
11
12
|
title: i.string().optional(),
|
|
12
|
-
sources: i.json().optional(),
|
|
13
|
-
sourceKinds: i.json().optional(),
|
|
14
13
|
instructions: i.string().optional(),
|
|
15
14
|
analysis: i.json().optional(),
|
|
16
15
|
schema: i.json().optional(),
|
|
@@ -32,9 +31,13 @@ const links = {
|
|
|
32
31
|
forward: { on: "dataset_datasets", has: "one", label: "dataFile" },
|
|
33
32
|
reverse: { on: "$files", has: "many", label: "datasets" },
|
|
34
33
|
},
|
|
34
|
+
dataset_datasetsContext: {
|
|
35
|
+
forward: { on: "dataset_datasets", has: "one", label: "context" },
|
|
36
|
+
reverse: { on: "event_contexts", has: "many", label: "datasets" },
|
|
37
|
+
},
|
|
35
38
|
};
|
|
36
39
|
const rooms = {};
|
|
37
|
-
export const datasetDomain = domain("dataset").withSchema({
|
|
40
|
+
export const datasetDomain = domain("dataset").includes(eventsDomain).withSchema({
|
|
38
41
|
entities,
|
|
39
42
|
links,
|
|
40
43
|
rooms,
|
package/dist/service.d.ts
CHANGED
package/dist/service.js
CHANGED
|
@@ -28,18 +28,21 @@ export class DatasetService {
|
|
|
28
28
|
async createDataset(params) {
|
|
29
29
|
try {
|
|
30
30
|
const datasetId = params.id ?? createDatasetId();
|
|
31
|
+
const { id: _id, contextId, ...attrs } = params;
|
|
31
32
|
const existing = await this.resolveDatasetEntityId(datasetId);
|
|
32
33
|
const entityId = existing.ok ? existing.data : createDatasetId();
|
|
33
34
|
const mutations = [];
|
|
34
35
|
mutations.push(this.db.tx.dataset_datasets[entityId].update({
|
|
35
36
|
datasetId,
|
|
36
|
-
sources: params.sources ?? "",
|
|
37
37
|
instructions: params.instructions ?? "",
|
|
38
38
|
status: params.status ?? "created",
|
|
39
39
|
createdAt: Date.now(),
|
|
40
40
|
updatedAt: Date.now(),
|
|
41
|
-
...
|
|
41
|
+
...attrs,
|
|
42
42
|
}));
|
|
43
|
+
if (contextId) {
|
|
44
|
+
mutations.push(this.db.tx.dataset_datasets[entityId].link({ context: contextId }));
|
|
45
|
+
}
|
|
43
46
|
await this.db.transact(mutations);
|
|
44
47
|
return { ok: true, data: { datasetId } };
|
|
45
48
|
}
|
|
package/dist/transform/filepreview.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
export type TransformSourcePreviewContext = {
|
|
1
|
+
export type TransformInputPreviewContext = {
|
|
2
2
|
totalRows: number;
|
|
3
3
|
metadata?: {
|
|
4
4
|
description: string;
|
|
@@ -18,5 +18,5 @@ export type TransformSourcePreviewContext = {
|
|
|
18
18
|
interface PreviewOptions {
|
|
19
19
|
headLines?: number;
|
|
20
20
|
}
|
|
21
|
-
export declare function
|
|
21
|
+
export declare function generateInputPreview(runtime: any, sandboxId: string, inputPath: string, datasetId: string, options?: PreviewOptions): Promise<TransformInputPreviewContext>;
|
|
22
22
|
export {};
|
|
package/dist/transform/filepreview.js
CHANGED
|
@@ -17,7 +17,7 @@ async function runPythonSnippet(runtime, sandboxId, datasetId, scriptName, code,
|
|
|
17
17
|
stderr,
|
|
18
18
|
};
|
|
19
19
|
}
|
|
20
|
-
export async function
|
|
20
|
+
export async function generateInputPreview(runtime, sandboxId, inputPath, datasetId, options = {}) {
|
|
21
21
|
const context = {
|
|
22
22
|
totalRows: 0,
|
|
23
23
|
};
|
|
@@ -41,7 +41,7 @@ try:
|
|
|
41
41
|
except Exception as e:
|
|
42
42
|
print(str(e))
|
|
43
43
|
`;
|
|
44
|
-
const meta = await runPythonSnippet(runtime, sandboxId, datasetId, "jsonl_count", countScript, [
|
|
44
|
+
const meta = await runPythonSnippet(runtime, sandboxId, datasetId, "jsonl_count", countScript, [inputPath], "Counts number of JSONL records with type='row'");
|
|
45
45
|
context.metadata = meta;
|
|
46
46
|
try {
|
|
47
47
|
if (meta.stdout) {
|
|
@@ -76,7 +76,7 @@ try:
|
|
|
76
76
|
except Exception as e:
|
|
77
77
|
print(str(e))
|
|
78
78
|
`;
|
|
79
|
-
const head = await runPythonSnippet(runtime, sandboxId, datasetId, "jsonl_head", headScript, [
|
|
79
|
+
const head = await runPythonSnippet(runtime, sandboxId, datasetId, "jsonl_head", headScript, [inputPath, String(headLines)], `Reads the first ${headLines} JSONL row records`);
|
|
80
80
|
context.head = head;
|
|
81
81
|
return context;
|
|
82
82
|
}
|
|
package/dist/transform/prompts.js
CHANGED
|
@@ -9,7 +9,7 @@ function buildRole() {
|
|
|
9
9
|
function buildGoal() {
|
|
10
10
|
let xml = create()
|
|
11
11
|
.ele("Goal")
|
|
12
|
-
.txt("Transform the
|
|
12
|
+
.txt("Transform the input dataset(s) (JSONL with {type:'row', data:{...}} per line) into a new dataset strictly matching the output schema. Save to output.jsonl in the dataset workstation. Each line must remain a single JSON object representing one record. You may need to combine, filter, or reshape data from multiple input datasets.")
|
|
13
13
|
.up();
|
|
14
14
|
return xml.end({ prettyPrint: true, headless: true });
|
|
15
15
|
}
|
|
@@ -17,26 +17,26 @@ function buildContextSection(context) {
|
|
|
17
17
|
let xml = create()
|
|
18
18
|
.ele("Context")
|
|
19
19
|
.ele("DatasetId").txt(context.datasetId).up();
|
|
20
|
-
let
|
|
21
|
-
for (const sourceId of context.
|
|
22
|
-
|
|
20
|
+
let inputsXml = create().ele("InputDatasets");
|
|
21
|
+
for (const sourceId of context.inputDatasetIds) {
|
|
22
|
+
inputsXml = inputsXml.ele("InputDatasetId").txt(sourceId).up();
|
|
23
23
|
}
|
|
24
|
-
xml = xml.import(
|
|
24
|
+
xml = xml.import(inputsXml.first());
|
|
25
25
|
let sandboxXml = create().ele("Sandbox");
|
|
26
|
-
for (const
|
|
27
|
-
sandboxXml = sandboxXml.ele("
|
|
28
|
-
.ele("DatasetId").txt(
|
|
29
|
-
.ele("Path").txt(
|
|
26
|
+
for (const inputPathInfo of context.sandboxConfig.inputPaths) {
|
|
27
|
+
sandboxXml = sandboxXml.ele("InputFile")
|
|
28
|
+
.ele("DatasetId").txt(inputPathInfo.datasetId).up()
|
|
29
|
+
.ele("Path").txt(inputPathInfo.path).up()
|
|
30
30
|
.up();
|
|
31
31
|
}
|
|
32
32
|
sandboxXml = sandboxXml.ele("OutputPath").txt(context.sandboxConfig.outputPath).up();
|
|
33
33
|
xml = xml.import(sandboxXml.first());
|
|
34
|
-
if (context.
|
|
35
|
-
let previewsXml = create().ele("
|
|
36
|
-
for (const
|
|
37
|
-
const sp =
|
|
38
|
-
let px = create().ele("
|
|
39
|
-
.ele("DatasetId").txt(
|
|
34
|
+
if (context.inputPreviews && context.inputPreviews.length > 0) {
|
|
35
|
+
let previewsXml = create().ele("InputPreviews");
|
|
36
|
+
for (const inputPreviewInfo of context.inputPreviews) {
|
|
37
|
+
const sp = inputPreviewInfo.preview;
|
|
38
|
+
let px = create().ele("InputPreview")
|
|
39
|
+
.ele("DatasetId").txt(inputPreviewInfo.datasetId).up()
|
|
40
40
|
.ele("TotalRows").txt(String(sp.totalRows)).up();
|
|
41
41
|
if (sp.metadata) {
|
|
42
42
|
const m = sp.metadata;
|
|
@@ -86,21 +86,21 @@ function buildOutputSchemaSection(context) {
|
|
|
86
86
|
}
|
|
87
87
|
function buildInstructions(context) {
|
|
88
88
|
const outputPath = context.sandboxConfig.outputPath;
|
|
89
|
-
const
|
|
90
|
-
? "You have multiple
|
|
89
|
+
const multipleInputsNote = context.inputDatasetIds.length > 1
|
|
90
|
+
? "You have multiple input datasets available. You may need to read, join, filter, or combine data from them to produce the output."
|
|
91
91
|
: "";
|
|
92
92
|
let xml = create()
|
|
93
93
|
.ele("Instructions")
|
|
94
94
|
.ele("Workflow")
|
|
95
|
-
.ele("Step", { number: "1", name: "Inspect
|
|
96
|
-
.ele("Action").txt(`Review
|
|
95
|
+
.ele("Step", { number: "1", name: "Inspect Inputs" })
|
|
96
|
+
.ele("Action").txt(`Review InputPreviews to understand current record structures (data fields, shapes, edge cases). ${multipleInputsNote}`).up()
|
|
97
97
|
.up()
|
|
98
98
|
.ele("Step", { number: "2", name: "Plan Mapping" })
|
|
99
|
-
.ele("Action").txt("Plan a deterministic mapping from
|
|
100
|
-
.ele("Note").txt("If fields are missing, set defaults; if types differ, coerce consistently. When working with multiple
|
|
99
|
+
.ele("Action").txt("Plan a deterministic mapping from input data fields to the output schema fields (normalize names, types, and formats).").up()
|
|
100
|
+
.ele("Note").txt("If fields are missing, set defaults; if types differ, coerce consistently. When working with multiple inputs, decide how to combine or relate them. Output field names must remain exactly as declared by the output schema.").up()
|
|
101
101
|
.up()
|
|
102
102
|
.ele("Step", { number: "3", name: "Transform" })
|
|
103
|
-
.ele("Action").txt("Use executeCommand to run a Python script that reads
|
|
103
|
+
.ele("Action").txt("Use executeCommand to run a Python script that reads input JSONL file(s) and writes transformed records to output.jsonl. Keep line-per-record JSON objects with { 'type': 'row', 'data': { ... } }.").up()
|
|
104
104
|
.ele("Requirement").txt(`Write file to: ${outputPath}`).up()
|
|
105
105
|
.ele("Requirement").txt("Every data object MUST use the exact property names from OutputSchema required/properties keys. Do not translate, localize, rename, or infer alternative field names.").up()
|
|
106
106
|
.ele("Requirement").txt("Do not print large data to stdout; only progress and summaries.").up()
|
|
@@ -112,12 +112,12 @@ function buildInstructions(context) {
|
|
|
112
112
|
.up()
|
|
113
113
|
.ele("Rules")
|
|
114
114
|
.ele("Rule").txt("Output must strictly match the output schema for each record in data.").up()
|
|
115
|
-
.ele("Rule").txt("OutputSchema property names are authoritative. Field names are a technical contract; only field values may preserve
|
|
115
|
+
.ele("Rule").txt("OutputSchema property names are authoritative. Field names are a technical contract; only field values may preserve input language.").up()
|
|
116
116
|
.ele("Rule").txt("Each line in output.jsonl must be a standalone JSON object with {type:'row', data:{...}}.").up()
|
|
117
117
|
.ele("Rule").txt("Do not include headers, summaries, or metadata as records.").up()
|
|
118
|
-
.ele("Rule").txt("Be robust to malformed lines in
|
|
118
|
+
.ele("Rule").txt("Be robust to malformed lines in input: skip or sanitize, but do not crash.").up()
|
|
119
119
|
.up()
|
|
120
|
-
.ele("CurrentTask").txt("Transform
|
|
120
|
+
.ele("CurrentTask").txt("Transform input dataset(s) to match OutputSchema and write output.jsonl, then complete.").up()
|
|
121
121
|
.up();
|
|
122
122
|
return xml.end({ prettyPrint: true, headless: true });
|
|
123
123
|
}
|
|
package/dist/transform/transform-dataset.agent.d.ts
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import { type ContextReactor } from "@ekairos/events";
|
|
2
|
-
import type { TransformDatasetRunOptions, TransformSandboxState,
|
|
2
|
+
import type { TransformDatasetRunOptions, TransformSandboxState, TransformInputPreviewContext } from "./transform-dataset.types.js";
|
|
3
3
|
export type { TransformDatasetAgentParams, TransformDatasetContext, TransformDatasetResult, TransformDatasetRunOptions, TransformPromptContext, TransformSandboxState, } from "./transform-dataset.types.js";
|
|
4
4
|
export declare function createTransformDatasetContext<Env extends {
|
|
5
5
|
orgId: string;
|
|
6
6
|
}>(params: {
|
|
7
|
-
|
|
7
|
+
inputDatasetIds: string[];
|
|
8
8
|
outputSchema: any;
|
|
9
9
|
instructions?: string;
|
|
10
10
|
datasetId?: string;
|
|
@@ -12,9 +12,9 @@ export declare function createTransformDatasetContext<Env extends {
|
|
|
12
12
|
sandboxId?: string;
|
|
13
13
|
reactor?: ContextReactor<any, any>;
|
|
14
14
|
sandboxState?: TransformSandboxState;
|
|
15
|
-
|
|
15
|
+
inputPreviews?: Array<{
|
|
16
16
|
datasetId: string;
|
|
17
|
-
preview:
|
|
17
|
+
preview: TransformInputPreviewContext;
|
|
18
18
|
}>;
|
|
19
19
|
}): {
|
|
20
20
|
datasetId: string;
|
|
package/dist/transform/transform-dataset.agent.js
CHANGED
|
@@ -4,7 +4,7 @@ import { createCompleteDatasetTool, didCompleteDatasetSucceed, getDatasetFatalFa
|
|
|
4
4
|
import { datasetUpdateSchemaStep } from "../dataset/steps.js";
|
|
5
5
|
import { getDatasetOutputPath } from "../datasetFiles.js";
|
|
6
6
|
import { createExecuteCommandTool } from "../executeCommand.tool.js";
|
|
7
|
-
import { buildTransformDatasetPromptStep,
|
|
7
|
+
import { buildTransformDatasetPromptStep, ensureTransformInputsInSandboxStep, generateTransformInputPreviewsStep, } from "./transform-dataset.steps.js";
|
|
8
8
|
import { createDatasetId } from "../id.js";
|
|
9
9
|
async function awaitContextRun(run) {
|
|
10
10
|
if (!run)
|
|
@@ -22,12 +22,12 @@ function createTransformDatasetContextDefinition(params) {
|
|
|
22
22
|
.context(async (stored, _env, runtime) => {
|
|
23
23
|
const previous = stored?.content ?? {};
|
|
24
24
|
const sandboxState = previous?.sandboxState ??
|
|
25
|
-
params.sandboxState ?? { initialized: false,
|
|
25
|
+
params.sandboxState ?? { initialized: false, inputPaths: [] };
|
|
26
26
|
const datasetId = previous?.datasetId ?? fallbackDatasetId ?? "";
|
|
27
|
-
const
|
|
28
|
-
? previous.
|
|
29
|
-
: Array.isArray(params.
|
|
30
|
-
? params.
|
|
27
|
+
const inputDatasetIds = Array.isArray(previous?.inputDatasetIds)
|
|
28
|
+
? previous.inputDatasetIds
|
|
29
|
+
: Array.isArray(params.inputDatasetIds)
|
|
30
|
+
? params.inputDatasetIds
|
|
31
31
|
: [];
|
|
32
32
|
const outputSchema = previous?.outputSchema ?? params.outputSchema;
|
|
33
33
|
const instructions = previous?.instructions ?? params.instructions;
|
|
@@ -35,8 +35,8 @@ function createTransformDatasetContextDefinition(params) {
|
|
|
35
35
|
if (!datasetId) {
|
|
36
36
|
throw new Error("dataset_id_required");
|
|
37
37
|
}
|
|
38
|
-
if (
|
|
39
|
-
throw new Error("
|
|
38
|
+
if (inputDatasetIds.length === 0) {
|
|
39
|
+
throw new Error("dataset_transform_inputs_required");
|
|
40
40
|
}
|
|
41
41
|
if (!outputSchema) {
|
|
42
42
|
throw new Error("dataset_transform_schema_required");
|
|
@@ -44,26 +44,26 @@ function createTransformDatasetContextDefinition(params) {
|
|
|
44
44
|
if (!sandboxId) {
|
|
45
45
|
throw new Error("dataset_sandbox_required");
|
|
46
46
|
}
|
|
47
|
-
const initialized = sandboxState.initialized && Array.isArray(sandboxState.
|
|
47
|
+
const initialized = sandboxState.initialized && Array.isArray(sandboxState.inputPaths)
|
|
48
48
|
? {
|
|
49
|
-
|
|
49
|
+
inputPaths: sandboxState.inputPaths,
|
|
50
50
|
outputPath: previous?.sandboxConfig?.outputPath ?? getDatasetOutputPath(datasetId),
|
|
51
51
|
state: sandboxState,
|
|
52
52
|
}
|
|
53
|
-
: await
|
|
53
|
+
: await ensureTransformInputsInSandboxStep({
|
|
54
54
|
runtime,
|
|
55
55
|
sandboxId,
|
|
56
56
|
datasetId,
|
|
57
|
-
|
|
57
|
+
inputDatasetIds,
|
|
58
58
|
state: sandboxState,
|
|
59
59
|
});
|
|
60
|
-
let
|
|
61
|
-
if (!
|
|
62
|
-
|
|
60
|
+
let inputPreviews = previous?.inputPreviews ?? params.inputPreviews ?? undefined;
|
|
61
|
+
if (!inputPreviews) {
|
|
62
|
+
inputPreviews = await generateTransformInputPreviewsStep({
|
|
63
63
|
runtime,
|
|
64
64
|
sandboxId,
|
|
65
65
|
datasetId,
|
|
66
|
-
|
|
66
|
+
inputPaths: initialized.inputPaths,
|
|
67
67
|
});
|
|
68
68
|
}
|
|
69
69
|
await datasetUpdateSchemaStep({
|
|
@@ -74,13 +74,13 @@ function createTransformDatasetContextDefinition(params) {
|
|
|
74
74
|
});
|
|
75
75
|
const promptContext = {
|
|
76
76
|
datasetId,
|
|
77
|
-
|
|
77
|
+
inputDatasetIds,
|
|
78
78
|
outputSchema,
|
|
79
79
|
sandboxConfig: {
|
|
80
|
-
|
|
80
|
+
inputPaths: initialized.inputPaths,
|
|
81
81
|
outputPath: initialized.outputPath,
|
|
82
82
|
},
|
|
83
|
-
|
|
83
|
+
inputPreviews: inputPreviews.length > 0 ? inputPreviews : undefined,
|
|
84
84
|
errors: [],
|
|
85
85
|
};
|
|
86
86
|
const basePrompt = await buildTransformDatasetPromptStep({
|
|
@@ -100,14 +100,14 @@ function createTransformDatasetContextDefinition(params) {
|
|
|
100
100
|
return {
|
|
101
101
|
...previous,
|
|
102
102
|
datasetId,
|
|
103
|
-
|
|
103
|
+
inputDatasetIds,
|
|
104
104
|
outputSchema,
|
|
105
105
|
instructions,
|
|
106
106
|
sandboxId,
|
|
107
107
|
sandboxState: initialized.state,
|
|
108
108
|
system,
|
|
109
109
|
sandboxConfig: {
|
|
110
|
-
|
|
110
|
+
inputPaths: initialized.inputPaths,
|
|
111
111
|
outputPath: initialized.outputPath,
|
|
112
112
|
},
|
|
113
113
|
};
|
|
@@ -159,7 +159,7 @@ function createTransformDatasetContextDefinition(params) {
|
|
|
159
159
|
export function createTransformDatasetContext(params) {
|
|
160
160
|
const datasetId = params.datasetId ?? createDatasetId();
|
|
161
161
|
const { context } = createTransformDatasetContextDefinition({
|
|
162
|
-
|
|
162
|
+
inputDatasetIds: params.inputDatasetIds,
|
|
163
163
|
outputSchema: params.outputSchema,
|
|
164
164
|
instructions: params.instructions,
|
|
165
165
|
datasetId,
|
|
@@ -167,14 +167,14 @@ export function createTransformDatasetContext(params) {
|
|
|
167
167
|
sandboxId: params.sandboxId,
|
|
168
168
|
reactor: params.reactor,
|
|
169
169
|
sandboxState: params.sandboxState,
|
|
170
|
-
|
|
170
|
+
inputPreviews: params.inputPreviews,
|
|
171
171
|
});
|
|
172
172
|
return {
|
|
173
173
|
datasetId,
|
|
174
174
|
async transform(runtime, options = {}) {
|
|
175
|
-
const datasetCountText = params.
|
|
176
|
-
? "the
|
|
177
|
-
: `${params.
|
|
175
|
+
const datasetCountText = params.inputDatasetIds.length === 1
|
|
176
|
+
? "the input dataset"
|
|
177
|
+
: `${params.inputDatasetIds.length} input datasets`;
|
|
178
178
|
const triggerEvent = {
|
|
179
179
|
id: createDatasetId(),
|
|
180
180
|
type: INPUT_TEXT_ITEM_TYPE,
|
|
@@ -195,7 +195,6 @@ export function createTransformDatasetContext(params) {
|
|
|
195
195
|
context: { key: `dataset:${datasetId}` },
|
|
196
196
|
durable: options.durable ?? false,
|
|
197
197
|
options: {
|
|
198
|
-
silent: true,
|
|
199
198
|
preventClose: true,
|
|
200
199
|
sendFinish: false,
|
|
201
200
|
maxIterations: 20,
|
|
@@ -204,12 +203,12 @@ export function createTransformDatasetContext(params) {
|
|
|
204
203
|
__initialContent: {
|
|
205
204
|
...(options.initialContent ?? {}),
|
|
206
205
|
datasetId,
|
|
207
|
-
|
|
206
|
+
inputDatasetIds: params.inputDatasetIds,
|
|
208
207
|
outputSchema: params.outputSchema,
|
|
209
208
|
instructions: params.instructions,
|
|
210
209
|
sandboxId: params.sandboxId ?? "",
|
|
211
|
-
sandboxState: params.sandboxState ?? { initialized: false,
|
|
212
|
-
|
|
210
|
+
sandboxState: params.sandboxState ?? { initialized: false, inputPaths: [] },
|
|
211
|
+
inputPreviews: params.inputPreviews,
|
|
213
212
|
},
|
|
214
213
|
});
|
|
215
214
|
await awaitContextRun(shell.run);
|
|
package/dist/transform/transform-dataset.steps.d.ts
CHANGED
|
@@ -1,29 +1,29 @@
|
|
|
1
|
-
import type { TransformPromptContext, TransformSandboxState,
|
|
2
|
-
export declare function ensureTransformSourcesInSandboxStep(params: {
|
|
1
|
+
import type { TransformPromptContext, TransformSandboxState, TransformInputPreviewContext } from "./transform-dataset.types.js";
|
|
2
|
+
export declare function ensureTransformInputsInSandboxStep(params: {
|
|
3
3
|
runtime: any;
|
|
4
4
|
sandboxId: string;
|
|
5
5
|
datasetId: string;
|
|
6
|
-
|
|
6
|
+
inputDatasetIds: string[];
|
|
7
7
|
state: TransformSandboxState;
|
|
8
8
|
}): Promise<{
|
|
9
|
-
|
|
9
|
+
inputPaths: Array<{
|
|
10
10
|
datasetId: string;
|
|
11
11
|
path: string;
|
|
12
12
|
}>;
|
|
13
13
|
outputPath: string;
|
|
14
14
|
state: TransformSandboxState;
|
|
15
15
|
}>;
|
|
16
|
-
export declare function
|
|
16
|
+
export declare function generateTransformInputPreviewsStep(params: {
|
|
17
17
|
runtime: any;
|
|
18
18
|
sandboxId: string;
|
|
19
19
|
datasetId: string;
|
|
20
|
-
|
|
20
|
+
inputPaths: Array<{
|
|
21
21
|
datasetId: string;
|
|
22
22
|
path: string;
|
|
23
23
|
}>;
|
|
24
24
|
}): Promise<Array<{
|
|
25
25
|
datasetId: string;
|
|
26
|
-
preview:
|
|
26
|
+
preview: TransformInputPreviewContext;
|
|
27
27
|
}>>;
|
|
28
28
|
export declare function buildTransformDatasetPromptStep(params: {
|
|
29
29
|
context: TransformPromptContext;
|
|
package/dist/transform/transform-dataset.steps.js
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
|
-
import { getDatasetOutputPath,
|
|
1
|
+
import { getDatasetOutputPath, getDatasetResourcesDir, getDatasetStandardDirs, } from "../datasetFiles.js";
|
|
2
2
|
import { datasetReadOutputJsonlStep } from "../dataset/steps.js";
|
|
3
3
|
import { runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep } from "../sandbox/steps.js";
|
|
4
|
-
import {
|
|
4
|
+
import { generateInputPreview } from "./filepreview.js";
|
|
5
5
|
import { buildTransformDatasetPrompt } from "./prompts.js";
|
|
6
|
-
export async function ensureTransformSourcesInSandboxStep(params) {
|
|
6
|
+
export async function ensureTransformInputsInSandboxStep(params) {
|
|
7
7
|
"use step";
|
|
8
8
|
if (params.state.initialized) {
|
|
9
9
|
return {
|
|
10
|
-
|
|
10
|
+
inputPaths: params.state.inputPaths,
|
|
11
11
|
outputPath: getDatasetOutputPath(params.datasetId),
|
|
12
12
|
state: params.state,
|
|
13
13
|
};
|
|
@@ -18,42 +18,42 @@ export async function ensureTransformSourcesInSandboxStep(params) {
|
|
|
18
18
|
cmd: "mkdir",
|
|
19
19
|
args: ["-p", ...getDatasetStandardDirs(params.datasetId)],
|
|
20
20
|
});
|
|
21
|
-
const
|
|
22
|
-
for (const
|
|
23
|
-
const
|
|
24
|
-
const
|
|
21
|
+
const inputPaths = [];
|
|
22
|
+
for (const inputDatasetId of params.inputDatasetIds) {
|
|
23
|
+
const inputPath = `${getDatasetResourcesDir(params.datasetId)}/resource_${inputDatasetId}.jsonl`;
|
|
24
|
+
const input = await datasetReadOutputJsonlStep({
|
|
25
25
|
runtime: params.runtime,
|
|
26
|
-
datasetId:
|
|
26
|
+
datasetId: inputDatasetId,
|
|
27
27
|
});
|
|
28
28
|
await writeDatasetSandboxFilesStep({
|
|
29
29
|
runtime: params.runtime,
|
|
30
30
|
sandboxId: params.sandboxId,
|
|
31
|
-
files: [{ path:
|
|
31
|
+
files: [{ path: inputPath, contentBase64: input.contentBase64 }],
|
|
32
32
|
});
|
|
33
|
-
|
|
33
|
+
inputPaths.push({ datasetId: inputDatasetId, path: inputPath });
|
|
34
34
|
}
|
|
35
35
|
return {
|
|
36
|
-
|
|
36
|
+
inputPaths,
|
|
37
37
|
outputPath: getDatasetOutputPath(params.datasetId),
|
|
38
38
|
state: {
|
|
39
39
|
initialized: true,
|
|
40
|
-
|
|
40
|
+
inputPaths,
|
|
41
41
|
},
|
|
42
42
|
};
|
|
43
43
|
}
|
|
44
|
-
export async function
|
|
44
|
+
export async function generateTransformInputPreviewsStep(params) {
|
|
45
45
|
"use step";
|
|
46
|
-
const
|
|
47
|
-
for (const
|
|
46
|
+
const inputPreviews = [];
|
|
47
|
+
for (const inputPath of params.inputPaths) {
|
|
48
48
|
try {
|
|
49
|
-
const preview = await
|
|
50
|
-
|
|
49
|
+
const preview = await generateInputPreview(params.runtime, params.sandboxId, inputPath.path, params.datasetId);
|
|
50
|
+
inputPreviews.push({ datasetId: inputPath.datasetId, preview });
|
|
51
51
|
}
|
|
52
52
|
catch {
|
|
53
|
-
//
|
|
53
|
+
// Input preview is optional; transformation can still read the JSONL files.
|
|
54
54
|
}
|
|
55
55
|
}
|
|
56
|
-
return
|
|
56
|
+
return inputPreviews;
|
|
57
57
|
}
|
|
58
58
|
export async function buildTransformDatasetPromptStep(params) {
|
|
59
59
|
"use step";
|
|
package/dist/transform/transform-dataset.types.d.ts
CHANGED
|
@@ -1,34 +1,34 @@
|
|
|
1
1
|
import type { ContextReactor } from "@ekairos/events";
|
|
2
|
-
import type {
|
|
3
|
-
export type {
|
|
2
|
+
import type { TransformInputPreviewContext } from "./filepreview.js";
|
|
3
|
+
export type { TransformInputPreviewContext } from "./filepreview.js";
|
|
4
4
|
export type TransformSandboxState = {
|
|
5
5
|
initialized: boolean;
|
|
6
|
-
|
|
6
|
+
inputPaths: Array<{
|
|
7
7
|
datasetId: string;
|
|
8
8
|
path: string;
|
|
9
9
|
}>;
|
|
10
10
|
};
|
|
11
11
|
export type TransformDatasetContext = {
|
|
12
12
|
datasetId: string;
|
|
13
|
-
|
|
13
|
+
inputDatasetIds: string[];
|
|
14
14
|
outputSchema: any;
|
|
15
15
|
sandboxConfig: {
|
|
16
|
-
|
|
16
|
+
inputPaths: Array<{
|
|
17
17
|
datasetId: string;
|
|
18
18
|
path: string;
|
|
19
19
|
}>;
|
|
20
20
|
outputPath: string;
|
|
21
21
|
};
|
|
22
|
-
|
|
22
|
+
inputPreviews?: Array<{
|
|
23
23
|
datasetId: string;
|
|
24
|
-
preview:
|
|
24
|
+
preview: TransformInputPreviewContext;
|
|
25
25
|
}>;
|
|
26
26
|
errors: string[];
|
|
27
27
|
iterationCount: number;
|
|
28
28
|
instructions?: string;
|
|
29
29
|
};
|
|
30
30
|
export type TransformDatasetAgentParams = {
|
|
31
|
-
|
|
31
|
+
inputDatasetIds?: string[];
|
|
32
32
|
outputSchema?: any;
|
|
33
33
|
instructions?: string;
|
|
34
34
|
datasetId?: string;
|
|
@@ -36,9 +36,9 @@ export type TransformDatasetAgentParams = {
|
|
|
36
36
|
sandboxId?: string;
|
|
37
37
|
reactor?: ContextReactor<any, any>;
|
|
38
38
|
sandboxState?: TransformSandboxState;
|
|
39
|
-
|
|
39
|
+
inputPreviews?: Array<{
|
|
40
40
|
datasetId: string;
|
|
41
|
-
preview:
|
|
41
|
+
preview: TransformInputPreviewContext;
|
|
42
42
|
}>;
|
|
43
43
|
};
|
|
44
44
|
export type TransformDatasetRunOptions = {
|
|
@@ -59,16 +59,16 @@ export type TransformDatasetResult = {
|
|
|
59
59
|
};
|
|
60
60
|
export type TransformPromptContext = {
|
|
61
61
|
datasetId: string;
|
|
62
|
-
|
|
62
|
+
inputDatasetIds: string[];
|
|
63
63
|
outputSchema: any;
|
|
64
64
|
sandboxConfig: {
|
|
65
|
-
|
|
65
|
+
inputPaths: Array<{
|
|
66
66
|
datasetId: string;
|
|
67
67
|
path: string;
|
|
68
68
|
}>;
|
|
69
69
|
outputPath: string;
|
|
70
70
|
};
|
|
71
|
-
|
|
71
|
+
inputPreviews?: Array<{
|
|
72
72
|
datasetId: string;
|
|
73
73
|
preview: {
|
|
74
74
|
totalRows: number;
|
|
package/dist/transform/transformDataset.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { datasetPreviewRowsStep } from "../dataset/steps.js";
|
|
2
2
|
import { createTransformDatasetContext } from "./transform-dataset.agent.js";
|
|
3
3
|
function buildInstructions(input) {
|
|
4
|
-
const
|
|
4
|
+
const inputs = input.datasets
|
|
5
5
|
.map((d, idx) => {
|
|
6
6
|
const name = d.description ? ` - ${d.description}` : "";
|
|
7
7
|
return `${idx + 1}. ${d.id}${name}`;
|
|
@@ -12,8 +12,8 @@ function buildInstructions(input) {
|
|
|
12
12
|
"Use pandas when helpful. Output must be JSONL with {type:'row', data:{...}} lines.",
|
|
13
13
|
"Respect the provided output schema exactly.",
|
|
14
14
|
"",
|
|
15
|
-
"##
|
|
16
|
-
|
|
15
|
+
"## Input Datasets",
|
|
16
|
+
inputs || "- (none)",
|
|
17
17
|
"",
|
|
18
18
|
"## Transformation Description (LaTeX + sets)",
|
|
19
19
|
String(input.description ?? "").trim(),
|
|
@@ -25,7 +25,7 @@ function buildInstructions(input) {
|
|
|
25
25
|
*/
|
|
26
26
|
export async function transformDataset(runtime, input) {
|
|
27
27
|
const transformContext = createTransformDatasetContext({
|
|
28
|
-
|
|
28
|
+
inputDatasetIds: input.datasets.map((d) => d.id),
|
|
29
29
|
outputSchema: input.outputSchema,
|
|
30
30
|
instructions: buildInstructions(input),
|
|
31
31
|
datasetId: input.datasetId,
|