@ekairos/dataset 1.22.82-beta.development.0 → 1.22.84-beta.development.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/builder/agentMaterializers.d.ts +2 -2
- package/dist/builder/context.d.ts +7 -0
- package/dist/builder/context.js +192 -0
- package/dist/builder/instructions.d.ts +3 -3
- package/dist/builder/instructions.js +10 -10
- package/dist/builder/materialize.d.ts +12 -11
- package/dist/builder/materialize.js +122 -121
- package/dist/builder/materializeQuery.d.ts +3 -2
- package/dist/builder/materializeQuery.js +10 -19
- package/dist/builder/persistence.d.ts +4 -5
- package/dist/builder/persistence.js +20 -19
- package/dist/builder/types.d.ts +31 -24
- package/dist/completeDataset.steps.d.ts +9 -8
- package/dist/completeDataset.steps.js +18 -11
- package/dist/completeDataset.tool.d.ts +9 -8
- package/dist/completeDataset.tool.js +2 -1
- package/dist/contextWorkspace.d.ts +72 -0
- package/dist/contextWorkspace.js +218 -0
- package/dist/dataset.d.ts +1 -1
- package/dist/dataset.js +42 -29
- package/dist/datasetFiles.d.ts +1 -1
- package/dist/datasetFiles.js +3 -3
- package/dist/executeCommand.tool.d.ts +1 -43
- package/dist/executeCommand.tool.js +10 -3
- package/dist/file/file-dataset.agent.d.ts +2 -0
- package/dist/file/file-dataset.agent.js +51 -16
- package/dist/file/file-dataset.steps.d.ts +6 -0
- package/dist/file/file-dataset.steps.js +18 -21
- package/dist/file/file-dataset.types.d.ts +10 -0
- package/dist/file/prompts.js +16 -14
- package/dist/index.d.ts +1 -0
- package/dist/index.js +1 -0
- package/dist/materializeDataset.tool.d.ts +34 -26
- package/dist/materializeDataset.tool.js +40 -29
- package/dist/schema.d.ts +12 -2
- package/dist/schema.js +6 -3
- package/dist/service.d.ts +2 -2
- package/dist/service.js +6 -3
- package/dist/transform/filepreview.d.ts +2 -2
- package/dist/transform/filepreview.js +3 -3
- package/dist/transform/prompts.js +25 -25
- package/dist/transform/transform-dataset.agent.d.ts +4 -4
- package/dist/transform/transform-dataset.agent.js +29 -30
- package/dist/transform/transform-dataset.steps.d.ts +7 -7
- package/dist/transform/transform-dataset.steps.js +20 -20
- package/dist/transform/transform-dataset.types.d.ts +13 -13
- package/dist/transform/transformDataset.js +4 -4
- package/package.json +4 -4
- /package/dist/builder/{sourceRows.d.ts → rows.d.ts} +0 -0
- /package/dist/builder/{sourceRows.js → rows.js} +0 -0
|
@@ -7,26 +7,30 @@ declare const materializeDatasetToolInputSchema: z.ZodObject<{
|
|
|
7
7
|
datasetId: z.ZodOptional<z.ZodString>;
|
|
8
8
|
sandboxId: z.ZodOptional<z.ZodString>;
|
|
9
9
|
title: z.ZodOptional<z.ZodString>;
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
fileId: z.ZodString;
|
|
13
|
-
description: z.ZodOptional<z.ZodString>;
|
|
10
|
+
context: z.ZodOptional<z.ZodUnion<readonly [z.ZodObject<{
|
|
11
|
+
id: z.ZodString;
|
|
14
12
|
}, z.core.$strip>, z.ZodObject<{
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
13
|
+
key: z.ZodString;
|
|
14
|
+
}, z.core.$strip>]>>;
|
|
15
|
+
files: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
16
|
+
description: z.ZodOptional<z.ZodString>;
|
|
17
|
+
fileId: z.ZodString;
|
|
18
|
+
}, z.core.$strip>>>;
|
|
19
|
+
texts: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
18
20
|
name: z.ZodOptional<z.ZodString>;
|
|
21
|
+
text: z.ZodString;
|
|
19
22
|
description: z.ZodOptional<z.ZodString>;
|
|
20
|
-
|
|
21
|
-
|
|
23
|
+
mimeType: z.ZodOptional<z.ZodString>;
|
|
24
|
+
}, z.core.$strip>>>;
|
|
25
|
+
datasets: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
22
26
|
datasetId: z.ZodString;
|
|
23
27
|
description: z.ZodOptional<z.ZodString>;
|
|
24
|
-
}, z.core.$strip
|
|
25
|
-
|
|
26
|
-
query: z.ZodRecord<z.ZodString, z.ZodAny>;
|
|
28
|
+
}, z.core.$strip>>>;
|
|
29
|
+
queries: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
27
30
|
title: z.ZodOptional<z.ZodString>;
|
|
31
|
+
query: z.ZodRecord<z.ZodString, z.ZodAny>;
|
|
28
32
|
explanation: z.ZodOptional<z.ZodString>;
|
|
29
|
-
}, z.core.$strip
|
|
33
|
+
}, z.core.$strip>>>;
|
|
30
34
|
instructions: z.ZodOptional<z.ZodString>;
|
|
31
35
|
mode: z.ZodOptional<z.ZodEnum<{
|
|
32
36
|
schema: "schema";
|
|
@@ -52,29 +56,33 @@ export declare function createMaterializeDatasetTool<Runtime extends AnyMaterial
|
|
|
52
56
|
queryDomain: QueryDomain & CompatibleToolQueryDomain<Runtime, QueryDomain>;
|
|
53
57
|
toolName?: string;
|
|
54
58
|
}): import("ai").Tool<{
|
|
55
|
-
|
|
56
|
-
|
|
59
|
+
datasetId?: string | undefined;
|
|
60
|
+
sandboxId?: string | undefined;
|
|
61
|
+
title?: string | undefined;
|
|
62
|
+
context?: {
|
|
63
|
+
id: string;
|
|
64
|
+
} | {
|
|
65
|
+
key: string;
|
|
66
|
+
} | undefined;
|
|
67
|
+
files?: {
|
|
57
68
|
fileId: string;
|
|
58
69
|
description?: string | undefined;
|
|
59
|
-
} |
|
|
60
|
-
|
|
70
|
+
}[] | undefined;
|
|
71
|
+
texts?: {
|
|
61
72
|
text: string;
|
|
62
|
-
mimeType?: string | undefined;
|
|
63
73
|
name?: string | undefined;
|
|
64
74
|
description?: string | undefined;
|
|
65
|
-
|
|
66
|
-
|
|
75
|
+
mimeType?: string | undefined;
|
|
76
|
+
}[] | undefined;
|
|
77
|
+
datasets?: {
|
|
67
78
|
datasetId: string;
|
|
68
79
|
description?: string | undefined;
|
|
69
|
-
} |
|
|
70
|
-
|
|
80
|
+
}[] | undefined;
|
|
81
|
+
queries?: {
|
|
71
82
|
query: Record<string, any>;
|
|
72
83
|
title?: string | undefined;
|
|
73
84
|
explanation?: string | undefined;
|
|
74
|
-
}
|
|
75
|
-
datasetId?: string | undefined;
|
|
76
|
-
sandboxId?: string | undefined;
|
|
77
|
-
title?: string | undefined;
|
|
85
|
+
}[] | undefined;
|
|
78
86
|
instructions?: string | undefined;
|
|
79
87
|
mode?: "schema" | "auto" | undefined;
|
|
80
88
|
output?: "object" | "rows" | undefined;
|
|
@@ -1,29 +1,33 @@
|
|
|
1
1
|
import { tool } from "ai";
|
|
2
2
|
import { z } from "zod";
|
|
3
3
|
import { dataset } from "./dataset.js";
|
|
4
|
-
const
|
|
4
|
+
const fileResourceSchema = z.object({
|
|
5
5
|
kind: z.literal("file"),
|
|
6
6
|
fileId: z.string(),
|
|
7
7
|
description: z.string().optional(),
|
|
8
8
|
});
|
|
9
|
-
const
|
|
9
|
+
const textResourceSchema = z.object({
|
|
10
10
|
kind: z.literal("text"),
|
|
11
11
|
text: z.string(),
|
|
12
12
|
mimeType: z.string().optional(),
|
|
13
13
|
name: z.string().optional(),
|
|
14
14
|
description: z.string().optional(),
|
|
15
15
|
});
|
|
16
|
-
const
|
|
16
|
+
const datasetResourceSchema = z.object({
|
|
17
17
|
kind: z.literal("dataset"),
|
|
18
18
|
datasetId: z.string(),
|
|
19
19
|
description: z.string().optional(),
|
|
20
20
|
});
|
|
21
|
-
const
|
|
21
|
+
const queryResourceSchema = z.object({
|
|
22
22
|
kind: z.literal("query"),
|
|
23
23
|
query: z.record(z.string(), z.any()),
|
|
24
24
|
title: z.string().optional(),
|
|
25
25
|
explanation: z.string().optional(),
|
|
26
26
|
});
|
|
27
|
+
const contextInputSchema = z.union([
|
|
28
|
+
z.object({ id: z.string() }),
|
|
29
|
+
z.object({ key: z.string() }),
|
|
30
|
+
]);
|
|
27
31
|
const datasetSchemaSchema = z.object({
|
|
28
32
|
title: z.string().optional(),
|
|
29
33
|
description: z.string().optional(),
|
|
@@ -33,14 +37,11 @@ const materializeDatasetToolInputSchema = z.object({
|
|
|
33
37
|
datasetId: z.string().optional(),
|
|
34
38
|
sandboxId: z.string().optional(),
|
|
35
39
|
title: z.string().optional(),
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
querySourceSchema,
|
|
42
|
-
]))
|
|
43
|
-
.min(1),
|
|
40
|
+
context: contextInputSchema.optional(),
|
|
41
|
+
files: z.array(fileResourceSchema.omit({ kind: true })).optional(),
|
|
42
|
+
texts: z.array(textResourceSchema.omit({ kind: true })).optional(),
|
|
43
|
+
datasets: z.array(datasetResourceSchema.omit({ kind: true })).optional(),
|
|
44
|
+
queries: z.array(queryResourceSchema.omit({ kind: true })).optional(),
|
|
44
45
|
instructions: z.string().optional(),
|
|
45
46
|
mode: z.enum(["auto", "schema"]).optional(),
|
|
46
47
|
output: z.enum(["rows", "object"]).optional(),
|
|
@@ -49,7 +50,7 @@ const materializeDatasetToolInputSchema = z.object({
|
|
|
49
50
|
});
|
|
50
51
|
export function createMaterializeDatasetTool(params) {
|
|
51
52
|
return tool({
|
|
52
|
-
description: "Materialize a dataset from declarative
|
|
53
|
+
description: "Materialize a dataset from declarative resources. Returns only the target datasetId. Query resources use the preconfigured runtime domain.",
|
|
53
54
|
inputSchema: materializeDatasetToolInputSchema,
|
|
54
55
|
execute: async (input) => {
|
|
55
56
|
let builder = dataset(params.runtime);
|
|
@@ -59,23 +60,33 @@ export function createMaterializeDatasetTool(params) {
|
|
|
59
60
|
if (input.sandboxId?.trim()) {
|
|
60
61
|
builder = builder.sandbox({ sandboxId: input.sandboxId });
|
|
61
62
|
}
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
63
|
+
const materialCount = (input.files?.length ?? 0) +
|
|
64
|
+
(input.texts?.length ?? 0) +
|
|
65
|
+
(input.datasets?.length ?? 0) +
|
|
66
|
+
(input.queries?.length ?? 0);
|
|
67
|
+
if (input.context && materialCount > 0) {
|
|
68
|
+
throw new Error("dataset_context_resource_is_exclusive");
|
|
69
|
+
}
|
|
70
|
+
if (!input.context && materialCount === 0) {
|
|
71
|
+
throw new Error("dataset_context_or_material_required");
|
|
72
|
+
}
|
|
73
|
+
if (input.context) {
|
|
74
|
+
builder = builder.fromContext(input.context);
|
|
75
|
+
}
|
|
76
|
+
for (const resource of input.files ?? []) {
|
|
77
|
+
builder = builder.fromFile(resource);
|
|
78
|
+
}
|
|
79
|
+
for (const resource of input.texts ?? []) {
|
|
80
|
+
builder = builder.fromText(resource);
|
|
81
|
+
}
|
|
82
|
+
for (const resource of input.datasets ?? []) {
|
|
83
|
+
builder = builder.fromDataset(resource);
|
|
84
|
+
}
|
|
85
|
+
for (const resource of input.queries ?? []) {
|
|
75
86
|
builder = builder.fromQuery(params.queryDomain, {
|
|
76
|
-
query:
|
|
77
|
-
title:
|
|
78
|
-
explanation:
|
|
87
|
+
query: resource.query,
|
|
88
|
+
title: resource.title,
|
|
89
|
+
explanation: resource.explanation,
|
|
79
90
|
});
|
|
80
91
|
}
|
|
81
92
|
if (input.output === "object") {
|
package/dist/schema.d.ts
CHANGED
|
@@ -8,8 +8,6 @@ declare const entities: {
|
|
|
8
8
|
updatedAt: import("@instantdb/core").DataAttrDef<number, false, false, false>;
|
|
9
9
|
organizationId: import("@instantdb/core").DataAttrDef<string, false, true, false>;
|
|
10
10
|
title: import("@instantdb/core").DataAttrDef<string, false, false, false>;
|
|
11
|
-
sources: import("@instantdb/core").DataAttrDef<any, false, false, false>;
|
|
12
|
-
sourceKinds: import("@instantdb/core").DataAttrDef<any, false, false, false>;
|
|
13
11
|
instructions: import("@instantdb/core").DataAttrDef<string, false, false, false>;
|
|
14
12
|
analysis: import("@instantdb/core").DataAttrDef<any, false, false, false>;
|
|
15
13
|
schema: import("@instantdb/core").DataAttrDef<any, false, false, false>;
|
|
@@ -47,6 +45,18 @@ declare const links: {
|
|
|
47
45
|
readonly label: "datasets";
|
|
48
46
|
};
|
|
49
47
|
};
|
|
48
|
+
readonly dataset_datasetsContext: {
|
|
49
|
+
readonly forward: {
|
|
50
|
+
readonly on: "dataset_datasets";
|
|
51
|
+
readonly has: "one";
|
|
52
|
+
readonly label: "context";
|
|
53
|
+
};
|
|
54
|
+
readonly reverse: {
|
|
55
|
+
readonly on: "event_contexts";
|
|
56
|
+
readonly has: "many";
|
|
57
|
+
readonly label: "datasets";
|
|
58
|
+
};
|
|
59
|
+
};
|
|
50
60
|
};
|
|
51
61
|
declare const rooms: {};
|
|
52
62
|
export declare const datasetDomain: DomainSchemaResult<typeof entities, typeof links, typeof rooms, {}, "dataset", "dataset">;
|
package/dist/schema.js
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { i } from "@instantdb/core";
|
|
2
2
|
import { domain } from "@ekairos/domain";
|
|
3
|
+
import { eventsDomain } from "@ekairos/events";
|
|
3
4
|
const entities = {
|
|
4
5
|
dataset_datasets: i.entity({
|
|
5
6
|
datasetId: i.string().unique().indexed(),
|
|
@@ -9,8 +10,6 @@ const entities = {
|
|
|
9
10
|
updatedAt: i.number().optional(),
|
|
10
11
|
organizationId: i.string().optional().indexed(),
|
|
11
12
|
title: i.string().optional(),
|
|
12
|
-
sources: i.json().optional(),
|
|
13
|
-
sourceKinds: i.json().optional(),
|
|
14
13
|
instructions: i.string().optional(),
|
|
15
14
|
analysis: i.json().optional(),
|
|
16
15
|
schema: i.json().optional(),
|
|
@@ -32,9 +31,13 @@ const links = {
|
|
|
32
31
|
forward: { on: "dataset_datasets", has: "one", label: "dataFile" },
|
|
33
32
|
reverse: { on: "$files", has: "many", label: "datasets" },
|
|
34
33
|
},
|
|
34
|
+
dataset_datasetsContext: {
|
|
35
|
+
forward: { on: "dataset_datasets", has: "one", label: "context" },
|
|
36
|
+
reverse: { on: "event_contexts", has: "many", label: "datasets" },
|
|
37
|
+
},
|
|
35
38
|
};
|
|
36
39
|
const rooms = {};
|
|
37
|
-
export const datasetDomain = domain("dataset").withSchema({
|
|
40
|
+
export const datasetDomain = domain("dataset").includes(eventsDomain).withSchema({
|
|
38
41
|
entities,
|
|
39
42
|
links,
|
|
40
43
|
rooms,
|
package/dist/service.d.ts
CHANGED
|
@@ -15,8 +15,7 @@ export declare class DatasetService {
|
|
|
15
15
|
private resolveDatasetEntityId;
|
|
16
16
|
createDataset(params: {
|
|
17
17
|
id?: string;
|
|
18
|
-
|
|
19
|
-
sourceKinds?: any;
|
|
18
|
+
contextId?: string;
|
|
20
19
|
instructions?: string;
|
|
21
20
|
status?: string;
|
|
22
21
|
organizationId?: string;
|
|
@@ -64,6 +63,7 @@ export declare class DatasetService {
|
|
|
64
63
|
uploadDatasetOutputFile(params: {
|
|
65
64
|
datasetId: string;
|
|
66
65
|
fileBuffer: Buffer;
|
|
66
|
+
storagePath?: string;
|
|
67
67
|
}): Promise<ServiceResult<{
|
|
68
68
|
fileId: string;
|
|
69
69
|
storagePath: string;
|
package/dist/service.js
CHANGED
|
@@ -28,18 +28,21 @@ export class DatasetService {
|
|
|
28
28
|
async createDataset(params) {
|
|
29
29
|
try {
|
|
30
30
|
const datasetId = params.id ?? createDatasetId();
|
|
31
|
+
const { id: _id, contextId, ...attrs } = params;
|
|
31
32
|
const existing = await this.resolveDatasetEntityId(datasetId);
|
|
32
33
|
const entityId = existing.ok ? existing.data : createDatasetId();
|
|
33
34
|
const mutations = [];
|
|
34
35
|
mutations.push(this.db.tx.dataset_datasets[entityId].update({
|
|
35
36
|
datasetId,
|
|
36
|
-
sources: params.sources ?? "",
|
|
37
37
|
instructions: params.instructions ?? "",
|
|
38
38
|
status: params.status ?? "created",
|
|
39
39
|
createdAt: Date.now(),
|
|
40
40
|
updatedAt: Date.now(),
|
|
41
|
-
...
|
|
41
|
+
...attrs,
|
|
42
42
|
}));
|
|
43
|
+
if (contextId) {
|
|
44
|
+
mutations.push(this.db.tx.dataset_datasets[entityId].link({ context: contextId }));
|
|
45
|
+
}
|
|
43
46
|
await this.db.transact(mutations);
|
|
44
47
|
return { ok: true, data: { datasetId } };
|
|
45
48
|
}
|
|
@@ -308,7 +311,7 @@ export class DatasetService {
|
|
|
308
311
|
}
|
|
309
312
|
async uploadDatasetOutputFile(params) {
|
|
310
313
|
try {
|
|
311
|
-
const storagePath = `/dataset/${params.datasetId}/output.jsonl`;
|
|
314
|
+
const storagePath = params.storagePath ?? `/dataset/${params.datasetId}/output.jsonl`;
|
|
312
315
|
const uploadResult = await this.db.storage.uploadFile(storagePath, params.fileBuffer, {
|
|
313
316
|
contentType: "application/x-ndjson",
|
|
314
317
|
contentDisposition: "output.jsonl",
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
export type
|
|
1
|
+
export type TransformInputPreviewContext = {
|
|
2
2
|
totalRows: number;
|
|
3
3
|
metadata?: {
|
|
4
4
|
description: string;
|
|
@@ -18,5 +18,5 @@ export type TransformSourcePreviewContext = {
|
|
|
18
18
|
interface PreviewOptions {
|
|
19
19
|
headLines?: number;
|
|
20
20
|
}
|
|
21
|
-
export declare function
|
|
21
|
+
export declare function generateInputPreview(runtime: any, sandboxId: string, inputPath: string, datasetId: string, options?: PreviewOptions): Promise<TransformInputPreviewContext>;
|
|
22
22
|
export {};
|
|
@@ -17,7 +17,7 @@ async function runPythonSnippet(runtime, sandboxId, datasetId, scriptName, code,
|
|
|
17
17
|
stderr,
|
|
18
18
|
};
|
|
19
19
|
}
|
|
20
|
-
export async function
|
|
20
|
+
export async function generateInputPreview(runtime, sandboxId, inputPath, datasetId, options = {}) {
|
|
21
21
|
const context = {
|
|
22
22
|
totalRows: 0,
|
|
23
23
|
};
|
|
@@ -41,7 +41,7 @@ try:
|
|
|
41
41
|
except Exception as e:
|
|
42
42
|
print(str(e))
|
|
43
43
|
`;
|
|
44
|
-
const meta = await runPythonSnippet(runtime, sandboxId, datasetId, "jsonl_count", countScript, [
|
|
44
|
+
const meta = await runPythonSnippet(runtime, sandboxId, datasetId, "jsonl_count", countScript, [inputPath], "Counts number of JSONL records with type='row'");
|
|
45
45
|
context.metadata = meta;
|
|
46
46
|
try {
|
|
47
47
|
if (meta.stdout) {
|
|
@@ -76,7 +76,7 @@ try:
|
|
|
76
76
|
except Exception as e:
|
|
77
77
|
print(str(e))
|
|
78
78
|
`;
|
|
79
|
-
const head = await runPythonSnippet(runtime, sandboxId, datasetId, "jsonl_head", headScript, [
|
|
79
|
+
const head = await runPythonSnippet(runtime, sandboxId, datasetId, "jsonl_head", headScript, [inputPath, String(headLines)], `Reads the first ${headLines} JSONL row records`);
|
|
80
80
|
context.head = head;
|
|
81
81
|
return context;
|
|
82
82
|
}
|
|
@@ -9,7 +9,7 @@ function buildRole() {
|
|
|
9
9
|
function buildGoal() {
|
|
10
10
|
let xml = create()
|
|
11
11
|
.ele("Goal")
|
|
12
|
-
.txt("Transform the
|
|
12
|
+
.txt("Transform the input dataset(s) (JSONL with {type:'row', data:{...}} per line) into a new dataset strictly matching the output schema. Save to output.jsonl in the dataset workstation. Each line must remain a single JSON object representing one record. You may need to combine, filter, or reshape data from multiple input datasets.")
|
|
13
13
|
.up();
|
|
14
14
|
return xml.end({ prettyPrint: true, headless: true });
|
|
15
15
|
}
|
|
@@ -17,26 +17,26 @@ function buildContextSection(context) {
|
|
|
17
17
|
let xml = create()
|
|
18
18
|
.ele("Context")
|
|
19
19
|
.ele("DatasetId").txt(context.datasetId).up();
|
|
20
|
-
let
|
|
21
|
-
for (const sourceId of context.
|
|
22
|
-
|
|
20
|
+
let inputsXml = create().ele("InputDatasets");
|
|
21
|
+
for (const sourceId of context.inputDatasetIds) {
|
|
22
|
+
inputsXml = inputsXml.ele("InputDatasetId").txt(sourceId).up();
|
|
23
23
|
}
|
|
24
|
-
xml = xml.import(
|
|
24
|
+
xml = xml.import(inputsXml.first());
|
|
25
25
|
let sandboxXml = create().ele("Sandbox");
|
|
26
|
-
for (const
|
|
27
|
-
sandboxXml = sandboxXml.ele("
|
|
28
|
-
.ele("DatasetId").txt(
|
|
29
|
-
.ele("Path").txt(
|
|
26
|
+
for (const inputPathInfo of context.sandboxConfig.inputPaths) {
|
|
27
|
+
sandboxXml = sandboxXml.ele("InputFile")
|
|
28
|
+
.ele("DatasetId").txt(inputPathInfo.datasetId).up()
|
|
29
|
+
.ele("Path").txt(inputPathInfo.path).up()
|
|
30
30
|
.up();
|
|
31
31
|
}
|
|
32
32
|
sandboxXml = sandboxXml.ele("OutputPath").txt(context.sandboxConfig.outputPath).up();
|
|
33
33
|
xml = xml.import(sandboxXml.first());
|
|
34
|
-
if (context.
|
|
35
|
-
let previewsXml = create().ele("
|
|
36
|
-
for (const
|
|
37
|
-
const sp =
|
|
38
|
-
let px = create().ele("
|
|
39
|
-
.ele("DatasetId").txt(
|
|
34
|
+
if (context.inputPreviews && context.inputPreviews.length > 0) {
|
|
35
|
+
let previewsXml = create().ele("InputPreviews");
|
|
36
|
+
for (const inputPreviewInfo of context.inputPreviews) {
|
|
37
|
+
const sp = inputPreviewInfo.preview;
|
|
38
|
+
let px = create().ele("InputPreview")
|
|
39
|
+
.ele("DatasetId").txt(inputPreviewInfo.datasetId).up()
|
|
40
40
|
.ele("TotalRows").txt(String(sp.totalRows)).up();
|
|
41
41
|
if (sp.metadata) {
|
|
42
42
|
const m = sp.metadata;
|
|
@@ -86,21 +86,21 @@ function buildOutputSchemaSection(context) {
|
|
|
86
86
|
}
|
|
87
87
|
function buildInstructions(context) {
|
|
88
88
|
const outputPath = context.sandboxConfig.outputPath;
|
|
89
|
-
const
|
|
90
|
-
? "You have multiple
|
|
89
|
+
const multipleInputsNote = context.inputDatasetIds.length > 1
|
|
90
|
+
? "You have multiple input datasets available. You may need to read, join, filter, or combine data from them to produce the output."
|
|
91
91
|
: "";
|
|
92
92
|
let xml = create()
|
|
93
93
|
.ele("Instructions")
|
|
94
94
|
.ele("Workflow")
|
|
95
|
-
.ele("Step", { number: "1", name: "Inspect
|
|
96
|
-
.ele("Action").txt(`Review
|
|
95
|
+
.ele("Step", { number: "1", name: "Inspect Inputs" })
|
|
96
|
+
.ele("Action").txt(`Review InputPreviews to understand current record structures (data fields, shapes, edge cases). ${multipleInputsNote}`).up()
|
|
97
97
|
.up()
|
|
98
98
|
.ele("Step", { number: "2", name: "Plan Mapping" })
|
|
99
|
-
.ele("Action").txt("Plan a deterministic mapping from
|
|
100
|
-
.ele("Note").txt("If fields are missing, set defaults; if types differ, coerce consistently. When working with multiple
|
|
99
|
+
.ele("Action").txt("Plan a deterministic mapping from input data fields to the output schema fields (normalize names, types, and formats).").up()
|
|
100
|
+
.ele("Note").txt("If fields are missing, set defaults; if types differ, coerce consistently. When working with multiple inputs, decide how to combine or relate them. Output field names must remain exactly as declared by the output schema.").up()
|
|
101
101
|
.up()
|
|
102
102
|
.ele("Step", { number: "3", name: "Transform" })
|
|
103
|
-
.ele("Action").txt("Use executeCommand to run a Python script that reads
|
|
103
|
+
.ele("Action").txt("Use executeCommand to run a Python script that reads input JSONL file(s) and writes transformed records to output.jsonl. Keep line-per-record JSON objects with { 'type': 'row', 'data': { ... } }.").up()
|
|
104
104
|
.ele("Requirement").txt(`Write file to: ${outputPath}`).up()
|
|
105
105
|
.ele("Requirement").txt("Every data object MUST use the exact property names from OutputSchema required/properties keys. Do not translate, localize, rename, or infer alternative field names.").up()
|
|
106
106
|
.ele("Requirement").txt("Do not print large data to stdout; only progress and summaries.").up()
|
|
@@ -112,12 +112,12 @@ function buildInstructions(context) {
|
|
|
112
112
|
.up()
|
|
113
113
|
.ele("Rules")
|
|
114
114
|
.ele("Rule").txt("Output must strictly match the output schema for each record in data.").up()
|
|
115
|
-
.ele("Rule").txt("OutputSchema property names are authoritative. Field names are a technical contract; only field values may preserve
|
|
115
|
+
.ele("Rule").txt("OutputSchema property names are authoritative. Field names are a technical contract; only field values may preserve input language.").up()
|
|
116
116
|
.ele("Rule").txt("Each line in output.jsonl must be a standalone JSON object with {type:'row', data:{...}}.").up()
|
|
117
117
|
.ele("Rule").txt("Do not include headers, summaries, or metadata as records.").up()
|
|
118
|
-
.ele("Rule").txt("Be robust to malformed lines in
|
|
118
|
+
.ele("Rule").txt("Be robust to malformed lines in input: skip or sanitize, but do not crash.").up()
|
|
119
119
|
.up()
|
|
120
|
-
.ele("CurrentTask").txt("Transform
|
|
120
|
+
.ele("CurrentTask").txt("Transform input dataset(s) to match OutputSchema and write output.jsonl, then complete.").up()
|
|
121
121
|
.up();
|
|
122
122
|
return xml.end({ prettyPrint: true, headless: true });
|
|
123
123
|
}
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import { type ContextReactor } from "@ekairos/events";
|
|
2
|
-
import type { TransformDatasetRunOptions, TransformSandboxState,
|
|
2
|
+
import type { TransformDatasetRunOptions, TransformSandboxState, TransformInputPreviewContext } from "./transform-dataset.types.js";
|
|
3
3
|
export type { TransformDatasetAgentParams, TransformDatasetContext, TransformDatasetResult, TransformDatasetRunOptions, TransformPromptContext, TransformSandboxState, } from "./transform-dataset.types.js";
|
|
4
4
|
export declare function createTransformDatasetContext<Env extends {
|
|
5
5
|
orgId: string;
|
|
6
6
|
}>(params: {
|
|
7
|
-
|
|
7
|
+
inputDatasetIds: string[];
|
|
8
8
|
outputSchema: any;
|
|
9
9
|
instructions?: string;
|
|
10
10
|
datasetId?: string;
|
|
@@ -12,9 +12,9 @@ export declare function createTransformDatasetContext<Env extends {
|
|
|
12
12
|
sandboxId?: string;
|
|
13
13
|
reactor?: ContextReactor<any, any>;
|
|
14
14
|
sandboxState?: TransformSandboxState;
|
|
15
|
-
|
|
15
|
+
inputPreviews?: Array<{
|
|
16
16
|
datasetId: string;
|
|
17
|
-
preview:
|
|
17
|
+
preview: TransformInputPreviewContext;
|
|
18
18
|
}>;
|
|
19
19
|
}): {
|
|
20
20
|
datasetId: string;
|
|
@@ -4,7 +4,7 @@ import { createCompleteDatasetTool, didCompleteDatasetSucceed, getDatasetFatalFa
|
|
|
4
4
|
import { datasetUpdateSchemaStep } from "../dataset/steps.js";
|
|
5
5
|
import { getDatasetOutputPath } from "../datasetFiles.js";
|
|
6
6
|
import { createExecuteCommandTool } from "../executeCommand.tool.js";
|
|
7
|
-
import { buildTransformDatasetPromptStep,
|
|
7
|
+
import { buildTransformDatasetPromptStep, ensureTransformInputsInSandboxStep, generateTransformInputPreviewsStep, } from "./transform-dataset.steps.js";
|
|
8
8
|
import { createDatasetId } from "../id.js";
|
|
9
9
|
async function awaitContextRun(run) {
|
|
10
10
|
if (!run)
|
|
@@ -22,12 +22,12 @@ function createTransformDatasetContextDefinition(params) {
|
|
|
22
22
|
.context(async (stored, _env, runtime) => {
|
|
23
23
|
const previous = stored?.content ?? {};
|
|
24
24
|
const sandboxState = previous?.sandboxState ??
|
|
25
|
-
params.sandboxState ?? { initialized: false,
|
|
25
|
+
params.sandboxState ?? { initialized: false, inputPaths: [] };
|
|
26
26
|
const datasetId = previous?.datasetId ?? fallbackDatasetId ?? "";
|
|
27
|
-
const
|
|
28
|
-
? previous.
|
|
29
|
-
: Array.isArray(params.
|
|
30
|
-
? params.
|
|
27
|
+
const inputDatasetIds = Array.isArray(previous?.inputDatasetIds)
|
|
28
|
+
? previous.inputDatasetIds
|
|
29
|
+
: Array.isArray(params.inputDatasetIds)
|
|
30
|
+
? params.inputDatasetIds
|
|
31
31
|
: [];
|
|
32
32
|
const outputSchema = previous?.outputSchema ?? params.outputSchema;
|
|
33
33
|
const instructions = previous?.instructions ?? params.instructions;
|
|
@@ -35,8 +35,8 @@ function createTransformDatasetContextDefinition(params) {
|
|
|
35
35
|
if (!datasetId) {
|
|
36
36
|
throw new Error("dataset_id_required");
|
|
37
37
|
}
|
|
38
|
-
if (
|
|
39
|
-
throw new Error("
|
|
38
|
+
if (inputDatasetIds.length === 0) {
|
|
39
|
+
throw new Error("dataset_transform_inputs_required");
|
|
40
40
|
}
|
|
41
41
|
if (!outputSchema) {
|
|
42
42
|
throw new Error("dataset_transform_schema_required");
|
|
@@ -44,26 +44,26 @@ function createTransformDatasetContextDefinition(params) {
|
|
|
44
44
|
if (!sandboxId) {
|
|
45
45
|
throw new Error("dataset_sandbox_required");
|
|
46
46
|
}
|
|
47
|
-
const initialized = sandboxState.initialized && Array.isArray(sandboxState.
|
|
47
|
+
const initialized = sandboxState.initialized && Array.isArray(sandboxState.inputPaths)
|
|
48
48
|
? {
|
|
49
|
-
|
|
49
|
+
inputPaths: sandboxState.inputPaths,
|
|
50
50
|
outputPath: previous?.sandboxConfig?.outputPath ?? getDatasetOutputPath(datasetId),
|
|
51
51
|
state: sandboxState,
|
|
52
52
|
}
|
|
53
|
-
: await
|
|
53
|
+
: await ensureTransformInputsInSandboxStep({
|
|
54
54
|
runtime,
|
|
55
55
|
sandboxId,
|
|
56
56
|
datasetId,
|
|
57
|
-
|
|
57
|
+
inputDatasetIds,
|
|
58
58
|
state: sandboxState,
|
|
59
59
|
});
|
|
60
|
-
let
|
|
61
|
-
if (!
|
|
62
|
-
|
|
60
|
+
let inputPreviews = previous?.inputPreviews ?? params.inputPreviews ?? undefined;
|
|
61
|
+
if (!inputPreviews) {
|
|
62
|
+
inputPreviews = await generateTransformInputPreviewsStep({
|
|
63
63
|
runtime,
|
|
64
64
|
sandboxId,
|
|
65
65
|
datasetId,
|
|
66
|
-
|
|
66
|
+
inputPaths: initialized.inputPaths,
|
|
67
67
|
});
|
|
68
68
|
}
|
|
69
69
|
await datasetUpdateSchemaStep({
|
|
@@ -74,13 +74,13 @@ function createTransformDatasetContextDefinition(params) {
|
|
|
74
74
|
});
|
|
75
75
|
const promptContext = {
|
|
76
76
|
datasetId,
|
|
77
|
-
|
|
77
|
+
inputDatasetIds,
|
|
78
78
|
outputSchema,
|
|
79
79
|
sandboxConfig: {
|
|
80
|
-
|
|
80
|
+
inputPaths: initialized.inputPaths,
|
|
81
81
|
outputPath: initialized.outputPath,
|
|
82
82
|
},
|
|
83
|
-
|
|
83
|
+
inputPreviews: inputPreviews.length > 0 ? inputPreviews : undefined,
|
|
84
84
|
errors: [],
|
|
85
85
|
};
|
|
86
86
|
const basePrompt = await buildTransformDatasetPromptStep({
|
|
@@ -100,14 +100,14 @@ function createTransformDatasetContextDefinition(params) {
|
|
|
100
100
|
return {
|
|
101
101
|
...previous,
|
|
102
102
|
datasetId,
|
|
103
|
-
|
|
103
|
+
inputDatasetIds,
|
|
104
104
|
outputSchema,
|
|
105
105
|
instructions,
|
|
106
106
|
sandboxId,
|
|
107
107
|
sandboxState: initialized.state,
|
|
108
108
|
system,
|
|
109
109
|
sandboxConfig: {
|
|
110
|
-
|
|
110
|
+
inputPaths: initialized.inputPaths,
|
|
111
111
|
outputPath: initialized.outputPath,
|
|
112
112
|
},
|
|
113
113
|
};
|
|
@@ -159,7 +159,7 @@ function createTransformDatasetContextDefinition(params) {
|
|
|
159
159
|
export function createTransformDatasetContext(params) {
|
|
160
160
|
const datasetId = params.datasetId ?? createDatasetId();
|
|
161
161
|
const { context } = createTransformDatasetContextDefinition({
|
|
162
|
-
|
|
162
|
+
inputDatasetIds: params.inputDatasetIds,
|
|
163
163
|
outputSchema: params.outputSchema,
|
|
164
164
|
instructions: params.instructions,
|
|
165
165
|
datasetId,
|
|
@@ -167,14 +167,14 @@ export function createTransformDatasetContext(params) {
|
|
|
167
167
|
sandboxId: params.sandboxId,
|
|
168
168
|
reactor: params.reactor,
|
|
169
169
|
sandboxState: params.sandboxState,
|
|
170
|
-
|
|
170
|
+
inputPreviews: params.inputPreviews,
|
|
171
171
|
});
|
|
172
172
|
return {
|
|
173
173
|
datasetId,
|
|
174
174
|
async transform(runtime, options = {}) {
|
|
175
|
-
const datasetCountText = params.
|
|
176
|
-
? "the
|
|
177
|
-
: `${params.
|
|
175
|
+
const datasetCountText = params.inputDatasetIds.length === 1
|
|
176
|
+
? "the input dataset"
|
|
177
|
+
: `${params.inputDatasetIds.length} input datasets`;
|
|
178
178
|
const triggerEvent = {
|
|
179
179
|
id: createDatasetId(),
|
|
180
180
|
type: INPUT_TEXT_ITEM_TYPE,
|
|
@@ -195,7 +195,6 @@ export function createTransformDatasetContext(params) {
|
|
|
195
195
|
context: { key: `dataset:${datasetId}` },
|
|
196
196
|
durable: options.durable ?? false,
|
|
197
197
|
options: {
|
|
198
|
-
silent: true,
|
|
199
198
|
preventClose: true,
|
|
200
199
|
sendFinish: false,
|
|
201
200
|
maxIterations: 20,
|
|
@@ -204,12 +203,12 @@ export function createTransformDatasetContext(params) {
|
|
|
204
203
|
__initialContent: {
|
|
205
204
|
...(options.initialContent ?? {}),
|
|
206
205
|
datasetId,
|
|
207
|
-
|
|
206
|
+
inputDatasetIds: params.inputDatasetIds,
|
|
208
207
|
outputSchema: params.outputSchema,
|
|
209
208
|
instructions: params.instructions,
|
|
210
209
|
sandboxId: params.sandboxId ?? "",
|
|
211
|
-
sandboxState: params.sandboxState ?? { initialized: false,
|
|
212
|
-
|
|
210
|
+
sandboxState: params.sandboxState ?? { initialized: false, inputPaths: [] },
|
|
211
|
+
inputPreviews: params.inputPreviews,
|
|
213
212
|
},
|
|
214
213
|
});
|
|
215
214
|
await awaitContextRun(shell.run);
|