@ekairos/dataset 1.22.78-beta.development.0 → 1.22.80-beta.development.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -89,7 +89,8 @@ function buildErrorsSection(errors) {
89
89
  return null;
90
90
  }
91
91
  let xml = create()
92
- .ele("PreviousErrors");
92
+ .ele("PreviousErrors")
93
+ .ele("Instruction").txt("Treat these as repair feedback from the previous validation attempt. Rewrite output.jsonl from the schema contract; do not patch source column names into schema keys piecemeal.").up();
93
94
  for (const error of errors) {
94
95
  xml = xml.ele("Error").txt(error).up();
95
96
  }
@@ -114,16 +115,110 @@ function buildContextSection(context) {
114
115
  xml = xml.up();
115
116
  return xml.end({ prettyPrint: true, headless: true });
116
117
  }
118
+ function asRecord(value) {
119
+ return value && typeof value === "object" && !Array.isArray(value)
120
+ ? value
121
+ : null;
122
+ }
123
+ function getSchemaObject(context) {
124
+ return asRecord(context.schema?.schema);
125
+ }
126
+ function joinSchemaPath(basePath, key) {
127
+ return basePath === "$" ? `$.${key}` : `${basePath}.${key}`;
128
+ }
129
+ function collectSchemaContract(schema, path = "$", contract = {
130
+ requiredPaths: [],
131
+ propertyPaths: [],
132
+ enumConstraints: [],
133
+ closedObjectPaths: [],
134
+ }) {
135
+ const record = asRecord(schema);
136
+ if (!record) {
137
+ return contract;
138
+ }
139
+ if (Array.isArray(record.enum)) {
140
+ contract.enumConstraints.push({
141
+ path,
142
+ values: record.enum.map((value) => JSON.stringify(value)),
143
+ });
144
+ }
145
+ const properties = asRecord(record.properties);
146
+ if (properties) {
147
+ if (record.additionalProperties === false) {
148
+ contract.closedObjectPaths.push(path);
149
+ }
150
+ const required = Array.isArray(record.required)
151
+ ? record.required.filter((value) => typeof value === "string")
152
+ : [];
153
+ for (const key of required) {
154
+ contract.requiredPaths.push(joinSchemaPath(path, key));
155
+ }
156
+ for (const [key, childSchema] of Object.entries(properties)) {
157
+ const childPath = joinSchemaPath(path, key);
158
+ contract.propertyPaths.push(childPath);
159
+ collectSchemaContract(childSchema, childPath, contract);
160
+ }
161
+ }
162
+ if (record.items) {
163
+ collectSchemaContract(record.items, `${path}[]`, contract);
164
+ }
165
+ for (const keyword of ["oneOf", "anyOf", "allOf"]) {
166
+ if (Array.isArray(record[keyword])) {
167
+ for (const childSchema of record[keyword]) {
168
+ collectSchemaContract(childSchema, path, contract);
169
+ }
170
+ }
171
+ }
172
+ return contract;
173
+ }
174
+ function appendLimitedList(xml, elementName, itemName, values, maxItems) {
175
+ let node = xml.ele(elementName);
176
+ for (const value of values.slice(0, maxItems)) {
177
+ node = node.ele(itemName).txt(value).up();
178
+ }
179
+ if (values.length > maxItems) {
180
+ node = node.ele("Truncated").txt(String(values.length - maxItems)).up();
181
+ }
182
+ return node.up();
183
+ }
117
184
  function buildSchemaSection(context) {
118
- if (!context.schema) {
185
+ const schema = getSchemaObject(context);
186
+ if (!context.schema || !schema) {
119
187
  return "";
120
188
  }
189
+ const contract = collectSchemaContract(schema);
121
190
  let xml = create()
122
191
  .com("Schema section: This defines the structure of ONE RECORD (row). Each line in the JSONL output must conform to this schema.")
123
192
  .ele("Schema")
124
193
  .ele("Title").txt(context.schema.title || "").up()
125
- .ele("Description").txt(context.schema.description || "").up()
126
- .ele("JsonSchema").txt(JSON.stringify(context.schema.schema, null, 2)).up()
194
+ .ele("Description").txt(context.schema.description || "").up();
195
+ xml = xml
196
+ .ele("SchemaContract")
197
+ .ele("Purpose").txt("Compact output contract derived from JSON Schema. Use this before writing output.jsonl.").up()
198
+ .ele("Rule").txt("Use only schema property keys in data objects. Source headers are input labels, not output keys.").up()
199
+ .ele("Rule").txt("Required paths are required everywhere, including nested objects and array items.").up()
200
+ .ele("Rule").txt("Enum fields must use exactly one of the listed literal values. Normalize source labels to the closest valid enum literal; never emit a value outside the enum.").up();
201
+ xml = appendLimitedList(xml, "RequiredPaths", "Path", contract.requiredPaths, 120);
202
+ xml = appendLimitedList(xml, "PropertyPaths", "Path", contract.propertyPaths, 160);
203
+ let enumsXml = xml.ele("EnumConstraints");
204
+ for (const constraint of contract.enumConstraints.slice(0, 80)) {
205
+ let enumXml = enumsXml.ele("Enum", { path: constraint.path });
206
+ for (const value of constraint.values.slice(0, 80)) {
207
+ enumXml = enumXml.ele("Value").txt(value).up();
208
+ }
209
+ if (constraint.values.length > 80) {
210
+ enumXml = enumXml.ele("Truncated").txt(String(constraint.values.length - 80)).up();
211
+ }
212
+ enumsXml = enumXml.up();
213
+ }
214
+ if (contract.enumConstraints.length > 80) {
215
+ enumsXml = enumsXml.ele("Truncated").txt(String(contract.enumConstraints.length - 80)).up();
216
+ }
217
+ xml = enumsXml.up();
218
+ xml = appendLimitedList(xml, "ClosedObjectPaths", "Path", contract.closedObjectPaths, 80);
219
+ xml = xml
220
+ .up()
221
+ .ele("JsonSchema").txt(JSON.stringify(schema, null, 2)).up()
127
222
  .up();
128
223
  return xml.end({ prettyPrint: true, headless: true });
129
224
  }
@@ -148,6 +243,9 @@ function buildInstructions(context) {
148
243
  .ele("Requirements")
149
244
  .ele("Requirement").txt("Every output row must conform exactly to the provided schema").up()
150
245
  .ele("Requirement").txt("Every data object MUST use the exact property names from the provided JSON Schema required/properties keys").up()
246
+ .ele("Requirement").txt("Build a schema-first mapping from source columns to schema fields before writing output.jsonl. Do not use raw source headers as JSON keys unless they are exactly schema keys").up()
247
+ .ele("Requirement").txt("For nested required fields, populate the required child keys inside each nested object or array item; top-level validity is not enough").up()
248
+ .ele("Requirement").txt("For enum fields, emit exactly one allowed enum literal from SchemaContract; normalize labels or abbreviations into allowed literals").up()
151
249
  .ele("Requirement").txt("Do not translate, localize, rename, camelize differently, or infer alternative field names. Field names are a technical contract; only field values may preserve the source language").up()
152
250
  .ele("Requirement").txt("Do not call generateSchema when a schema is already provided").up()
153
251
  .up()
@@ -173,6 +271,7 @@ function buildInstructions(context) {
173
271
  .ele("Requirement").txt("Parse ALL data rows/records from the file (exclude header sections and metadata)").up()
174
272
  .ele("Requirement").txt("Output JSONL format: each line is {\"type\": \"row\", \"data\": {...record...}}").up()
175
273
  .ele("Requirement").txt("When a schema is provided, each data object must contain the exact required schema keys and must not use translated or synonymous keys").up()
274
+ .ele("Requirement").txt("When validation returns zero valid rows, treat the previous output as structurally wrong and rewrite output.jsonl from the SchemaContract, not by applying small patches").up()
176
275
  .ele("Requirement").txt("Extract ONLY data records; skip any header lines, summary sections, or file metadata").up()
177
276
  .ele("Requirement").txt(`Save output to: ${outputPath}`).up()
178
277
  .ele("Requirement").txt("Use descriptive scriptName in snake_case (e.g., 'parse_csv_to_jsonl')").up()
@@ -207,6 +306,11 @@ export function buildFileDatasetPrompt(context) {
207
306
  sections.push("");
208
307
  sections.push(buildContextSection(context));
209
308
  sections.push("");
309
+ const schemaSection = buildSchemaSection(context);
310
+ if (schemaSection) {
311
+ sections.push(schemaSection);
312
+ sections.push("");
313
+ }
210
314
  sections.push(buildInstructions(context));
211
315
  return sections.join("\n");
212
316
  }
@@ -1,9 +1,8 @@
1
- import { getDatasetWorkstation } from "../datasetFiles.js";
1
+ import { getDatasetScriptsDir } from "../datasetFiles.js";
2
2
  import { runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep } from "../sandbox/steps.js";
3
3
  const DEFAULT_HEAD_LINES = 50;
4
4
  async function runPythonSnippet(runtime, sandboxId, datasetId, scriptName, code, args, description) {
5
- const workstation = getDatasetWorkstation(datasetId);
6
- const scriptPath = `${workstation}/${scriptName}.py`;
5
+ const scriptPath = `${getDatasetScriptsDir(datasetId)}/${scriptName}.py`;
7
6
  await writeDatasetSandboxFilesStep({
8
7
  runtime,
9
8
  sandboxId,
@@ -1,5 +1,5 @@
1
1
  import { type ContextReactor } from "@ekairos/events";
2
- import type { TransformDatasetRunOptions } from "./transform-dataset.types.js";
2
+ import type { TransformDatasetRunOptions, TransformSandboxState, TransformSourcePreviewContext } from "./transform-dataset.types.js";
3
3
  export type { TransformDatasetAgentParams, TransformDatasetContext, TransformDatasetResult, TransformDatasetRunOptions, TransformPromptContext, TransformSandboxState, } from "./transform-dataset.types.js";
4
4
  export declare function createTransformDatasetContext<Env extends {
5
5
  orgId: string;
@@ -11,6 +11,11 @@ export declare function createTransformDatasetContext<Env extends {
11
11
  model?: string;
12
12
  sandboxId?: string;
13
13
  reactor?: ContextReactor<any, any>;
14
+ sandboxState?: TransformSandboxState;
15
+ sourcePreviews?: Array<{
16
+ datasetId: string;
17
+ preview: TransformSourcePreviewContext;
18
+ }>;
14
19
  }): {
15
20
  datasetId: string;
16
21
  transform(runtime: {
@@ -2,6 +2,7 @@ import { createContext, INPUT_TEXT_ITEM_TYPE, WEB_CHANNEL, } from "@ekairos/even
2
2
  import { createClearDatasetTool } from "../clearDataset.tool.js";
3
3
  import { createCompleteDatasetTool, didCompleteDatasetSucceed, getDatasetFatalFailure, } from "../completeDataset.tool.js";
4
4
  import { datasetUpdateSchemaStep } from "../dataset/steps.js";
5
+ import { getDatasetOutputPath } from "../datasetFiles.js";
5
6
  import { createExecuteCommandTool } from "../executeCommand.tool.js";
6
7
  import { buildTransformDatasetPromptStep, ensureTransformSourcesInSandboxStep, generateTransformSourcePreviewsStep, } from "./transform-dataset.steps.js";
7
8
  import { createDatasetId } from "../id.js";
@@ -20,7 +21,8 @@ function createTransformDatasetContextDefinition(params) {
20
21
  let contextBuilder = createContext("dataset.transform")
21
22
  .context(async (stored, _env, runtime) => {
22
23
  const previous = stored?.content ?? {};
23
- const sandboxState = previous?.sandboxState ?? { initialized: false, sourcePaths: [] };
24
+ const sandboxState = previous?.sandboxState ??
25
+ params.sandboxState ?? { initialized: false, sourcePaths: [] };
24
26
  const datasetId = previous?.datasetId ?? fallbackDatasetId ?? "";
25
27
  const sourceDatasetIds = Array.isArray(previous?.sourceDatasetIds)
26
28
  ? previous.sourceDatasetIds
@@ -42,19 +44,28 @@ function createTransformDatasetContextDefinition(params) {
42
44
  if (!sandboxId) {
43
45
  throw new Error("dataset_sandbox_required");
44
46
  }
45
- const initialized = await ensureTransformSourcesInSandboxStep({
46
- runtime,
47
- sandboxId,
48
- datasetId,
49
- sourceDatasetIds,
50
- state: sandboxState,
51
- });
52
- const sourcePreviews = await generateTransformSourcePreviewsStep({
53
- runtime,
54
- sandboxId,
55
- datasetId,
56
- sourcePaths: initialized.sourcePaths,
57
- });
47
+ const initialized = sandboxState.initialized && Array.isArray(sandboxState.sourcePaths)
48
+ ? {
49
+ sourcePaths: sandboxState.sourcePaths,
50
+ outputPath: previous?.sandboxConfig?.outputPath ?? getDatasetOutputPath(datasetId),
51
+ state: sandboxState,
52
+ }
53
+ : await ensureTransformSourcesInSandboxStep({
54
+ runtime,
55
+ sandboxId,
56
+ datasetId,
57
+ sourceDatasetIds,
58
+ state: sandboxState,
59
+ });
60
+ let sourcePreviews = previous?.sourcePreviews ?? params.sourcePreviews ?? undefined;
61
+ if (!sourcePreviews) {
62
+ sourcePreviews = await generateTransformSourcePreviewsStep({
63
+ runtime,
64
+ sandboxId,
65
+ datasetId,
66
+ sourcePaths: initialized.sourcePaths,
67
+ });
68
+ }
58
69
  await datasetUpdateSchemaStep({
59
70
  runtime,
60
71
  datasetId,
@@ -155,6 +166,8 @@ export function createTransformDatasetContext(params) {
155
166
  model: params.model,
156
167
  sandboxId: params.sandboxId,
157
168
  reactor: params.reactor,
169
+ sandboxState: params.sandboxState,
170
+ sourcePreviews: params.sourcePreviews,
158
171
  });
159
172
  return {
160
173
  datasetId,
@@ -189,12 +202,14 @@ export function createTransformDatasetContext(params) {
189
202
  maxModelSteps: 5,
190
203
  },
191
204
  __initialContent: {
205
+ ...(options.initialContent ?? {}),
192
206
  datasetId,
193
207
  sourceDatasetIds: params.sourceDatasetIds,
194
208
  outputSchema: params.outputSchema,
195
209
  instructions: params.instructions,
196
210
  sandboxId: params.sandboxId ?? "",
197
- sandboxState: { initialized: false, sourcePaths: [] },
211
+ sandboxState: params.sandboxState ?? { initialized: false, sourcePaths: [] },
212
+ sourcePreviews: params.sourcePreviews,
198
213
  },
199
214
  });
200
215
  await awaitContextRun(shell.run);
@@ -1,4 +1,4 @@
1
- import { getDatasetOutputPath, getDatasetWorkstation } from "../datasetFiles.js";
1
+ import { getDatasetOutputPath, getDatasetSourcesDir, getDatasetStandardDirs, } from "../datasetFiles.js";
2
2
  import { datasetReadOutputJsonlStep } from "../dataset/steps.js";
3
3
  import { runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep } from "../sandbox/steps.js";
4
4
  import { generateSourcePreview } from "./filepreview.js";
@@ -12,16 +12,15 @@ export async function ensureTransformSourcesInSandboxStep(params) {
12
12
  state: params.state,
13
13
  };
14
14
  }
15
- const workstation = getDatasetWorkstation(params.datasetId);
16
15
  await runDatasetSandboxCommandStep({
17
16
  runtime: params.runtime,
18
17
  sandboxId: params.sandboxId,
19
18
  cmd: "mkdir",
20
- args: ["-p", workstation],
19
+ args: ["-p", ...getDatasetStandardDirs(params.datasetId)],
21
20
  });
22
21
  const sourcePaths = [];
23
22
  for (const sourceDatasetId of params.sourceDatasetIds) {
24
- const sourcePath = `${workstation}/source_${sourceDatasetId}.jsonl`;
23
+ const sourcePath = `${getDatasetSourcesDir(params.datasetId)}/source_${sourceDatasetId}.jsonl`;
25
24
  const source = await datasetReadOutputJsonlStep({
26
25
  runtime: params.runtime,
27
26
  datasetId: sourceDatasetId,
@@ -35,10 +35,16 @@ export type TransformDatasetAgentParams = {
35
35
  model?: string;
36
36
  sandboxId?: string;
37
37
  reactor?: ContextReactor<any, any>;
38
+ sandboxState?: TransformSandboxState;
39
+ sourcePreviews?: Array<{
40
+ datasetId: string;
41
+ preview: TransformSourcePreviewContext;
42
+ }>;
38
43
  };
39
44
  export type TransformDatasetRunOptions = {
40
45
  prompt?: string;
41
46
  durable?: boolean;
47
+ initialContent?: Record<string, any>;
42
48
  };
43
49
  export type TransformDatasetResult = {
44
50
  id: string;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ekairos/dataset",
3
- "version": "1.22.78-beta.development.0",
3
+ "version": "1.22.80-beta.development.0",
4
4
  "description": "Pulzar Dataset Tools",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -65,9 +65,9 @@
65
65
  "test:ai-sdk:instant": "vitest run -c vitest.codex.config.mts src/tests/materializeDataset.ai-sdk.instant.test.ts"
66
66
  },
67
67
  "dependencies": {
68
- "@ekairos/domain": "^1.22.78-beta.development.0",
69
- "@ekairos/events": "^1.22.78-beta.development.0",
70
- "@ekairos/sandbox": "^1.22.78-beta.development.0",
68
+ "@ekairos/domain": "^1.22.80-beta.development.0",
69
+ "@ekairos/events": "^1.22.80-beta.development.0",
70
+ "@ekairos/sandbox": "^1.22.80-beta.development.0",
71
71
  "@instantdb/admin": "0.22.158",
72
72
  "@instantdb/core": "0.22.142",
73
73
  "ai": "^5.0.44",