npm - @ekairos/dataset - Versions diffs - 1.22.36-beta.development.0 → 1.22.36 - Mend

@ekairos/dataset 1.22.36-beta.development.0 → 1.22.36

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (160) hide show

package/README.md +347 -0
package/dist/agents.d.ts +8 -0
package/dist/agents.js +8 -0
package/dist/builder/agentMaterializers.d.ts +9 -0
package/dist/builder/agentMaterializers.js +10 -0
package/dist/builder/context.d.ts +15 -0
package/dist/builder/context.js +251 -0
package/dist/builder/instructions.d.ts +5 -0
package/dist/builder/instructions.js +40 -0
package/dist/builder/materialize.d.ts +83 -0
package/dist/builder/materialize.js +548 -0
package/dist/builder/materializeQuery.d.ts +12 -0
package/dist/builder/materializeQuery.js +31 -0
package/dist/builder/persistence.d.ts +22 -0
package/dist/builder/persistence.js +153 -0
package/dist/builder/rows.d.ts +7 -0
package/dist/builder/rows.js +56 -0
package/dist/builder/schemaInference.d.ts +3 -0
package/dist/builder/schemaInference.js +61 -0
package/dist/builder/types.d.ts +140 -0
package/dist/builder/types.js +1 -0
package/dist/clearDataset.tool.d.ts +2 -3
package/dist/clearDataset.tool.js +13 -17
package/dist/completeDataset.steps.d.ts +117 -0
package/dist/completeDataset.steps.js +487 -0
package/dist/completeDataset.tool.d.ts +132 -7
package/dist/completeDataset.tool.js +46 -192
package/dist/contextResources.d.ts +31 -0
package/dist/contextResources.js +151 -0
package/dist/contextWorkspace.d.ts +79 -0
package/dist/contextWorkspace.js +234 -0
package/dist/dataset/steps.d.ts +39 -15
package/dist/dataset/steps.js +96 -39
package/dist/dataset.d.ts +3 -67
package/dist/dataset.js +129 -521
package/dist/datasetFiles.d.ts +5 -1
package/dist/datasetFiles.js +29 -27
package/dist/domain.d.ts +1 -2
package/dist/domain.js +1 -6
package/dist/executeCommand.tool.d.ts +2 -30
package/dist/executeCommand.tool.js +165 -39
package/dist/file/file-dataset.agent.d.ts +19 -56
package/dist/file/file-dataset.agent.js +176 -134
package/dist/file/file-dataset.steps.d.ts +27 -0
package/dist/file/file-dataset.steps.js +47 -0
package/dist/file/file-dataset.types.d.ts +64 -0
package/dist/file/file-dataset.types.js +1 -0
package/dist/file/filepreview.d.ts +5 -35
package/dist/file/filepreview.js +60 -107
package/dist/file/filepreview.types.d.ts +31 -0
package/dist/file/filepreview.types.js +1 -0
package/dist/file/generateSchema.tool.d.ts +2 -3
package/dist/file/generateSchema.tool.js +11 -15
package/dist/file/index.d.ts +1 -2
package/dist/file/index.js +1 -18
package/dist/file/prompts.d.ts +2 -3
package/dist/file/prompts.js +134 -27
package/dist/file/scripts.generated.d.ts +1 -0
package/dist/file/scripts.generated.js +11 -0
package/dist/file/steps.d.ts +1 -2
package/dist/file/steps.js +9 -7
package/dist/id.d.ts +1 -0
package/dist/id.js +10 -0
package/dist/index.d.ts +8 -7
package/dist/index.js +8 -23
package/dist/materializeDataset.tool.d.ts +52 -32
package/dist/materializeDataset.tool.js +81 -65
package/dist/query/index.d.ts +1 -2
package/dist/query/index.js +1 -18
package/dist/query/queryDomain.d.ts +3 -4
package/dist/query/queryDomain.js +3 -40
package/dist/query/queryDomain.step.d.ts +1 -1
package/dist/query/queryDomain.step.js +13 -13
package/dist/sandbox/steps.d.ts +23 -15
package/dist/sandbox/steps.js +73 -76
package/dist/sandbox.steps.d.ts +1 -2
package/dist/sandbox.steps.js +1 -18
package/dist/schema.d.ts +13 -13
package/dist/schema.js +25 -37
package/dist/service.d.ts +8 -5
package/dist/service.js +70 -15
package/dist/skill.d.ts +0 -1
package/dist/skill.js +12 -17
package/dist/transform/filepreview.d.ts +2 -3
package/dist/transform/filepreview.js +9 -26
package/dist/transform/index.d.ts +2 -3
package/dist/transform/index.js +2 -8
package/dist/transform/prompts.d.ts +1 -34
package/dist/transform/prompts.js +58 -43
package/dist/transform/transform-dataset.agent.d.ts +20 -45
package/dist/transform/transform-dataset.agent.js +146 -91
package/dist/transform/transform-dataset.steps.d.ts +30 -0
package/dist/transform/transform-dataset.steps.js +61 -0
package/dist/transform/transform-dataset.types.d.ts +95 -0
package/dist/transform/transform-dataset.types.js +1 -0
package/dist/transform/transformDataset.d.ts +3 -3
package/dist/transform/transformDataset.js +15 -18
package/dist/writeDatasetRows.tool.d.ts +188 -0
package/dist/writeDatasetRows.tool.js +258 -0
package/package.json +35 -10
package/dist/clearDataset.tool.d.ts.map +0 -1
package/dist/clearDataset.tool.js.map +0 -1
package/dist/completeDataset.tool.d.ts.map +0 -1
package/dist/completeDataset.tool.js.map +0 -1
package/dist/dataset/steps.d.ts.map +0 -1
package/dist/dataset/steps.js.map +0 -1
package/dist/dataset.d.ts.map +0 -1
package/dist/dataset.js.map +0 -1
package/dist/datasetFiles.d.ts.map +0 -1
package/dist/datasetFiles.js.map +0 -1
package/dist/domain.d.ts.map +0 -1
package/dist/domain.js.map +0 -1
package/dist/eventsReactRuntime.d.ts +0 -22
package/dist/eventsReactRuntime.d.ts.map +0 -1
package/dist/eventsReactRuntime.js +0 -29
package/dist/eventsReactRuntime.js.map +0 -1
package/dist/executeCommand.tool.d.ts.map +0 -1
package/dist/executeCommand.tool.js.map +0 -1
package/dist/file/file-dataset.agent.d.ts.map +0 -1
package/dist/file/file-dataset.agent.js.map +0 -1
package/dist/file/filepreview.d.ts.map +0 -1
package/dist/file/filepreview.js.map +0 -1
package/dist/file/generateSchema.tool.d.ts.map +0 -1
package/dist/file/generateSchema.tool.js.map +0 -1
package/dist/file/index.d.ts.map +0 -1
package/dist/file/index.js.map +0 -1
package/dist/file/prompts.d.ts.map +0 -1
package/dist/file/prompts.js.map +0 -1
package/dist/file/steps.d.ts.map +0 -1
package/dist/file/steps.js.map +0 -1
package/dist/index.d.ts.map +0 -1
package/dist/index.js.map +0 -1
package/dist/materializeDataset.tool.d.ts.map +0 -1
package/dist/materializeDataset.tool.js.map +0 -1
package/dist/query/index.d.ts.map +0 -1
package/dist/query/index.js.map +0 -1
package/dist/query/queryDomain.d.ts.map +0 -1
package/dist/query/queryDomain.js.map +0 -1
package/dist/query/queryDomain.step.d.ts.map +0 -1
package/dist/query/queryDomain.step.js.map +0 -1
package/dist/sandbox/steps.d.ts.map +0 -1
package/dist/sandbox/steps.js.map +0 -1
package/dist/sandbox.steps.d.ts.map +0 -1
package/dist/sandbox.steps.js.map +0 -1
package/dist/schema.d.ts.map +0 -1
package/dist/schema.js.map +0 -1
package/dist/service.d.ts.map +0 -1
package/dist/service.js.map +0 -1
package/dist/skill.d.ts.map +0 -1
package/dist/skill.js.map +0 -1
package/dist/transform/filepreview.d.ts.map +0 -1
package/dist/transform/filepreview.js.map +0 -1
package/dist/transform/index.d.ts.map +0 -1
package/dist/transform/index.js.map +0 -1
package/dist/transform/prompts.d.ts.map +0 -1
package/dist/transform/prompts.js.map +0 -1
package/dist/transform/transform-dataset.agent.d.ts.map +0 -1
package/dist/transform/transform-dataset.agent.js.map +0 -1
package/dist/transform/transformDataset.d.ts.map +0 -1
package/dist/transform/transformDataset.js.map +0 -1

package/dist/file/filepreview.js CHANGED Viewed

@@ -1,30 +1,38 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.ensurePreviewScriptsAvailable = ensurePreviewScriptsAvailable;
-exports.generateFilePreview = generateFilePreview;
-const fs_1 = require("fs");
-const path_1 = require("path");
-const steps_1 = require("../sandbox/steps");
+import { runDatasetSandboxCommandStep } from "../sandbox/steps.js";
+import { PYTHON_SCRIPT_BASE64_BY_NAME } from "./scripts.generated.js";
 const DEFAULT_HEAD_LINES = 50;
 const DEFAULT_TAIL_LINES = 20;
 const DEFAULT_MID_LINES = 20;
-const SANDBOX_SCRIPT_DIRECTORY = "/tmp/ekairos/dataset/file/scripts";
-const PYTHON_SCRIPT_FILES = [
-    "file_metadata.py",
-    "preview_head_csv.py",
-    "preview_head_excel.py",
-    "preview_mid_csv.py",
-    "preview_mid_excel.py",
-    "preview_tail_csv.py",
-    "preview_tail_excel.py",
-];
-function resolveScriptPath(scriptName) {
-    // Prefer local scripts in src/ (tests/dev), and after build the scripts are copied to dist/
-    // at the same relative path, so this works in both environments.
-    return (0, path_1.join)(__dirname, "scripts", scriptName);
+export function getEmbeddedFilePreviewScriptBase64(scriptName) {
+    const embedded = PYTHON_SCRIPT_BASE64_BY_NAME[scriptName];
+    if (!embedded) {
+        throw new Error(`dataset_preview_script_not_embedded:${scriptName}`);
+    }
+    return embedded;
+}
+function readFilePreviewScriptText(scriptName) {
+    return Buffer.from(getEmbeddedFilePreviewScriptBase64(scriptName), "base64").toString("utf-8");
+}
+function sanitizePreviewText(value) {
+    return String(value ?? "")
+        .replace(/\u0000/g, "")
+        .replace(/[\u0001-\u0008\u000B\u000C\u000E-\u001F\u007F]/g, "");
+}
+function getPreviewKind(extension) {
+    const normalized = extension.toLowerCase();
+    if (normalized === ".xlsx" || normalized === ".xls")
+        return "excel";
+    if (normalized === ".csv" ||
+        normalized === ".tsv" ||
+        normalized === ".txt" ||
+        normalized === ".log" ||
+        normalized === ".json" ||
+        normalized === ".jsonl" ||
+        normalized === ".md") {
+        return "text";
+    }
+    return null;
 }
-const preparedSandboxIds = new Set();
-const sandboxSetupPromises = new Map();
 function validateScriptResult(result, context) {
     if (!result.stderr) {
         return;
@@ -37,75 +45,24 @@ function validateScriptResult(result, context) {
         throw new Error(`${context} failed: ${stderr.substring(0, 500)}`);
     }
 }
-async function ensurePreviewScriptsAvailable(env, sandboxId) {
-    if (preparedSandboxIds.has(sandboxId)) {
-        return;
-    }
-    const inFlight = sandboxSetupPromises.get(sandboxId);
-    if (inFlight) {
-        await inFlight;
-        return;
-    }
-    const setupPromise = (async () => {
-        try {
-            await (0, steps_1.runDatasetSandboxCommandStep)({
-                env,
-                sandboxId,
-                cmd: "mkdir",
-                args: ["-p", SANDBOX_SCRIPT_DIRECTORY],
-            });
-        }
-        catch (error) {
-            console.warn("[Dataset Scripts] Failed to create sandbox scripts directory", error);
-        }
-        const filesToWrite = [];
-        for (const scriptName of PYTHON_SCRIPT_FILES) {
-            try {
-                const scriptPath = resolveScriptPath(scriptName);
-                const fileBuffer = (0, fs_1.readFileSync)(scriptPath);
-                filesToWrite.push({
-                    path: `${SANDBOX_SCRIPT_DIRECTORY}/${scriptName}`,
-                    contentBase64: Buffer.from(fileBuffer).toString("base64"),
-                });
-            }
-            catch (error) {
-                console.error(`[Dataset Scripts] Failed to read script ${scriptName}`, error);
-                throw error;
-            }
-        }
-        if (filesToWrite.length > 0) {
-            await (0, steps_1.writeDatasetSandboxFilesStep)({
-                env,
-                sandboxId,
-                files: filesToWrite,
-            });
-        }
-    })();
-    sandboxSetupPromises.set(sandboxId, setupPromise);
-    try {
-        await setupPromise;
-        preparedSandboxIds.add(sandboxId);
-    }
-    catch (error) {
-        sandboxSetupPromises.delete(sandboxId);
-        throw error;
-    }
+export async function ensurePreviewScriptsAvailable(_runtime, _sandboxId) {
+    return;
 }
-async function generateFilePreview(env, sandboxId, sandboxFilePath, datasetId, options = {}) {
+export async function generateFilePreview(runtime, sandboxId, sandboxFilePath, datasetId, options = {}) {
     const context = {
         totalRows: 0,
     };
     try {
-        await ensurePreviewScriptsAvailable(env, sandboxId);
-        const metadataResult = await runScript(env, sandboxId, "file_metadata.py", [sandboxFilePath], "Extracts file metadata: name, extension, size, row count estimate, column count, and header preview");
+        const metadataResult = await runScript(runtime, sandboxId, "file_metadata.py", [sandboxFilePath], "Extracts file metadata: name, extension, size, row count estimate, column count, and header preview");
+        validateScriptResult(metadataResult, `preview_metadata for ${datasetId}`);
         context.metadata = metadataResult;
-        let isExcel = false;
+        let previewKind = null;
         if (metadataResult.stdout) {
             try {
                 const metadataJson = JSON.parse(metadataResult.stdout);
                 context.totalRows = metadataJson.row_count_estimate || 0;
                 const extension = metadataJson.extension || "";
-                isExcel = extension === ".xlsx" || extension === ".xls";
+                previewKind = getPreviewKind(extension);
             }
             catch {
                 console.warn(`[Dataset ${datasetId}] Failed to parse metadata JSON`);
@@ -118,28 +75,32 @@ async function generateFilePreview(env, sandboxId, sandboxFilePath, datasetId, o
             console.log(`[Dataset ${datasetId}] No rows detected, skipping preview`);
             return context;
         }
-        const headScript = isExcel ? "preview_head_excel.py" : "preview_head_csv.py";
-        const tailScript = isExcel ? "preview_tail_excel.py" : "preview_tail_csv.py";
-        const midScript = isExcel ? "preview_mid_excel.py" : "preview_mid_csv.py";
+        if (!previewKind) {
+            console.log(`[Dataset ${datasetId}] Binary or unsupported preview format, keeping metadata only`);
+            return context;
+        }
+        const headScript = previewKind === "excel" ? "preview_head_excel.py" : "preview_head_csv.py";
+        const tailScript = previewKind === "excel" ? "preview_tail_excel.py" : "preview_tail_csv.py";
+        const midScript = previewKind === "excel" ? "preview_mid_excel.py" : "preview_mid_csv.py";
         if (totalRows <= headLines) {
             console.log(`[Dataset ${datasetId}] File has ${totalRows} rows, reading all with head only`);
-            const headResult = await runScript(env, sandboxId, headScript, [sandboxFilePath, String(totalRows)], `Reads the first ${totalRows} rows (entire file)`);
+            const headResult = await runScript(runtime, sandboxId, headScript, [sandboxFilePath, String(totalRows)], `Reads the first ${totalRows} rows (entire file)`);
             validateScriptResult(headResult, `preview_head for ${datasetId}`);
             context.head = headResult;
             return context;
         }
         if (headLines + tailLines >= totalRows) {
             console.log(`[Dataset ${datasetId}] Head + tail would cover entire file (${totalRows} rows), reading all with head only`);
-            const headResult = await runScript(env, sandboxId, headScript, [sandboxFilePath, String(totalRows)], `Reads the first ${totalRows} rows (entire file)`);
+            const headResult = await runScript(runtime, sandboxId, headScript, [sandboxFilePath, String(totalRows)], `Reads the first ${totalRows} rows (entire file)`);
             validateScriptResult(headResult, `preview_head for ${datasetId}`);
             context.head = headResult;
             return context;
         }
         console.log(`[Dataset ${datasetId}] Reading head (${headLines} rows) and tail (${tailLines} rows) from ${totalRows} total rows`);
-        const headResult = await runScript(env, sandboxId, headScript, [sandboxFilePath, String(headLines)], `Reads the first ${headLines} rows of the file`);
+        const headResult = await runScript(runtime, sandboxId, headScript, [sandboxFilePath, String(headLines)], `Reads the first ${headLines} rows of the file`);
         validateScriptResult(headResult, `preview_head for ${datasetId}`);
         context.head = headResult;
-        const tailResult = await runScript(env, sandboxId, tailScript, [sandboxFilePath, String(tailLines)], `Reads the last ${tailLines} rows of the file`);
+        const tailResult = await runScript(runtime, sandboxId, tailScript, [sandboxFilePath, String(tailLines)], `Reads the last ${tailLines} rows of the file`);
         validateScriptResult(tailResult, `preview_tail for ${datasetId}`);
         context.tail = tailResult;
         const midLines = options.midLines || DEFAULT_MID_LINES;
@@ -148,40 +109,33 @@ async function generateFilePreview(env, sandboxId, sandboxFilePath, datasetId, o
             const midStart = headLines;
             const midEnd = totalRows - tailLines;
             console.log(`[Dataset ${datasetId}] Large gap (${gapSize} rows), adding mid sample (${midLines} rows)`);
-            const midResult = await runScript(env, sandboxId, midScript, [sandboxFilePath, String(midStart), String(midEnd), String(midLines)], `Samples ${midLines} rows from the middle section (rows ${midStart + 1} to ${midEnd})`);
+            const midResult = await runScript(runtime, sandboxId, midScript, [sandboxFilePath, String(midStart), String(midEnd), String(midLines)], `Samples ${midLines} rows from the middle section (rows ${midStart + 1} to ${midEnd})`);
             validateScriptResult(midResult, `preview_mid for ${datasetId}`);
             context.mid = midResult;
         }
     }
     catch (error) {
         console.error(`[Dataset ${datasetId}] Error generating file preview:`, error);
+        throw error;
     }
     return context;
 }
-async function runScript(env, sandboxId, scriptName, args, description) {
-    const scriptPath = `/vercel/sandbox/lib/domain/dataset/file/scripts/${scriptName}`;
-    const command = `python ${scriptPath} ${args.join(" ")}`;
-    let scriptContent = "";
-    try {
-        const localScriptPath = resolveScriptPath(scriptName);
-        scriptContent = (0, fs_1.readFileSync)(localScriptPath, 'utf-8');
-    }
-    catch (error) {
-        console.warn(`Failed to read script ${scriptName}:`, error);
-    }
+async function runScript(runtime, sandboxId, scriptName, args, description) {
+    const scriptContent = readFilePreviewScriptText(scriptName);
+    const command = `python -c <${scriptName}> ${args.join(" ")}`;
     try {
-        const result = await (0, steps_1.runDatasetSandboxCommandStep)({
-            env,
+        const result = await runDatasetSandboxCommandStep({
+            runtime,
             sandboxId,
             cmd: "python",
-            args: [scriptPath, ...args],
+            args: ["-c", scriptContent, ...args],
         });
         return {
             description,
             script: scriptContent,
             command,
-            stdout: result.stdout || "",
-            stderr: result.stderr || "",
+            stdout: sanitizePreviewText(result.stdout),
+            stderr: sanitizePreviewText(result.stderr),
         };
     }
     catch (error) {
@@ -190,8 +144,7 @@ async function runScript(env, sandboxId, scriptName, args, description) {
             script: scriptContent,
             command,
             stdout: "",
-            stderr: error instanceof Error ? error.message : String(error),
+            stderr: sanitizePreviewText(error instanceof Error ? error.message : String(error)),
         };
     }
 }
-//# sourceMappingURL=filepreview.js.map

package/dist/file/filepreview.types.d.ts ADDED Viewed

@@ -0,0 +1,31 @@
+export type FilePreviewContext = {
+    totalRows: number;
+    metadata?: {
+        description: string;
+        script: string;
+        command: string;
+        stdout: string;
+        stderr: string;
+    };
+    head?: {
+        description: string;
+        script: string;
+        command: string;
+        stdout: string;
+        stderr: string;
+    };
+    tail?: {
+        description: string;
+        script: string;
+        command: string;
+        stdout: string;
+        stderr: string;
+    };
+    mid?: {
+        description: string;
+        script: string;
+        command: string;
+        stdout: string;
+        stderr: string;
+    };
+};

package/dist/file/filepreview.types.js ADDED Viewed

@@ -0,0 +1 @@
+export {};

package/dist/file/generateSchema.tool.d.ts CHANGED Viewed

@@ -2,9 +2,9 @@ interface GenerateSchemaToolParams {
     datasetId: string;
     isNested?: boolean;
     fileId?: string;
-    env: any;
+    runtime: any;
 }
-export declare function createGenerateSchemaTool({ datasetId, isNested, fileId, env }: GenerateSchemaToolParams): import("ai").Tool<{
+export declare function createGenerateSchemaTool({ datasetId, isNested, fileId, runtime }: GenerateSchemaToolParams): import("ai").Tool<{
     schemaTitle: string;
     schemaDescription: string;
     schemaJson: string;
@@ -25,4 +25,3 @@ export declare function createGenerateSchemaTool({ datasetId, isNested, fileId,
     error?: undefined;
 }>;
 export {};
-//# sourceMappingURL=generateSchema.tool.d.ts.map

package/dist/file/generateSchema.tool.js CHANGED Viewed

@@ -1,20 +1,17 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.createGenerateSchemaTool = createGenerateSchemaTool;
-const ai_1 = require("ai");
-const zod_1 = require("zod");
-const steps_1 = require("../dataset/steps");
-function createGenerateSchemaTool({ datasetId, isNested, fileId, env }) {
-    return (0, ai_1.tool)({
+import { tool } from "ai";
+import { z } from "zod";
+import { datasetUpdateSchemaStep } from "../dataset/steps.js";
+export function createGenerateSchemaTool({ datasetId, isNested, fileId, runtime }) {
+    return tool({
         description: `Generate a formal JSON schema for a SINGLE RECORD (row) from the file. This schema describes the structure of ONE record, not the entire dataset or array of records. Requirements:
 1. Schema describes ONE RECORD structure only (no array wrappers)
 2. All property names MUST use lowercaseCamelCase convention (e.g., 'productName', 'unitPrice')
 3. Each property MUST have a description field
 4. The schema description must explain what one record represents and field mappings from original file`,
-        inputSchema: zod_1.z.object({
-            schemaTitle: zod_1.z.string().describe("Title for the RECORD schema in PascalCase (e.g., 'ProductRecord', 'TransactionRecord')"),
-            schemaDescription: zod_1.z.string().describe("Comprehensive description that includes: 1) what ONE record represents, 2) its purpose, 3) complete field mapping from original file fields to schema fields with explanations (e.g., 'ARTÍCULO' -> 'articleCode': normalized to camelCase)"),
-            schemaJson: zod_1.z.string().describe("Complete JSON schema as string describing ONE RECORD. Must be type 'object' with properties. All properties must be in lowercaseCamelCase and have descriptions. Do NOT use type 'array' at root level."),
+        inputSchema: z.object({
+            schemaTitle: z.string().describe("Title for the RECORD schema in PascalCase (e.g., 'ProductRecord', 'TransactionRecord')"),
+            schemaDescription: z.string().describe("Comprehensive description that includes: 1) what ONE record represents, 2) its purpose, 3) complete field mapping from original file fields to schema fields with explanations (e.g., 'ARTÍCULO' -> 'articleCode': normalized to camelCase)"),
+            schemaJson: z.string().describe("Complete JSON schema as string describing ONE RECORD. Must be type 'object' with properties. All properties must be in lowercaseCamelCase and have descriptions. Do NOT use type 'array' at root level."),
         }),
         execute: async ({ schemaTitle, schemaDescription, schemaJson, }) => {
             console.log(`[Dataset ${datasetId}] ========================================`);
@@ -74,8 +71,8 @@ function createGenerateSchemaTool({ datasetId, isNested, fileId, env }) {
                 console.log(`[Dataset ${datasetId}] Description: ${schemaDescription}`);
                 console.log(`[Dataset ${datasetId}] Schema JSON:`);
                 console.log(JSON.stringify(parsedSchema, null, 2));
-                const updateResult = await (0, steps_1.datasetUpdateSchemaStep)({
-                    env,
+                const updateResult = await datasetUpdateSchemaStep({
+                    runtime,
                     datasetId,
                     schema: schemaData,
                     status: "schema_complete",
@@ -107,4 +104,3 @@ function createGenerateSchemaTool({ datasetId, isNested, fileId, env }) {
         },
     });
 }
-//# sourceMappingURL=generateSchema.tool.js.map

package/dist/file/index.d.ts CHANGED Viewed

@@ -1,2 +1 @@
-export * from "./file-dataset.agent";
-//# sourceMappingURL=index.d.ts.map
+export * from "./file-dataset.agent.js";

package/dist/file/index.js CHANGED Viewed

@@ -1,18 +1 @@
-"use strict";
-var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
-    if (k2 === undefined) k2 = k;
-    var desc = Object.getOwnPropertyDescriptor(m, k);
-    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
-      desc = { enumerable: true, get: function() { return m[k]; } };
-    }
-    Object.defineProperty(o, k2, desc);
-}) : (function(o, m, k, k2) {
-    if (k2 === undefined) k2 = k;
-    o[k2] = m[k];
-}));
-var __exportStar = (this && this.__exportStar) || function(m, exports) {
-    for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-__exportStar(require("./file-dataset.agent"), exports);
-//# sourceMappingURL=index.js.map
+export * from "./file-dataset.agent.js";

package/dist/file/prompts.d.ts CHANGED Viewed

@@ -1,3 +1,2 @@
-import { FileParseStoryContext } from "./file-dataset.agent";
-export declare function buildFileDatasetPrompt(context: FileParseStoryContext): string;
-//# sourceMappingURL=prompts.d.ts.map
+import type { FileParseContext } from "./file-dataset.types.js";
+export declare function buildFileDatasetPrompt(context: FileParseContext): string;

package/dist/file/prompts.js CHANGED Viewed

@@ -1,26 +1,23 @@
-"use strict";
 // Plain build API using template literals and XML
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.buildFileDatasetPrompt = buildFileDatasetPrompt;
-const xmlbuilder2_1 = require("xmlbuilder2");
-const datasetFiles_1 = require("../datasetFiles");
+import { create } from "xmlbuilder2";
+import { getDatasetWorkstation, getDatasetOutputPath } from "../datasetFiles.js";
 function buildRole() {
-    let xml = (0, xmlbuilder2_1.create)()
+    let xml = create()
         .ele("Role")
         .txt("You are a dataset creator for a SINGLE file. Your goal is to convert the file content into a validated JSONL dataset where each line represents one record.")
         .up();
     return xml.end({ prettyPrint: true, headless: true });
 }
 function buildGoal() {
-    let xml = (0, xmlbuilder2_1.create)()
+    let xml = create()
         .ele("Goal")
-        .txt("Convert the source file into a validated JSONL dataset (output.jsonl) where each line is a JSON object conforming to a generated schema. The schema describes ONE data record structure. Extract ONLY data records; exclude any header sections, metadata, or summary information from the file.")
+        .txt("Convert the input file into a validated JSONL dataset (output.jsonl) where each line is a JSON object conforming to a generated schema. The schema describes ONE data record structure. Extract ONLY data records; exclude any header sections, metadata, or summary information from the file.")
         .up();
     return xml.end({ prettyPrint: true, headless: true });
 }
-function buildSourceInfo(context) {
-    let xml = (0, xmlbuilder2_1.create)()
-        .ele("Source")
+function buildResourceInfo(context) {
+    let xml = create()
+        .ele("FileResource")
         .ele("Type").txt("file").up()
         .ele("FileId").txt(context.fileId).up()
         .ele("DatasetId").txt(context.datasetId).up()
@@ -29,7 +26,7 @@ function buildSourceInfo(context) {
     return xml;
 }
 function buildFilePreviewSection(preview) {
-    let xml = (0, xmlbuilder2_1.create)()
+    let xml = create()
         .ele("FilePreview")
         .ele("TotalRows").txt(String(preview.totalRows)).up();
     if (preview.metadata) {
@@ -91,8 +88,9 @@ function buildErrorsSection(errors) {
     if (errors.length === 0) {
         return null;
     }
-    let xml = (0, xmlbuilder2_1.create)()
-        .ele("PreviousErrors");
+    let xml = create()
+        .ele("PreviousErrors")
+        .ele("Instruction").txt("Treat these as repair feedback from the previous validation attempt. Rewrite output.jsonl from the schema contract; do not patch input column names into schema keys piecemeal.").up();
     for (const error of errors) {
         xml = xml.ele("Error").txt(error).up();
     }
@@ -100,10 +98,10 @@ function buildErrorsSection(errors) {
     return xml;
 }
 function buildContextSection(context) {
-    let xml = (0, xmlbuilder2_1.create)()
+    let xml = create()
         .ele("Context");
-    const sourceXml = buildSourceInfo(context);
-    xml = xml.import(sourceXml.first());
+    const resourceXml = buildResourceInfo(context);
+    xml = xml.import(resourceXml.first());
     if (context.filePreview) {
         const previewXml = buildFilePreviewSection(context.filePreview);
         xml = xml.import(previewXml.first());
@@ -117,27 +115,123 @@ function buildContextSection(context) {
     xml = xml.up();
     return xml.end({ prettyPrint: true, headless: true });
 }
+function asRecord(value) {
+    return value && typeof value === "object" && !Array.isArray(value)
+        ? value
+        : null;
+}
+function getSchemaObject(context) {
+    return asRecord(context.schema?.schema);
+}
+function joinSchemaPath(basePath, key) {
+    return basePath === "$" ? `$.${key}` : `${basePath}.${key}`;
+}
+function collectSchemaContract(schema, path = "$", contract = {
+    requiredPaths: [],
+    propertyPaths: [],
+    enumConstraints: [],
+    closedObjectPaths: [],
+}) {
+    const record = asRecord(schema);
+    if (!record) {
+        return contract;
+    }
+    if (Array.isArray(record.enum)) {
+        contract.enumConstraints.push({
+            path,
+            values: record.enum.map((value) => JSON.stringify(value)),
+        });
+    }
+    const properties = asRecord(record.properties);
+    if (properties) {
+        if (record.additionalProperties === false) {
+            contract.closedObjectPaths.push(path);
+        }
+        const required = Array.isArray(record.required)
+            ? record.required.filter((value) => typeof value === "string")
+            : [];
+        for (const key of required) {
+            contract.requiredPaths.push(joinSchemaPath(path, key));
+        }
+        for (const [key, childSchema] of Object.entries(properties)) {
+            const childPath = joinSchemaPath(path, key);
+            contract.propertyPaths.push(childPath);
+            collectSchemaContract(childSchema, childPath, contract);
+        }
+    }
+    if (record.items) {
+        collectSchemaContract(record.items, `${path}[]`, contract);
+    }
+    for (const keyword of ["oneOf", "anyOf", "allOf"]) {
+        if (Array.isArray(record[keyword])) {
+            for (const childSchema of record[keyword]) {
+                collectSchemaContract(childSchema, path, contract);
+            }
+        }
+    }
+    return contract;
+}
+function appendLimitedList(xml, elementName, itemName, values, maxItems) {
+    let node = xml.ele(elementName);
+    for (const value of values.slice(0, maxItems)) {
+        node = node.ele(itemName).txt(value).up();
+    }
+    if (values.length > maxItems) {
+        node = node.ele("Truncated").txt(String(values.length - maxItems)).up();
+    }
+    return node.up();
+}
 function buildSchemaSection(context) {
-    if (!context.schema) {
+    const schema = getSchemaObject(context);
+    if (!context.schema || !schema) {
         return "";
     }
-    let xml = (0, xmlbuilder2_1.create)()
+    const contract = collectSchemaContract(schema);
+    let xml = create()
         .com("Schema section: This defines the structure of ONE RECORD (row). Each line in the JSONL output must conform to this schema.")
         .ele("Schema")
         .ele("Title").txt(context.schema.title || "").up()
-        .ele("Description").txt(context.schema.description || "").up()
-        .ele("JsonSchema").txt(JSON.stringify(context.schema.schema, null, 2)).up()
+        .ele("Description").txt(context.schema.description || "").up();
+    xml = xml
+        .ele("SchemaContract")
+        .ele("Purpose").txt("Compact output contract derived from JSON Schema. Use this before writing output.jsonl.").up()
+        .ele("Rule").txt("Use only schema property keys in data objects. Input headers are input labels, not output keys.").up()
+        .ele("Rule").txt("Required paths are required everywhere, including nested objects and array items.").up()
+        .ele("Rule").txt("Enum fields must use exactly one of the listed literal values. Normalize input labels to the closest valid enum literal; never emit a value outside the enum.").up();
+    xml = appendLimitedList(xml, "RequiredPaths", "Path", contract.requiredPaths, 120);
+    xml = appendLimitedList(xml, "PropertyPaths", "Path", contract.propertyPaths, 160);
+    let enumsXml = xml.ele("EnumConstraints");
+    for (const constraint of contract.enumConstraints.slice(0, 80)) {
+        let enumXml = enumsXml.ele("Enum", { path: constraint.path });
+        for (const value of constraint.values.slice(0, 80)) {
+            enumXml = enumXml.ele("Value").txt(value).up();
+        }
+        if (constraint.values.length > 80) {
+            enumXml = enumXml.ele("Truncated").txt(String(constraint.values.length - 80)).up();
+        }
+        enumsXml = enumXml.up();
+    }
+    if (contract.enumConstraints.length > 80) {
+        enumsXml = enumsXml.ele("Truncated").txt(String(contract.enumConstraints.length - 80)).up();
+    }
+    xml = enumsXml.up();
+    xml = appendLimitedList(xml, "ClosedObjectPaths", "Path", contract.closedObjectPaths, 80);
+    xml = xml
+        .up()
+        .ele("JsonSchema").txt(JSON.stringify(schema, null, 2)).up()
         .up();
     return xml.end({ prettyPrint: true, headless: true });
 }
 function buildInstructions(context) {
-    const datasetWorkstation = (0, datasetFiles_1.getDatasetWorkstation)(context.datasetId);
-    const outputPath = (0, datasetFiles_1.getDatasetOutputPath)(context.datasetId);
+    const datasetWorkstation = context.sandboxConfig.scriptsDir
+        ? context.sandboxConfig.scriptsDir.replace(/\/scripts$/, "")
+        : getDatasetWorkstation(context.datasetId);
+    const outputPath = context.sandboxConfig.outputPath ?? getDatasetOutputPath(context.datasetId);
     const hasProvidedSchema = Boolean(context.schema?.schema);
     const currentTask = hasProvidedSchema
         ? "Review FilePreview section, use the provided schema as the output contract, then parse the file and generate the dataset"
         : "Review FilePreview section to understand file structure, then generate JSON Schema for a SINGLE RECORD, then parse the file and generate the dataset";
-    let xml = (0, xmlbuilder2_1.create)()
+    let xml = create()
         .ele("Instructions")
         .ele("Workflow")
         .ele("Step", { number: "1", name: "Inspect File" })
@@ -150,6 +244,11 @@ function buildInstructions(context) {
             .ele("Action").txt("Use the provided schema as the output contract for every row in output.jsonl").up()
             .ele("Requirements")
             .ele("Requirement").txt("Every output row must conform exactly to the provided schema").up()
+            .ele("Requirement").txt("Every data object MUST use the exact property names from the provided JSON Schema required/properties keys").up()
+            .ele("Requirement").txt("Build a schema-first mapping from input columns to schema fields before writing output.jsonl. Do not use raw input headers as JSON keys unless they are exactly schema keys").up()
+            .ele("Requirement").txt("For nested required fields, populate the required child keys inside each nested object or array item; top-level validity is not enough").up()
+            .ele("Requirement").txt("For enum fields, emit exactly one allowed enum literal from SchemaContract; normalize labels or abbreviations into allowed literals").up()
+            .ele("Requirement").txt("Do not translate, localize, rename, camelize differently, or infer alternative field names. Field names are a technical contract; only field values may preserve the input language").up()
             .ele("Requirement").txt("Do not call generateSchema when a schema is already provided").up()
             .up()
             .up();
@@ -173,6 +272,8 @@ function buildInstructions(context) {
         .ele("Requirements")
         .ele("Requirement").txt("Parse ALL data rows/records from the file (exclude header sections and metadata)").up()
         .ele("Requirement").txt("Output JSONL format: each line is {\"type\": \"row\", \"data\": {...record...}}").up()
+        .ele("Requirement").txt("When a schema is provided, each data object must contain the exact required schema keys and must not use translated or synonymous keys").up()
+        .ele("Requirement").txt("When validation returns zero valid rows, treat the previous output as structurally wrong and rewrite output.jsonl from the SchemaContract, not by applying small patches").up()
         .ele("Requirement").txt("Extract ONLY data records; skip any header lines, summary sections, or file metadata").up()
         .ele("Requirement").txt(`Save output to: ${outputPath}`).up()
         .ele("Requirement").txt("Use descriptive scriptName in snake_case (e.g., 'parse_csv_to_jsonl')").up()
@@ -180,11 +281,13 @@ function buildInstructions(context) {
         .up()
         .ele("Step", { number: "4", name: "Complete and Validate" })
         .ele("Action").txt("Call completeDataset to validate the dataset").up()
-        .ele("Behavior").txt("Validates that output.jsonl exists and all records conform to the schema stored in database. Returns error details if validation fails.").up()
+        .ele("Behavior").txt("Validates that output.jsonl exists and all records conform to the schema stored in database. Returns success:false with validation details if validation fails. If validation fails, inspect validation errors, rewrite output.jsonl, and call completeDataset again. Do not stop until completeDataset returns success:true.").up()
         .up()
         .up()
         .ele("Rules")
         .ele("Rule").txt("Schema defines ONE DATA RECORD structure (not array, not header)").up()
+        .ele("Rule").txt("Schema property names are authoritative. Never translate or rename keys such as itemName, quantity, or unit into the input language").up()
+        .ele("Rule").txt("Original/input language applies to extracted values only, not to JSON object keys").up()
         .ele("Rule").txt("Datasets contain ONLY data records; exclude all header sections and file metadata").up()
         .ele("Rule").txt("JSONL format: each line = separate JSON object representing one data record").up()
         .ele("Rule").txt("FilePreview shows raw file content - use Script to understand data extraction").up()
@@ -197,7 +300,7 @@ function buildInstructions(context) {
         .up();
     return xml.end({ prettyPrint: true, headless: true });
 }
-function buildFileDatasetPrompt(context) {
+export function buildFileDatasetPrompt(context) {
     const sections = [];
     sections.push(buildRole());
     sections.push("");
@@ -205,7 +308,11 @@ function buildFileDatasetPrompt(context) {
     sections.push("");
     sections.push(buildContextSection(context));
     sections.push("");
+    const schemaSection = buildSchemaSection(context);
+    if (schemaSection) {
+        sections.push(schemaSection);
+        sections.push("");
+    }
     sections.push(buildInstructions(context));
     return sections.join("\n");
 }
-//# sourceMappingURL=prompts.js.map

package/dist/file/scripts.generated.d.ts ADDED Viewed

	@@ -0,0 +1 @@
1	+ export declare const PYTHON_SCRIPT_BASE64_BY_NAME: Readonly<Record<string, string>>;