@ekairos/dataset 1.22.36-beta.development.0 → 1.22.36

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160) hide show
  1. package/README.md +347 -0
  2. package/dist/agents.d.ts +8 -0
  3. package/dist/agents.js +8 -0
  4. package/dist/builder/agentMaterializers.d.ts +9 -0
  5. package/dist/builder/agentMaterializers.js +10 -0
  6. package/dist/builder/context.d.ts +15 -0
  7. package/dist/builder/context.js +251 -0
  8. package/dist/builder/instructions.d.ts +5 -0
  9. package/dist/builder/instructions.js +40 -0
  10. package/dist/builder/materialize.d.ts +83 -0
  11. package/dist/builder/materialize.js +548 -0
  12. package/dist/builder/materializeQuery.d.ts +12 -0
  13. package/dist/builder/materializeQuery.js +31 -0
  14. package/dist/builder/persistence.d.ts +22 -0
  15. package/dist/builder/persistence.js +153 -0
  16. package/dist/builder/rows.d.ts +7 -0
  17. package/dist/builder/rows.js +56 -0
  18. package/dist/builder/schemaInference.d.ts +3 -0
  19. package/dist/builder/schemaInference.js +61 -0
  20. package/dist/builder/types.d.ts +140 -0
  21. package/dist/builder/types.js +1 -0
  22. package/dist/clearDataset.tool.d.ts +2 -3
  23. package/dist/clearDataset.tool.js +13 -17
  24. package/dist/completeDataset.steps.d.ts +117 -0
  25. package/dist/completeDataset.steps.js +487 -0
  26. package/dist/completeDataset.tool.d.ts +132 -7
  27. package/dist/completeDataset.tool.js +46 -192
  28. package/dist/contextResources.d.ts +31 -0
  29. package/dist/contextResources.js +151 -0
  30. package/dist/contextWorkspace.d.ts +79 -0
  31. package/dist/contextWorkspace.js +234 -0
  32. package/dist/dataset/steps.d.ts +39 -15
  33. package/dist/dataset/steps.js +96 -39
  34. package/dist/dataset.d.ts +3 -67
  35. package/dist/dataset.js +129 -521
  36. package/dist/datasetFiles.d.ts +5 -1
  37. package/dist/datasetFiles.js +29 -27
  38. package/dist/domain.d.ts +1 -2
  39. package/dist/domain.js +1 -6
  40. package/dist/executeCommand.tool.d.ts +2 -30
  41. package/dist/executeCommand.tool.js +165 -39
  42. package/dist/file/file-dataset.agent.d.ts +19 -56
  43. package/dist/file/file-dataset.agent.js +176 -134
  44. package/dist/file/file-dataset.steps.d.ts +27 -0
  45. package/dist/file/file-dataset.steps.js +47 -0
  46. package/dist/file/file-dataset.types.d.ts +64 -0
  47. package/dist/file/file-dataset.types.js +1 -0
  48. package/dist/file/filepreview.d.ts +5 -35
  49. package/dist/file/filepreview.js +60 -107
  50. package/dist/file/filepreview.types.d.ts +31 -0
  51. package/dist/file/filepreview.types.js +1 -0
  52. package/dist/file/generateSchema.tool.d.ts +2 -3
  53. package/dist/file/generateSchema.tool.js +11 -15
  54. package/dist/file/index.d.ts +1 -2
  55. package/dist/file/index.js +1 -18
  56. package/dist/file/prompts.d.ts +2 -3
  57. package/dist/file/prompts.js +134 -27
  58. package/dist/file/scripts.generated.d.ts +1 -0
  59. package/dist/file/scripts.generated.js +11 -0
  60. package/dist/file/steps.d.ts +1 -2
  61. package/dist/file/steps.js +9 -7
  62. package/dist/id.d.ts +1 -0
  63. package/dist/id.js +10 -0
  64. package/dist/index.d.ts +8 -7
  65. package/dist/index.js +8 -23
  66. package/dist/materializeDataset.tool.d.ts +52 -32
  67. package/dist/materializeDataset.tool.js +81 -65
  68. package/dist/query/index.d.ts +1 -2
  69. package/dist/query/index.js +1 -18
  70. package/dist/query/queryDomain.d.ts +3 -4
  71. package/dist/query/queryDomain.js +3 -40
  72. package/dist/query/queryDomain.step.d.ts +1 -1
  73. package/dist/query/queryDomain.step.js +13 -13
  74. package/dist/sandbox/steps.d.ts +23 -15
  75. package/dist/sandbox/steps.js +73 -76
  76. package/dist/sandbox.steps.d.ts +1 -2
  77. package/dist/sandbox.steps.js +1 -18
  78. package/dist/schema.d.ts +13 -13
  79. package/dist/schema.js +25 -37
  80. package/dist/service.d.ts +8 -5
  81. package/dist/service.js +70 -15
  82. package/dist/skill.d.ts +0 -1
  83. package/dist/skill.js +12 -17
  84. package/dist/transform/filepreview.d.ts +2 -3
  85. package/dist/transform/filepreview.js +9 -26
  86. package/dist/transform/index.d.ts +2 -3
  87. package/dist/transform/index.js +2 -8
  88. package/dist/transform/prompts.d.ts +1 -34
  89. package/dist/transform/prompts.js +58 -43
  90. package/dist/transform/transform-dataset.agent.d.ts +20 -45
  91. package/dist/transform/transform-dataset.agent.js +146 -91
  92. package/dist/transform/transform-dataset.steps.d.ts +30 -0
  93. package/dist/transform/transform-dataset.steps.js +61 -0
  94. package/dist/transform/transform-dataset.types.d.ts +95 -0
  95. package/dist/transform/transform-dataset.types.js +1 -0
  96. package/dist/transform/transformDataset.d.ts +3 -3
  97. package/dist/transform/transformDataset.js +15 -18
  98. package/dist/writeDatasetRows.tool.d.ts +188 -0
  99. package/dist/writeDatasetRows.tool.js +258 -0
  100. package/package.json +35 -10
  101. package/dist/clearDataset.tool.d.ts.map +0 -1
  102. package/dist/clearDataset.tool.js.map +0 -1
  103. package/dist/completeDataset.tool.d.ts.map +0 -1
  104. package/dist/completeDataset.tool.js.map +0 -1
  105. package/dist/dataset/steps.d.ts.map +0 -1
  106. package/dist/dataset/steps.js.map +0 -1
  107. package/dist/dataset.d.ts.map +0 -1
  108. package/dist/dataset.js.map +0 -1
  109. package/dist/datasetFiles.d.ts.map +0 -1
  110. package/dist/datasetFiles.js.map +0 -1
  111. package/dist/domain.d.ts.map +0 -1
  112. package/dist/domain.js.map +0 -1
  113. package/dist/eventsReactRuntime.d.ts +0 -22
  114. package/dist/eventsReactRuntime.d.ts.map +0 -1
  115. package/dist/eventsReactRuntime.js +0 -29
  116. package/dist/eventsReactRuntime.js.map +0 -1
  117. package/dist/executeCommand.tool.d.ts.map +0 -1
  118. package/dist/executeCommand.tool.js.map +0 -1
  119. package/dist/file/file-dataset.agent.d.ts.map +0 -1
  120. package/dist/file/file-dataset.agent.js.map +0 -1
  121. package/dist/file/filepreview.d.ts.map +0 -1
  122. package/dist/file/filepreview.js.map +0 -1
  123. package/dist/file/generateSchema.tool.d.ts.map +0 -1
  124. package/dist/file/generateSchema.tool.js.map +0 -1
  125. package/dist/file/index.d.ts.map +0 -1
  126. package/dist/file/index.js.map +0 -1
  127. package/dist/file/prompts.d.ts.map +0 -1
  128. package/dist/file/prompts.js.map +0 -1
  129. package/dist/file/steps.d.ts.map +0 -1
  130. package/dist/file/steps.js.map +0 -1
  131. package/dist/index.d.ts.map +0 -1
  132. package/dist/index.js.map +0 -1
  133. package/dist/materializeDataset.tool.d.ts.map +0 -1
  134. package/dist/materializeDataset.tool.js.map +0 -1
  135. package/dist/query/index.d.ts.map +0 -1
  136. package/dist/query/index.js.map +0 -1
  137. package/dist/query/queryDomain.d.ts.map +0 -1
  138. package/dist/query/queryDomain.js.map +0 -1
  139. package/dist/query/queryDomain.step.d.ts.map +0 -1
  140. package/dist/query/queryDomain.step.js.map +0 -1
  141. package/dist/sandbox/steps.d.ts.map +0 -1
  142. package/dist/sandbox/steps.js.map +0 -1
  143. package/dist/sandbox.steps.d.ts.map +0 -1
  144. package/dist/sandbox.steps.js.map +0 -1
  145. package/dist/schema.d.ts.map +0 -1
  146. package/dist/schema.js.map +0 -1
  147. package/dist/service.d.ts.map +0 -1
  148. package/dist/service.js.map +0 -1
  149. package/dist/skill.d.ts.map +0 -1
  150. package/dist/skill.js.map +0 -1
  151. package/dist/transform/filepreview.d.ts.map +0 -1
  152. package/dist/transform/filepreview.js.map +0 -1
  153. package/dist/transform/index.d.ts.map +0 -1
  154. package/dist/transform/index.js.map +0 -1
  155. package/dist/transform/prompts.d.ts.map +0 -1
  156. package/dist/transform/prompts.js.map +0 -1
  157. package/dist/transform/transform-dataset.agent.d.ts.map +0 -1
  158. package/dist/transform/transform-dataset.agent.js.map +0 -1
  159. package/dist/transform/transformDataset.d.ts.map +0 -1
  160. package/dist/transform/transformDataset.js.map +0 -1
@@ -1,30 +1,38 @@
1
- "use strict";
2
- Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.ensurePreviewScriptsAvailable = ensurePreviewScriptsAvailable;
4
- exports.generateFilePreview = generateFilePreview;
5
- const fs_1 = require("fs");
6
- const path_1 = require("path");
7
- const steps_1 = require("../sandbox/steps");
1
+ import { runDatasetSandboxCommandStep } from "../sandbox/steps.js";
2
+ import { PYTHON_SCRIPT_BASE64_BY_NAME } from "./scripts.generated.js";
8
3
  const DEFAULT_HEAD_LINES = 50;
9
4
  const DEFAULT_TAIL_LINES = 20;
10
5
  const DEFAULT_MID_LINES = 20;
11
- const SANDBOX_SCRIPT_DIRECTORY = "/tmp/ekairos/dataset/file/scripts";
12
- const PYTHON_SCRIPT_FILES = [
13
- "file_metadata.py",
14
- "preview_head_csv.py",
15
- "preview_head_excel.py",
16
- "preview_mid_csv.py",
17
- "preview_mid_excel.py",
18
- "preview_tail_csv.py",
19
- "preview_tail_excel.py",
20
- ];
21
- function resolveScriptPath(scriptName) {
22
- // Prefer local scripts in src/ (tests/dev), and after build the scripts are copied to dist/
23
- // at the same relative path, so this works in both environments.
24
- return (0, path_1.join)(__dirname, "scripts", scriptName);
6
+ export function getEmbeddedFilePreviewScriptBase64(scriptName) {
7
+ const embedded = PYTHON_SCRIPT_BASE64_BY_NAME[scriptName];
8
+ if (!embedded) {
9
+ throw new Error(`dataset_preview_script_not_embedded:${scriptName}`);
10
+ }
11
+ return embedded;
12
+ }
13
+ function readFilePreviewScriptText(scriptName) {
14
+ return Buffer.from(getEmbeddedFilePreviewScriptBase64(scriptName), "base64").toString("utf-8");
15
+ }
16
+ function sanitizePreviewText(value) {
17
+ return String(value ?? "")
18
+ .replace(/\u0000/g, "")
19
+ .replace(/[\u0001-\u0008\u000B\u000C\u000E-\u001F\u007F]/g, "");
20
+ }
21
+ function getPreviewKind(extension) {
22
+ const normalized = extension.toLowerCase();
23
+ if (normalized === ".xlsx" || normalized === ".xls")
24
+ return "excel";
25
+ if (normalized === ".csv" ||
26
+ normalized === ".tsv" ||
27
+ normalized === ".txt" ||
28
+ normalized === ".log" ||
29
+ normalized === ".json" ||
30
+ normalized === ".jsonl" ||
31
+ normalized === ".md") {
32
+ return "text";
33
+ }
34
+ return null;
25
35
  }
26
- const preparedSandboxIds = new Set();
27
- const sandboxSetupPromises = new Map();
28
36
  function validateScriptResult(result, context) {
29
37
  if (!result.stderr) {
30
38
  return;
@@ -37,75 +45,24 @@ function validateScriptResult(result, context) {
37
45
  throw new Error(`${context} failed: ${stderr.substring(0, 500)}`);
38
46
  }
39
47
  }
40
- async function ensurePreviewScriptsAvailable(env, sandboxId) {
41
- if (preparedSandboxIds.has(sandboxId)) {
42
- return;
43
- }
44
- const inFlight = sandboxSetupPromises.get(sandboxId);
45
- if (inFlight) {
46
- await inFlight;
47
- return;
48
- }
49
- const setupPromise = (async () => {
50
- try {
51
- await (0, steps_1.runDatasetSandboxCommandStep)({
52
- env,
53
- sandboxId,
54
- cmd: "mkdir",
55
- args: ["-p", SANDBOX_SCRIPT_DIRECTORY],
56
- });
57
- }
58
- catch (error) {
59
- console.warn("[Dataset Scripts] Failed to create sandbox scripts directory", error);
60
- }
61
- const filesToWrite = [];
62
- for (const scriptName of PYTHON_SCRIPT_FILES) {
63
- try {
64
- const scriptPath = resolveScriptPath(scriptName);
65
- const fileBuffer = (0, fs_1.readFileSync)(scriptPath);
66
- filesToWrite.push({
67
- path: `${SANDBOX_SCRIPT_DIRECTORY}/${scriptName}`,
68
- contentBase64: Buffer.from(fileBuffer).toString("base64"),
69
- });
70
- }
71
- catch (error) {
72
- console.error(`[Dataset Scripts] Failed to read script ${scriptName}`, error);
73
- throw error;
74
- }
75
- }
76
- if (filesToWrite.length > 0) {
77
- await (0, steps_1.writeDatasetSandboxFilesStep)({
78
- env,
79
- sandboxId,
80
- files: filesToWrite,
81
- });
82
- }
83
- })();
84
- sandboxSetupPromises.set(sandboxId, setupPromise);
85
- try {
86
- await setupPromise;
87
- preparedSandboxIds.add(sandboxId);
88
- }
89
- catch (error) {
90
- sandboxSetupPromises.delete(sandboxId);
91
- throw error;
92
- }
48
+ export async function ensurePreviewScriptsAvailable(_runtime, _sandboxId) {
49
+ return;
93
50
  }
94
- async function generateFilePreview(env, sandboxId, sandboxFilePath, datasetId, options = {}) {
51
+ export async function generateFilePreview(runtime, sandboxId, sandboxFilePath, datasetId, options = {}) {
95
52
  const context = {
96
53
  totalRows: 0,
97
54
  };
98
55
  try {
99
- await ensurePreviewScriptsAvailable(env, sandboxId);
100
- const metadataResult = await runScript(env, sandboxId, "file_metadata.py", [sandboxFilePath], "Extracts file metadata: name, extension, size, row count estimate, column count, and header preview");
56
+ const metadataResult = await runScript(runtime, sandboxId, "file_metadata.py", [sandboxFilePath], "Extracts file metadata: name, extension, size, row count estimate, column count, and header preview");
57
+ validateScriptResult(metadataResult, `preview_metadata for ${datasetId}`);
101
58
  context.metadata = metadataResult;
102
- let isExcel = false;
59
+ let previewKind = null;
103
60
  if (metadataResult.stdout) {
104
61
  try {
105
62
  const metadataJson = JSON.parse(metadataResult.stdout);
106
63
  context.totalRows = metadataJson.row_count_estimate || 0;
107
64
  const extension = metadataJson.extension || "";
108
- isExcel = extension === ".xlsx" || extension === ".xls";
65
+ previewKind = getPreviewKind(extension);
109
66
  }
110
67
  catch {
111
68
  console.warn(`[Dataset ${datasetId}] Failed to parse metadata JSON`);
@@ -118,28 +75,32 @@ async function generateFilePreview(env, sandboxId, sandboxFilePath, datasetId, o
118
75
  console.log(`[Dataset ${datasetId}] No rows detected, skipping preview`);
119
76
  return context;
120
77
  }
121
- const headScript = isExcel ? "preview_head_excel.py" : "preview_head_csv.py";
122
- const tailScript = isExcel ? "preview_tail_excel.py" : "preview_tail_csv.py";
123
- const midScript = isExcel ? "preview_mid_excel.py" : "preview_mid_csv.py";
78
+ if (!previewKind) {
79
+ console.log(`[Dataset ${datasetId}] Binary or unsupported preview format, keeping metadata only`);
80
+ return context;
81
+ }
82
+ const headScript = previewKind === "excel" ? "preview_head_excel.py" : "preview_head_csv.py";
83
+ const tailScript = previewKind === "excel" ? "preview_tail_excel.py" : "preview_tail_csv.py";
84
+ const midScript = previewKind === "excel" ? "preview_mid_excel.py" : "preview_mid_csv.py";
124
85
  if (totalRows <= headLines) {
125
86
  console.log(`[Dataset ${datasetId}] File has ${totalRows} rows, reading all with head only`);
126
- const headResult = await runScript(env, sandboxId, headScript, [sandboxFilePath, String(totalRows)], `Reads the first ${totalRows} rows (entire file)`);
87
+ const headResult = await runScript(runtime, sandboxId, headScript, [sandboxFilePath, String(totalRows)], `Reads the first ${totalRows} rows (entire file)`);
127
88
  validateScriptResult(headResult, `preview_head for ${datasetId}`);
128
89
  context.head = headResult;
129
90
  return context;
130
91
  }
131
92
  if (headLines + tailLines >= totalRows) {
132
93
  console.log(`[Dataset ${datasetId}] Head + tail would cover entire file (${totalRows} rows), reading all with head only`);
133
- const headResult = await runScript(env, sandboxId, headScript, [sandboxFilePath, String(totalRows)], `Reads the first ${totalRows} rows (entire file)`);
94
+ const headResult = await runScript(runtime, sandboxId, headScript, [sandboxFilePath, String(totalRows)], `Reads the first ${totalRows} rows (entire file)`);
134
95
  validateScriptResult(headResult, `preview_head for ${datasetId}`);
135
96
  context.head = headResult;
136
97
  return context;
137
98
  }
138
99
  console.log(`[Dataset ${datasetId}] Reading head (${headLines} rows) and tail (${tailLines} rows) from ${totalRows} total rows`);
139
- const headResult = await runScript(env, sandboxId, headScript, [sandboxFilePath, String(headLines)], `Reads the first ${headLines} rows of the file`);
100
+ const headResult = await runScript(runtime, sandboxId, headScript, [sandboxFilePath, String(headLines)], `Reads the first ${headLines} rows of the file`);
140
101
  validateScriptResult(headResult, `preview_head for ${datasetId}`);
141
102
  context.head = headResult;
142
- const tailResult = await runScript(env, sandboxId, tailScript, [sandboxFilePath, String(tailLines)], `Reads the last ${tailLines} rows of the file`);
103
+ const tailResult = await runScript(runtime, sandboxId, tailScript, [sandboxFilePath, String(tailLines)], `Reads the last ${tailLines} rows of the file`);
143
104
  validateScriptResult(tailResult, `preview_tail for ${datasetId}`);
144
105
  context.tail = tailResult;
145
106
  const midLines = options.midLines || DEFAULT_MID_LINES;
@@ -148,40 +109,33 @@ async function generateFilePreview(env, sandboxId, sandboxFilePath, datasetId, o
148
109
  const midStart = headLines;
149
110
  const midEnd = totalRows - tailLines;
150
111
  console.log(`[Dataset ${datasetId}] Large gap (${gapSize} rows), adding mid sample (${midLines} rows)`);
151
- const midResult = await runScript(env, sandboxId, midScript, [sandboxFilePath, String(midStart), String(midEnd), String(midLines)], `Samples ${midLines} rows from the middle section (rows ${midStart + 1} to ${midEnd})`);
112
+ const midResult = await runScript(runtime, sandboxId, midScript, [sandboxFilePath, String(midStart), String(midEnd), String(midLines)], `Samples ${midLines} rows from the middle section (rows ${midStart + 1} to ${midEnd})`);
152
113
  validateScriptResult(midResult, `preview_mid for ${datasetId}`);
153
114
  context.mid = midResult;
154
115
  }
155
116
  }
156
117
  catch (error) {
157
118
  console.error(`[Dataset ${datasetId}] Error generating file preview:`, error);
119
+ throw error;
158
120
  }
159
121
  return context;
160
122
  }
161
- async function runScript(env, sandboxId, scriptName, args, description) {
162
- const scriptPath = `/vercel/sandbox/lib/domain/dataset/file/scripts/${scriptName}`;
163
- const command = `python ${scriptPath} ${args.join(" ")}`;
164
- let scriptContent = "";
165
- try {
166
- const localScriptPath = resolveScriptPath(scriptName);
167
- scriptContent = (0, fs_1.readFileSync)(localScriptPath, 'utf-8');
168
- }
169
- catch (error) {
170
- console.warn(`Failed to read script ${scriptName}:`, error);
171
- }
123
+ async function runScript(runtime, sandboxId, scriptName, args, description) {
124
+ const scriptContent = readFilePreviewScriptText(scriptName);
125
+ const command = `python -c <${scriptName}> ${args.join(" ")}`;
172
126
  try {
173
- const result = await (0, steps_1.runDatasetSandboxCommandStep)({
174
- env,
127
+ const result = await runDatasetSandboxCommandStep({
128
+ runtime,
175
129
  sandboxId,
176
130
  cmd: "python",
177
- args: [scriptPath, ...args],
131
+ args: ["-c", scriptContent, ...args],
178
132
  });
179
133
  return {
180
134
  description,
181
135
  script: scriptContent,
182
136
  command,
183
- stdout: result.stdout || "",
184
- stderr: result.stderr || "",
137
+ stdout: sanitizePreviewText(result.stdout),
138
+ stderr: sanitizePreviewText(result.stderr),
185
139
  };
186
140
  }
187
141
  catch (error) {
@@ -190,8 +144,7 @@ async function runScript(env, sandboxId, scriptName, args, description) {
190
144
  script: scriptContent,
191
145
  command,
192
146
  stdout: "",
193
- stderr: error instanceof Error ? error.message : String(error),
147
+ stderr: sanitizePreviewText(error instanceof Error ? error.message : String(error)),
194
148
  };
195
149
  }
196
150
  }
197
- //# sourceMappingURL=filepreview.js.map
@@ -0,0 +1,31 @@
1
+ export type FilePreviewContext = {
2
+ totalRows: number;
3
+ metadata?: {
4
+ description: string;
5
+ script: string;
6
+ command: string;
7
+ stdout: string;
8
+ stderr: string;
9
+ };
10
+ head?: {
11
+ description: string;
12
+ script: string;
13
+ command: string;
14
+ stdout: string;
15
+ stderr: string;
16
+ };
17
+ tail?: {
18
+ description: string;
19
+ script: string;
20
+ command: string;
21
+ stdout: string;
22
+ stderr: string;
23
+ };
24
+ mid?: {
25
+ description: string;
26
+ script: string;
27
+ command: string;
28
+ stdout: string;
29
+ stderr: string;
30
+ };
31
+ };
@@ -0,0 +1 @@
1
+ export {};
@@ -2,9 +2,9 @@ interface GenerateSchemaToolParams {
2
2
  datasetId: string;
3
3
  isNested?: boolean;
4
4
  fileId?: string;
5
- env: any;
5
+ runtime: any;
6
6
  }
7
- export declare function createGenerateSchemaTool({ datasetId, isNested, fileId, env }: GenerateSchemaToolParams): import("ai").Tool<{
7
+ export declare function createGenerateSchemaTool({ datasetId, isNested, fileId, runtime }: GenerateSchemaToolParams): import("ai").Tool<{
8
8
  schemaTitle: string;
9
9
  schemaDescription: string;
10
10
  schemaJson: string;
@@ -25,4 +25,3 @@ export declare function createGenerateSchemaTool({ datasetId, isNested, fileId,
25
25
  error?: undefined;
26
26
  }>;
27
27
  export {};
28
- //# sourceMappingURL=generateSchema.tool.d.ts.map
@@ -1,20 +1,17 @@
1
- "use strict";
2
- Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.createGenerateSchemaTool = createGenerateSchemaTool;
4
- const ai_1 = require("ai");
5
- const zod_1 = require("zod");
6
- const steps_1 = require("../dataset/steps");
7
- function createGenerateSchemaTool({ datasetId, isNested, fileId, env }) {
8
- return (0, ai_1.tool)({
1
+ import { tool } from "ai";
2
+ import { z } from "zod";
3
+ import { datasetUpdateSchemaStep } from "../dataset/steps.js";
4
+ export function createGenerateSchemaTool({ datasetId, isNested, fileId, runtime }) {
5
+ return tool({
9
6
  description: `Generate a formal JSON schema for a SINGLE RECORD (row) from the file. This schema describes the structure of ONE record, not the entire dataset or array of records. Requirements:
10
7
  1. Schema describes ONE RECORD structure only (no array wrappers)
11
8
  2. All property names MUST use lowercaseCamelCase convention (e.g., 'productName', 'unitPrice')
12
9
  3. Each property MUST have a description field
13
10
  4. The schema description must explain what one record represents and field mappings from original file`,
14
- inputSchema: zod_1.z.object({
15
- schemaTitle: zod_1.z.string().describe("Title for the RECORD schema in PascalCase (e.g., 'ProductRecord', 'TransactionRecord')"),
16
- schemaDescription: zod_1.z.string().describe("Comprehensive description that includes: 1) what ONE record represents, 2) its purpose, 3) complete field mapping from original file fields to schema fields with explanations (e.g., 'ARTÍCULO' -> 'articleCode': normalized to camelCase)"),
17
- schemaJson: zod_1.z.string().describe("Complete JSON schema as string describing ONE RECORD. Must be type 'object' with properties. All properties must be in lowercaseCamelCase and have descriptions. Do NOT use type 'array' at root level."),
11
+ inputSchema: z.object({
12
+ schemaTitle: z.string().describe("Title for the RECORD schema in PascalCase (e.g., 'ProductRecord', 'TransactionRecord')"),
13
+ schemaDescription: z.string().describe("Comprehensive description that includes: 1) what ONE record represents, 2) its purpose, 3) complete field mapping from original file fields to schema fields with explanations (e.g., 'ARTÍCULO' -> 'articleCode': normalized to camelCase)"),
14
+ schemaJson: z.string().describe("Complete JSON schema as string describing ONE RECORD. Must be type 'object' with properties. All properties must be in lowercaseCamelCase and have descriptions. Do NOT use type 'array' at root level."),
18
15
  }),
19
16
  execute: async ({ schemaTitle, schemaDescription, schemaJson, }) => {
20
17
  console.log(`[Dataset ${datasetId}] ========================================`);
@@ -74,8 +71,8 @@ function createGenerateSchemaTool({ datasetId, isNested, fileId, env }) {
74
71
  console.log(`[Dataset ${datasetId}] Description: ${schemaDescription}`);
75
72
  console.log(`[Dataset ${datasetId}] Schema JSON:`);
76
73
  console.log(JSON.stringify(parsedSchema, null, 2));
77
- const updateResult = await (0, steps_1.datasetUpdateSchemaStep)({
78
- env,
74
+ const updateResult = await datasetUpdateSchemaStep({
75
+ runtime,
79
76
  datasetId,
80
77
  schema: schemaData,
81
78
  status: "schema_complete",
@@ -107,4 +104,3 @@ function createGenerateSchemaTool({ datasetId, isNested, fileId, env }) {
107
104
  },
108
105
  });
109
106
  }
110
- //# sourceMappingURL=generateSchema.tool.js.map
@@ -1,2 +1 @@
1
- export * from "./file-dataset.agent";
2
- //# sourceMappingURL=index.d.ts.map
1
+ export * from "./file-dataset.agent.js";
@@ -1,18 +1 @@
1
- "use strict";
2
- var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
- if (k2 === undefined) k2 = k;
4
- var desc = Object.getOwnPropertyDescriptor(m, k);
5
- if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
- desc = { enumerable: true, get: function() { return m[k]; } };
7
- }
8
- Object.defineProperty(o, k2, desc);
9
- }) : (function(o, m, k, k2) {
10
- if (k2 === undefined) k2 = k;
11
- o[k2] = m[k];
12
- }));
13
- var __exportStar = (this && this.__exportStar) || function(m, exports) {
14
- for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
15
- };
16
- Object.defineProperty(exports, "__esModule", { value: true });
17
- __exportStar(require("./file-dataset.agent"), exports);
18
- //# sourceMappingURL=index.js.map
1
+ export * from "./file-dataset.agent.js";
@@ -1,3 +1,2 @@
1
- import { FileParseStoryContext } from "./file-dataset.agent";
2
- export declare function buildFileDatasetPrompt(context: FileParseStoryContext): string;
3
- //# sourceMappingURL=prompts.d.ts.map
1
+ import type { FileParseContext } from "./file-dataset.types.js";
2
+ export declare function buildFileDatasetPrompt(context: FileParseContext): string;
@@ -1,26 +1,23 @@
1
- "use strict";
2
1
  // Plain build API using template literals and XML
3
- Object.defineProperty(exports, "__esModule", { value: true });
4
- exports.buildFileDatasetPrompt = buildFileDatasetPrompt;
5
- const xmlbuilder2_1 = require("xmlbuilder2");
6
- const datasetFiles_1 = require("../datasetFiles");
2
+ import { create } from "xmlbuilder2";
3
+ import { getDatasetWorkstation, getDatasetOutputPath } from "../datasetFiles.js";
7
4
  function buildRole() {
8
- let xml = (0, xmlbuilder2_1.create)()
5
+ let xml = create()
9
6
  .ele("Role")
10
7
  .txt("You are a dataset creator for a SINGLE file. Your goal is to convert the file content into a validated JSONL dataset where each line represents one record.")
11
8
  .up();
12
9
  return xml.end({ prettyPrint: true, headless: true });
13
10
  }
14
11
  function buildGoal() {
15
- let xml = (0, xmlbuilder2_1.create)()
12
+ let xml = create()
16
13
  .ele("Goal")
17
- .txt("Convert the source file into a validated JSONL dataset (output.jsonl) where each line is a JSON object conforming to a generated schema. The schema describes ONE data record structure. Extract ONLY data records; exclude any header sections, metadata, or summary information from the file.")
14
+ .txt("Convert the input file into a validated JSONL dataset (output.jsonl) where each line is a JSON object conforming to a generated schema. The schema describes ONE data record structure. Extract ONLY data records; exclude any header sections, metadata, or summary information from the file.")
18
15
  .up();
19
16
  return xml.end({ prettyPrint: true, headless: true });
20
17
  }
21
- function buildSourceInfo(context) {
22
- let xml = (0, xmlbuilder2_1.create)()
23
- .ele("Source")
18
+ function buildResourceInfo(context) {
19
+ let xml = create()
20
+ .ele("FileResource")
24
21
  .ele("Type").txt("file").up()
25
22
  .ele("FileId").txt(context.fileId).up()
26
23
  .ele("DatasetId").txt(context.datasetId).up()
@@ -29,7 +26,7 @@ function buildSourceInfo(context) {
29
26
  return xml;
30
27
  }
31
28
  function buildFilePreviewSection(preview) {
32
- let xml = (0, xmlbuilder2_1.create)()
29
+ let xml = create()
33
30
  .ele("FilePreview")
34
31
  .ele("TotalRows").txt(String(preview.totalRows)).up();
35
32
  if (preview.metadata) {
@@ -91,8 +88,9 @@ function buildErrorsSection(errors) {
91
88
  if (errors.length === 0) {
92
89
  return null;
93
90
  }
94
- let xml = (0, xmlbuilder2_1.create)()
95
- .ele("PreviousErrors");
91
+ let xml = create()
92
+ .ele("PreviousErrors")
93
+ .ele("Instruction").txt("Treat these as repair feedback from the previous validation attempt. Rewrite output.jsonl from the schema contract; do not patch input column names into schema keys piecemeal.").up();
96
94
  for (const error of errors) {
97
95
  xml = xml.ele("Error").txt(error).up();
98
96
  }
@@ -100,10 +98,10 @@ function buildErrorsSection(errors) {
100
98
  return xml;
101
99
  }
102
100
  function buildContextSection(context) {
103
- let xml = (0, xmlbuilder2_1.create)()
101
+ let xml = create()
104
102
  .ele("Context");
105
- const sourceXml = buildSourceInfo(context);
106
- xml = xml.import(sourceXml.first());
103
+ const resourceXml = buildResourceInfo(context);
104
+ xml = xml.import(resourceXml.first());
107
105
  if (context.filePreview) {
108
106
  const previewXml = buildFilePreviewSection(context.filePreview);
109
107
  xml = xml.import(previewXml.first());
@@ -117,27 +115,123 @@ function buildContextSection(context) {
117
115
  xml = xml.up();
118
116
  return xml.end({ prettyPrint: true, headless: true });
119
117
  }
118
+ function asRecord(value) {
119
+ return value && typeof value === "object" && !Array.isArray(value)
120
+ ? value
121
+ : null;
122
+ }
123
+ function getSchemaObject(context) {
124
+ return asRecord(context.schema?.schema);
125
+ }
126
+ function joinSchemaPath(basePath, key) {
127
+ return basePath === "$" ? `$.${key}` : `${basePath}.${key}`;
128
+ }
129
+ function collectSchemaContract(schema, path = "$", contract = {
130
+ requiredPaths: [],
131
+ propertyPaths: [],
132
+ enumConstraints: [],
133
+ closedObjectPaths: [],
134
+ }) {
135
+ const record = asRecord(schema);
136
+ if (!record) {
137
+ return contract;
138
+ }
139
+ if (Array.isArray(record.enum)) {
140
+ contract.enumConstraints.push({
141
+ path,
142
+ values: record.enum.map((value) => JSON.stringify(value)),
143
+ });
144
+ }
145
+ const properties = asRecord(record.properties);
146
+ if (properties) {
147
+ if (record.additionalProperties === false) {
148
+ contract.closedObjectPaths.push(path);
149
+ }
150
+ const required = Array.isArray(record.required)
151
+ ? record.required.filter((value) => typeof value === "string")
152
+ : [];
153
+ for (const key of required) {
154
+ contract.requiredPaths.push(joinSchemaPath(path, key));
155
+ }
156
+ for (const [key, childSchema] of Object.entries(properties)) {
157
+ const childPath = joinSchemaPath(path, key);
158
+ contract.propertyPaths.push(childPath);
159
+ collectSchemaContract(childSchema, childPath, contract);
160
+ }
161
+ }
162
+ if (record.items) {
163
+ collectSchemaContract(record.items, `${path}[]`, contract);
164
+ }
165
+ for (const keyword of ["oneOf", "anyOf", "allOf"]) {
166
+ if (Array.isArray(record[keyword])) {
167
+ for (const childSchema of record[keyword]) {
168
+ collectSchemaContract(childSchema, path, contract);
169
+ }
170
+ }
171
+ }
172
+ return contract;
173
+ }
174
+ function appendLimitedList(xml, elementName, itemName, values, maxItems) {
175
+ let node = xml.ele(elementName);
176
+ for (const value of values.slice(0, maxItems)) {
177
+ node = node.ele(itemName).txt(value).up();
178
+ }
179
+ if (values.length > maxItems) {
180
+ node = node.ele("Truncated").txt(String(values.length - maxItems)).up();
181
+ }
182
+ return node.up();
183
+ }
120
184
  function buildSchemaSection(context) {
121
- if (!context.schema) {
185
+ const schema = getSchemaObject(context);
186
+ if (!context.schema || !schema) {
122
187
  return "";
123
188
  }
124
- let xml = (0, xmlbuilder2_1.create)()
189
+ const contract = collectSchemaContract(schema);
190
+ let xml = create()
125
191
  .com("Schema section: This defines the structure of ONE RECORD (row). Each line in the JSONL output must conform to this schema.")
126
192
  .ele("Schema")
127
193
  .ele("Title").txt(context.schema.title || "").up()
128
- .ele("Description").txt(context.schema.description || "").up()
129
- .ele("JsonSchema").txt(JSON.stringify(context.schema.schema, null, 2)).up()
194
+ .ele("Description").txt(context.schema.description || "").up();
195
+ xml = xml
196
+ .ele("SchemaContract")
197
+ .ele("Purpose").txt("Compact output contract derived from JSON Schema. Use this before writing output.jsonl.").up()
198
+ .ele("Rule").txt("Use only schema property keys in data objects. Input headers are input labels, not output keys.").up()
199
+ .ele("Rule").txt("Required paths are required everywhere, including nested objects and array items.").up()
200
+ .ele("Rule").txt("Enum fields must use exactly one of the listed literal values. Normalize input labels to the closest valid enum literal; never emit a value outside the enum.").up();
201
+ xml = appendLimitedList(xml, "RequiredPaths", "Path", contract.requiredPaths, 120);
202
+ xml = appendLimitedList(xml, "PropertyPaths", "Path", contract.propertyPaths, 160);
203
+ let enumsXml = xml.ele("EnumConstraints");
204
+ for (const constraint of contract.enumConstraints.slice(0, 80)) {
205
+ let enumXml = enumsXml.ele("Enum", { path: constraint.path });
206
+ for (const value of constraint.values.slice(0, 80)) {
207
+ enumXml = enumXml.ele("Value").txt(value).up();
208
+ }
209
+ if (constraint.values.length > 80) {
210
+ enumXml = enumXml.ele("Truncated").txt(String(constraint.values.length - 80)).up();
211
+ }
212
+ enumsXml = enumXml.up();
213
+ }
214
+ if (contract.enumConstraints.length > 80) {
215
+ enumsXml = enumsXml.ele("Truncated").txt(String(contract.enumConstraints.length - 80)).up();
216
+ }
217
+ xml = enumsXml.up();
218
+ xml = appendLimitedList(xml, "ClosedObjectPaths", "Path", contract.closedObjectPaths, 80);
219
+ xml = xml
220
+ .up()
221
+ .ele("JsonSchema").txt(JSON.stringify(schema, null, 2)).up()
130
222
  .up();
131
223
  return xml.end({ prettyPrint: true, headless: true });
132
224
  }
133
225
  function buildInstructions(context) {
134
- const datasetWorkstation = (0, datasetFiles_1.getDatasetWorkstation)(context.datasetId);
135
- const outputPath = (0, datasetFiles_1.getDatasetOutputPath)(context.datasetId);
226
+ const datasetWorkstation = context.sandboxConfig.scriptsDir
227
+ ? context.sandboxConfig.scriptsDir.replace(/\/scripts$/, "")
228
+ : getDatasetWorkstation(context.datasetId);
229
+ const outputPath = context.sandboxConfig.outputPath ?? getDatasetOutputPath(context.datasetId);
136
230
  const hasProvidedSchema = Boolean(context.schema?.schema);
137
231
  const currentTask = hasProvidedSchema
138
232
  ? "Review FilePreview section, use the provided schema as the output contract, then parse the file and generate the dataset"
139
233
  : "Review FilePreview section to understand file structure, then generate JSON Schema for a SINGLE RECORD, then parse the file and generate the dataset";
140
- let xml = (0, xmlbuilder2_1.create)()
234
+ let xml = create()
141
235
  .ele("Instructions")
142
236
  .ele("Workflow")
143
237
  .ele("Step", { number: "1", name: "Inspect File" })
@@ -150,6 +244,11 @@ function buildInstructions(context) {
150
244
  .ele("Action").txt("Use the provided schema as the output contract for every row in output.jsonl").up()
151
245
  .ele("Requirements")
152
246
  .ele("Requirement").txt("Every output row must conform exactly to the provided schema").up()
247
+ .ele("Requirement").txt("Every data object MUST use the exact property names from the provided JSON Schema required/properties keys").up()
248
+ .ele("Requirement").txt("Build a schema-first mapping from input columns to schema fields before writing output.jsonl. Do not use raw input headers as JSON keys unless they are exactly schema keys").up()
249
+ .ele("Requirement").txt("For nested required fields, populate the required child keys inside each nested object or array item; top-level validity is not enough").up()
250
+ .ele("Requirement").txt("For enum fields, emit exactly one allowed enum literal from SchemaContract; normalize labels or abbreviations into allowed literals").up()
251
+ .ele("Requirement").txt("Do not translate, localize, rename, camelize differently, or infer alternative field names. Field names are a technical contract; only field values may preserve the input language").up()
153
252
  .ele("Requirement").txt("Do not call generateSchema when a schema is already provided").up()
154
253
  .up()
155
254
  .up();
@@ -173,6 +272,8 @@ function buildInstructions(context) {
173
272
  .ele("Requirements")
174
273
  .ele("Requirement").txt("Parse ALL data rows/records from the file (exclude header sections and metadata)").up()
175
274
  .ele("Requirement").txt("Output JSONL format: each line is {\"type\": \"row\", \"data\": {...record...}}").up()
275
+ .ele("Requirement").txt("When a schema is provided, each data object must contain the exact required schema keys and must not use translated or synonymous keys").up()
276
+ .ele("Requirement").txt("When validation returns zero valid rows, treat the previous output as structurally wrong and rewrite output.jsonl from the SchemaContract, not by applying small patches").up()
176
277
  .ele("Requirement").txt("Extract ONLY data records; skip any header lines, summary sections, or file metadata").up()
177
278
  .ele("Requirement").txt(`Save output to: ${outputPath}`).up()
178
279
  .ele("Requirement").txt("Use descriptive scriptName in snake_case (e.g., 'parse_csv_to_jsonl')").up()
@@ -180,11 +281,13 @@ function buildInstructions(context) {
180
281
  .up()
181
282
  .ele("Step", { number: "4", name: "Complete and Validate" })
182
283
  .ele("Action").txt("Call completeDataset to validate the dataset").up()
183
- .ele("Behavior").txt("Validates that output.jsonl exists and all records conform to the schema stored in database. Returns error details if validation fails.").up()
284
+ .ele("Behavior").txt("Validates that output.jsonl exists and all records conform to the schema stored in database. Returns success:false with validation details if validation fails. If validation fails, inspect validation errors, rewrite output.jsonl, and call completeDataset again. Do not stop until completeDataset returns success:true.").up()
184
285
  .up()
185
286
  .up()
186
287
  .ele("Rules")
187
288
  .ele("Rule").txt("Schema defines ONE DATA RECORD structure (not array, not header)").up()
289
+ .ele("Rule").txt("Schema property names are authoritative. Never translate or rename keys such as itemName, quantity, or unit into the input language").up()
290
+ .ele("Rule").txt("Original/input language applies to extracted values only, not to JSON object keys").up()
188
291
  .ele("Rule").txt("Datasets contain ONLY data records; exclude all header sections and file metadata").up()
189
292
  .ele("Rule").txt("JSONL format: each line = separate JSON object representing one data record").up()
190
293
  .ele("Rule").txt("FilePreview shows raw file content - use Script to understand data extraction").up()
@@ -197,7 +300,7 @@ function buildInstructions(context) {
197
300
  .up();
198
301
  return xml.end({ prettyPrint: true, headless: true });
199
302
  }
200
- function buildFileDatasetPrompt(context) {
303
+ export function buildFileDatasetPrompt(context) {
201
304
  const sections = [];
202
305
  sections.push(buildRole());
203
306
  sections.push("");
@@ -205,7 +308,11 @@ function buildFileDatasetPrompt(context) {
205
308
  sections.push("");
206
309
  sections.push(buildContextSection(context));
207
310
  sections.push("");
311
+ const schemaSection = buildSchemaSection(context);
312
+ if (schemaSection) {
313
+ sections.push(schemaSection);
314
+ sections.push("");
315
+ }
208
316
  sections.push(buildInstructions(context));
209
317
  return sections.join("\n");
210
318
  }
211
- //# sourceMappingURL=prompts.js.map
@@ -0,0 +1 @@
1
+ export declare const PYTHON_SCRIPT_BASE64_BY_NAME: Readonly<Record<string, string>>;