@ekairos/dataset 1.22.40-beta.development.0 → 1.22.40

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175)
  1. package/dist/agents.d.ts +8 -0
  2. package/dist/agents.js +8 -0
  3. package/dist/builder/agentMaterializers.d.ts +9 -0
  4. package/dist/builder/agentMaterializers.js +10 -0
  5. package/dist/builder/context.d.ts +15 -0
  6. package/dist/builder/context.js +251 -0
  7. package/dist/builder/instructions.d.ts +4 -5
  8. package/dist/builder/instructions.js +15 -21
  9. package/dist/builder/materialize.d.ts +77 -10
  10. package/dist/builder/materialize.js +495 -152
  11. package/dist/builder/materializeQuery.d.ts +12 -0
  12. package/dist/builder/materializeQuery.js +31 -0
  13. package/dist/builder/persistence.d.ts +10 -6
  14. package/dist/builder/persistence.js +107 -62
  15. package/dist/builder/{sourceRows.d.ts → rows.d.ts} +0 -1
  16. package/dist/builder/{sourceRows.js → rows.js} +3 -9
  17. package/dist/builder/schemaInference.d.ts +1 -2
  18. package/dist/builder/schemaInference.js +4 -12
  19. package/dist/builder/types.d.ts +41 -26
  20. package/dist/builder/types.js +1 -3
  21. package/dist/clearDataset.tool.d.ts +2 -3
  22. package/dist/clearDataset.tool.js +13 -17
  23. package/dist/completeDataset.steps.d.ts +117 -0
  24. package/dist/completeDataset.steps.js +537 -0
  25. package/dist/completeDataset.tool.d.ts +132 -7
  26. package/dist/completeDataset.tool.js +46 -192
  27. package/dist/contextResources.d.ts +31 -0
  28. package/dist/contextResources.js +151 -0
  29. package/dist/contextWorkspace.d.ts +79 -0
  30. package/dist/contextWorkspace.js +234 -0
  31. package/dist/dataset/steps.d.ts +39 -15
  32. package/dist/dataset/steps.js +96 -39
  33. package/dist/dataset.d.ts +2 -3
  34. package/dist/dataset.js +73 -51
  35. package/dist/datasetFiles.d.ts +5 -1
  36. package/dist/datasetFiles.js +29 -27
  37. package/dist/defineNotation.tool.d.ts +49 -0
  38. package/dist/defineNotation.tool.js +154 -0
  39. package/dist/domain.d.ts +1 -2
  40. package/dist/domain.js +1 -6
  41. package/dist/executeCommand.tool.d.ts +2 -30
  42. package/dist/executeCommand.tool.js +165 -39
  43. package/dist/file/file-dataset.agent.d.ts +19 -56
  44. package/dist/file/file-dataset.agent.js +182 -136
  45. package/dist/file/file-dataset.steps.d.ts +27 -0
  46. package/dist/file/file-dataset.steps.js +47 -0
  47. package/dist/file/file-dataset.types.d.ts +64 -0
  48. package/dist/file/file-dataset.types.js +1 -0
  49. package/dist/file/filepreview.d.ts +5 -35
  50. package/dist/file/filepreview.js +60 -107
  51. package/dist/file/filepreview.types.d.ts +31 -0
  52. package/dist/file/filepreview.types.js +1 -0
  53. package/dist/file/generateSchema.tool.d.ts +2 -3
  54. package/dist/file/generateSchema.tool.js +11 -15
  55. package/dist/file/index.d.ts +1 -2
  56. package/dist/file/index.js +1 -18
  57. package/dist/file/prompts.d.ts +2 -3
  58. package/dist/file/prompts.js +152 -32
  59. package/dist/file/scripts.generated.d.ts +1 -0
  60. package/dist/file/scripts.generated.js +11 -0
  61. package/dist/file/steps.d.ts +1 -2
  62. package/dist/file/steps.js +9 -7
  63. package/dist/id.d.ts +1 -0
  64. package/dist/id.js +10 -0
  65. package/dist/index.d.ts +9 -7
  66. package/dist/index.js +9 -23
  67. package/dist/materializeDataset.tool.d.ts +35 -28
  68. package/dist/materializeDataset.tool.js +74 -68
  69. package/dist/notation.d.ts +205 -0
  70. package/dist/notation.js +424 -0
  71. package/dist/query/index.d.ts +1 -2
  72. package/dist/query/index.js +1 -18
  73. package/dist/query/queryDomain.d.ts +3 -4
  74. package/dist/query/queryDomain.js +3 -40
  75. package/dist/query/queryDomain.step.d.ts +1 -1
  76. package/dist/query/queryDomain.step.js +24 -13
  77. package/dist/sandbox/steps.d.ts +23 -15
  78. package/dist/sandbox/steps.js +73 -76
  79. package/dist/sandbox.steps.d.ts +1 -2
  80. package/dist/sandbox.steps.js +1 -18
  81. package/dist/schema.d.ts +14 -3
  82. package/dist/schema.js +27 -26
  83. package/dist/service.d.ts +12 -5
  84. package/dist/service.js +88 -15
  85. package/dist/skill.d.ts +0 -1
  86. package/dist/skill.js +12 -17
  87. package/dist/transform/filepreview.d.ts +2 -3
  88. package/dist/transform/filepreview.js +9 -26
  89. package/dist/transform/index.d.ts +2 -3
  90. package/dist/transform/index.js +2 -8
  91. package/dist/transform/prompts.d.ts +1 -34
  92. package/dist/transform/prompts.js +66 -46
  93. package/dist/transform/transform-dataset.agent.d.ts +21 -46
  94. package/dist/transform/transform-dataset.agent.js +152 -93
  95. package/dist/transform/transform-dataset.steps.d.ts +30 -0
  96. package/dist/transform/transform-dataset.steps.js +61 -0
  97. package/dist/transform/transform-dataset.types.d.ts +96 -0
  98. package/dist/transform/transform-dataset.types.js +1 -0
  99. package/dist/transform/transformDataset.d.ts +3 -3
  100. package/dist/transform/transformDataset.js +15 -18
  101. package/dist/writeDatasetRows.tool.d.ts +188 -0
  102. package/dist/writeDatasetRows.tool.js +258 -0
  103. package/package.json +33 -8
  104. package/dist/builder/instructions.d.ts.map +0 -1
  105. package/dist/builder/instructions.js.map +0 -1
  106. package/dist/builder/materialize.d.ts.map +0 -1
  107. package/dist/builder/materialize.js.map +0 -1
  108. package/dist/builder/persistence.d.ts.map +0 -1
  109. package/dist/builder/persistence.js.map +0 -1
  110. package/dist/builder/schemaInference.d.ts.map +0 -1
  111. package/dist/builder/schemaInference.js.map +0 -1
  112. package/dist/builder/sourceRows.d.ts.map +0 -1
  113. package/dist/builder/sourceRows.js.map +0 -1
  114. package/dist/builder/types.d.ts.map +0 -1
  115. package/dist/builder/types.js.map +0 -1
  116. package/dist/clearDataset.tool.d.ts.map +0 -1
  117. package/dist/clearDataset.tool.js.map +0 -1
  118. package/dist/completeDataset.tool.d.ts.map +0 -1
  119. package/dist/completeDataset.tool.js.map +0 -1
  120. package/dist/dataset/steps.d.ts.map +0 -1
  121. package/dist/dataset/steps.js.map +0 -1
  122. package/dist/dataset.d.ts.map +0 -1
  123. package/dist/dataset.js.map +0 -1
  124. package/dist/datasetFiles.d.ts.map +0 -1
  125. package/dist/datasetFiles.js.map +0 -1
  126. package/dist/domain.d.ts.map +0 -1
  127. package/dist/domain.js.map +0 -1
  128. package/dist/eventsReactRuntime.d.ts +0 -22
  129. package/dist/eventsReactRuntime.d.ts.map +0 -1
  130. package/dist/eventsReactRuntime.js +0 -29
  131. package/dist/eventsReactRuntime.js.map +0 -1
  132. package/dist/executeCommand.tool.d.ts.map +0 -1
  133. package/dist/executeCommand.tool.js.map +0 -1
  134. package/dist/file/file-dataset.agent.d.ts.map +0 -1
  135. package/dist/file/file-dataset.agent.js.map +0 -1
  136. package/dist/file/filepreview.d.ts.map +0 -1
  137. package/dist/file/filepreview.js.map +0 -1
  138. package/dist/file/generateSchema.tool.d.ts.map +0 -1
  139. package/dist/file/generateSchema.tool.js.map +0 -1
  140. package/dist/file/index.d.ts.map +0 -1
  141. package/dist/file/index.js.map +0 -1
  142. package/dist/file/prompts.d.ts.map +0 -1
  143. package/dist/file/prompts.js.map +0 -1
  144. package/dist/file/steps.d.ts.map +0 -1
  145. package/dist/file/steps.js.map +0 -1
  146. package/dist/index.d.ts.map +0 -1
  147. package/dist/index.js.map +0 -1
  148. package/dist/materializeDataset.tool.d.ts.map +0 -1
  149. package/dist/materializeDataset.tool.js.map +0 -1
  150. package/dist/query/index.d.ts.map +0 -1
  151. package/dist/query/index.js.map +0 -1
  152. package/dist/query/queryDomain.d.ts.map +0 -1
  153. package/dist/query/queryDomain.js.map +0 -1
  154. package/dist/query/queryDomain.step.d.ts.map +0 -1
  155. package/dist/query/queryDomain.step.js.map +0 -1
  156. package/dist/sandbox/steps.d.ts.map +0 -1
  157. package/dist/sandbox/steps.js.map +0 -1
  158. package/dist/sandbox.steps.d.ts.map +0 -1
  159. package/dist/sandbox.steps.js.map +0 -1
  160. package/dist/schema.d.ts.map +0 -1
  161. package/dist/schema.js.map +0 -1
  162. package/dist/service.d.ts.map +0 -1
  163. package/dist/service.js.map +0 -1
  164. package/dist/skill.d.ts.map +0 -1
  165. package/dist/skill.js.map +0 -1
  166. package/dist/transform/filepreview.d.ts.map +0 -1
  167. package/dist/transform/filepreview.js.map +0 -1
  168. package/dist/transform/index.d.ts.map +0 -1
  169. package/dist/transform/index.js.map +0 -1
  170. package/dist/transform/prompts.d.ts.map +0 -1
  171. package/dist/transform/prompts.js.map +0 -1
  172. package/dist/transform/transform-dataset.agent.d.ts.map +0 -1
  173. package/dist/transform/transform-dataset.agent.js.map +0 -1
  174. package/dist/transform/transformDataset.d.ts.map +0 -1
  175. package/dist/transform/transformDataset.js.map +0 -1
@@ -1,146 +1,485 @@
1
- "use strict";
2
- Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.materializeQuerySource = materializeQuerySource;
4
- exports.materializeSingleFileLikeSource = materializeSingleFileLikeSource;
5
- exports.materializeDerivedDataset = materializeDerivedDataset;
6
- const file_dataset_agent_1 = require("../file/file-dataset.agent");
7
- const service_1 = require("../service");
8
- const transform_dataset_agent_1 = require("../transform/transform-dataset.agent");
9
- const instructions_1 = require("./instructions");
10
- const persistence_1 = require("./persistence");
11
- const schemaInference_1 = require("./schemaInference");
12
- const sourceRows_1 = require("./sourceRows");
13
- function makeIntermediateDatasetId(targetDatasetId, sourceKind, index) {
14
- return `${targetDatasetId}__${sourceKind}_${index}`;
15
- }
16
- async function materializeQuerySource(runtime, source, params) {
17
- const scoped = await runtime.use(source.domain);
18
- const result = await scoped.db.query(source.query);
19
- const rows = (0, sourceRows_1.normalizeQueryRows)(result);
20
- const domainDescriptor = (0, sourceRows_1.getDomainDescriptor)(source.domain);
21
- return await (0, persistence_1.materializeRowsToDataset)(runtime, {
22
- datasetId: params.datasetId,
23
- sandboxId: params.sandboxId,
24
- title: params.title ?? source.title,
25
- instructions: params.instructions,
26
- sources: [
1
+ import { createFileParseContext } from "../file/file-dataset.agent.js";
2
+ import { readInstantFileStep } from "../file/steps.js";
3
+ import { createTransformDatasetContext } from "../transform/transform-dataset.agent.js";
4
+ import { datasetGetByIdStep, datasetInferAndUpdateSchemaStep, datasetPreviewRowsStep, datasetReadOneStep, } from "../dataset/steps.js";
5
+ import { getDatasetOutputPath, getDatasetScriptsDir, getDatasetResourcesDir, getDatasetStandardDirs, } from "../datasetFiles.js";
6
+ import { registerDatasetAgentMaterializers } from "./agentMaterializers.js";
7
+ import { buildFileDefaultInstructions, buildRawResourceInstructions, buildTransformInstructions, } from "./instructions.js";
8
+ import { createOrUpdateDatasetMetadata, materializeRowsToDataset, uploadInlineTextResource, } from "./persistence.js";
9
+ import { materializeQueryResource } from "./materializeQuery.js";
10
+ import { readDatasetSandboxTextFileStep, runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep, writeDatasetSandboxTextFilesStep, } from "../sandbox/steps.js";
11
+ function makeIntermediateDatasetId(targetDatasetId, resourceKind, index) {
12
+ return `${targetDatasetId}__${resourceKind}_${index}`;
13
+ }
14
+ function normalizeParsedTextRows(value) {
15
+ if (Array.isArray(value)) {
16
+ return value.map((item) => (item && typeof item === "object" ? item : { value: item }));
17
+ }
18
+ if (value && typeof value === "object")
19
+ return [value];
20
+ return [{ value }];
21
+ }
22
+ function materializeRawTextRows(resource) {
23
+ const text = String(resource.text ?? "");
24
+ const mimeType = String(resource.mimeType ?? "").toLowerCase();
25
+ const name = String(resource.name ?? "").toLowerCase();
26
+ const shouldParseJson = mimeType.includes("json") || name.endsWith(".json") || name.endsWith(".jsonl");
27
+ if (shouldParseJson) {
28
+ try {
29
+ if (name.endsWith(".jsonl")) {
30
+ const rows = text
31
+ .split(/\r?\n/g)
32
+ .map((line) => line.trim())
33
+ .filter(Boolean)
34
+ .map((line) => JSON.parse(line));
35
+ return rows.flatMap((row) => normalizeParsedTextRows(row));
36
+ }
37
+ return normalizeParsedTextRows(JSON.parse(text));
38
+ }
39
+ catch {
40
+ return [{ text }];
41
+ }
42
+ }
43
+ return [{ text }];
44
+ }
45
+ function parseContentDispositionFileName(value) {
46
+ const text = String(value ?? "");
47
+ const utf8Match = /filename\*=UTF-8''([^;]+)/i.exec(text);
48
+ if (utf8Match?.[1]) {
49
+ try {
50
+ return decodeURIComponent(utf8Match[1]).trim();
51
+ }
52
+ catch {
53
+ return utf8Match[1].trim();
54
+ }
55
+ }
56
+ const quotedMatch = /filename="([^"]+)"/i.exec(text);
57
+ if (quotedMatch?.[1])
58
+ return quotedMatch[1].trim();
59
+ const plainMatch = /filename=([^;]+)/i.exec(text);
60
+ if (plainMatch?.[1])
61
+ return plainMatch[1].trim();
62
+ return "";
63
+ }
64
+ function isPdfContentDisposition(value) {
65
+ const text = String(value ?? "").toLowerCase();
66
+ return text.includes("application/pdf") || text.includes(".pdf");
67
+ }
68
+ function sanitizeResourceFileName(value, fallback) {
69
+ const name = String(value ?? "").trim() || fallback;
70
+ const cleaned = name.replace(/[\\/:"*?<>|]+/g, "_").replace(/\s+/g, "_").slice(0, 120);
71
+ return cleaned || fallback;
72
+ }
73
+ function sanitizePdfFileName(value, fallback) {
74
+ const cleaned = sanitizeResourceFileName(value, fallback);
75
+ return cleaned.toLowerCase().endsWith(".pdf") ? cleaned : `${cleaned}.pdf`;
76
+ }
77
+ function pdfTextRowsSchema() {
78
+ return {
79
+ title: "PdfTextPage",
80
+ description: "Extracted PDF page text",
81
+ schema: {
82
+ type: "object",
83
+ additionalProperties: false,
84
+ required: ["fileId", "fileName", "pageNumber", "text"],
85
+ properties: {
86
+ fileId: { type: "string" },
87
+ fileName: { type: "string" },
88
+ pageNumber: { type: "number" },
89
+ text: { type: "string" },
90
+ },
91
+ },
92
+ };
93
+ }
94
+ function parseJsonlDataRows(content) {
95
+ return String(content ?? "")
96
+ .split(/\r?\n/g)
97
+ .map((line) => line.trim())
98
+ .filter(Boolean)
99
+ .map((line) => JSON.parse(line))
100
+ .map((record) => record?.data)
101
+ .filter((row) => row && typeof row === "object" && !Array.isArray(row));
102
+ }
103
+ async function tryMaterializeRawPdfFileResource(state, resource, targetDatasetId) {
104
+ const file = await readInstantFileStep({ runtime: state.runtime, fileId: resource.fileId });
105
+ if (!isPdfContentDisposition(file.contentDisposition))
106
+ return null;
107
+ const sandboxId = resolveDatasetSandboxId(state, targetDatasetId);
108
+ const outputPath = getDatasetOutputPath(targetDatasetId);
109
+ const fileName = sanitizePdfFileName(parseContentDispositionFileName(file.contentDisposition), `${resource.fileId}.pdf`);
110
+ const resourcePath = `${getDatasetResourcesDir(targetDatasetId)}/${fileName}`;
111
+ const scriptPath = `${getDatasetScriptsDir(targetDatasetId)}/extract_pdf_text.py`;
112
+ await runDatasetSandboxCommandStep({
113
+ runtime: state.runtime,
114
+ sandboxId,
115
+ cmd: "mkdir",
116
+ args: ["-p", ...getDatasetStandardDirs(targetDatasetId)],
117
+ });
118
+ await writeDatasetSandboxFilesStep({
119
+ runtime: state.runtime,
120
+ sandboxId,
121
+ files: [{ path: resourcePath, contentBase64: file.contentBase64 }],
122
+ });
123
+ const install = await runDatasetSandboxCommandStep({
124
+ runtime: state.runtime,
125
+ sandboxId,
126
+ cmd: "python",
127
+ args: ["-m", "pip", "install", "pypdf", "--quiet"],
128
+ });
129
+ if (install.exitCode !== 0) {
130
+ throw new Error(`dataset_pdf_dependency_install_failed:${install.stderr || install.stdout}`);
131
+ }
132
+ await writeDatasetSandboxTextFilesStep({
133
+ runtime: state.runtime,
134
+ sandboxId,
135
+ files: [
27
136
  {
28
- kind: "query",
29
- query: source.query,
30
- title: source.title,
31
- explanation: source.explanation,
32
- ...domainDescriptor,
137
+ path: scriptPath,
138
+ content: [
139
+ "from pathlib import Path",
140
+ "import json",
141
+ "import sys",
142
+ "from pypdf import PdfReader",
143
+ "",
144
+ "resource_path = Path(sys.argv[1])",
145
+ "output_path = Path(sys.argv[2])",
146
+ "file_id = sys.argv[3]",
147
+ "file_name = sys.argv[4]",
148
+ "reader = PdfReader(str(resource_path))",
149
+ "rows = 0",
150
+ "with output_path.open('w', encoding='utf-8') as out:",
151
+ " for index, page in enumerate(reader.pages, start=1):",
152
+ " text = page.extract_text() or ''",
153
+ " text = text.replace('\\x00', '').strip()",
154
+ " if not text:",
155
+ " continue",
156
+ " data = {",
157
+ " 'fileId': file_id,",
158
+ " 'fileName': file_name,",
159
+ " 'pageNumber': index,",
160
+ " 'text': text,",
161
+ " }",
162
+ " out.write(json.dumps({'type': 'row', 'data': data}, ensure_ascii=False) + '\\n')",
163
+ " rows += 1",
164
+ " if rows == 0:",
165
+ " data = {'fileId': file_id, 'fileName': file_name, 'pageNumber': 0, 'text': ''}",
166
+ " out.write(json.dumps({'type': 'row', 'data': data}, ensure_ascii=False) + '\\n')",
167
+ " rows = 1",
168
+ "print(f'extracted_pdf_pages={len(reader.pages)} rows={rows} output={output_path}')",
169
+ "",
170
+ ].join("\n"),
33
171
  },
34
172
  ],
35
- sourceKinds: ["query"],
36
- analysis: {
37
- query: source.query,
38
- explanation: source.explanation,
39
- ...domainDescriptor,
40
- },
173
+ });
174
+ const extraction = await runDatasetSandboxCommandStep({
175
+ runtime: state.runtime,
176
+ sandboxId,
177
+ cmd: "python",
178
+ args: [scriptPath, resourcePath, outputPath, resource.fileId, fileName],
179
+ });
180
+ if (extraction.exitCode !== 0) {
181
+ throw new Error(`dataset_pdf_text_extraction_failed:${extraction.stderr || extraction.stdout}`);
182
+ }
183
+ const output = await readDatasetSandboxTextFileStep({
184
+ runtime: state.runtime,
185
+ sandboxId,
186
+ path: outputPath,
187
+ });
188
+ const rows = parseJsonlDataRows(output.content);
189
+ if (rows.length === 0) {
190
+ throw new Error("dataset_pdf_text_extraction_empty");
191
+ }
192
+ await materializeRowsToDataset(state.runtime, {
193
+ datasetId: targetDatasetId,
194
+ sandboxId,
195
+ title: state.title ?? fileName,
196
+ instructions: state.instructions,
197
+ contextId: state.contextId ?? "",
198
+ rows,
199
+ schema: pdfTextRowsSchema(),
200
+ first: state.first,
201
+ });
202
+ return targetDatasetId;
203
+ }
204
+ async function materializeRawTextResource(state, resource, targetDatasetId) {
205
+ const rows = materializeRawTextRows(resource);
206
+ await materializeRowsToDataset(state.runtime, {
207
+ datasetId: targetDatasetId,
208
+ sandboxId: state.sandboxId,
209
+ title: state.title ?? resource.name ?? targetDatasetId,
210
+ instructions: state.instructions,
211
+ contextId: state.contextId ?? "",
41
212
  rows,
213
+ schema: state.outputSchema,
214
+ first: state.first,
215
+ });
216
+ return targetDatasetId;
217
+ }
218
+ async function writePreparedFileResourceToSandbox(params) {
219
+ const file = await readInstantFileStep({ runtime: params.runtime, fileId: params.fileId });
220
+ const contentDispositionName = parseContentDispositionFileName(file.contentDisposition);
221
+ const fileName = sanitizeResourceFileName(params.filename ?? contentDispositionName, `${params.fileId}.bin`);
222
+ const resourcePath = `${getDatasetResourcesDir(params.datasetId)}/${fileName}`;
223
+ await runDatasetSandboxCommandStep({
224
+ runtime: params.runtime,
225
+ sandboxId: params.sandboxId,
226
+ cmd: "mkdir",
227
+ args: ["-p", ...getDatasetStandardDirs(params.datasetId)],
228
+ });
229
+ await writeDatasetSandboxFilesStep({
230
+ runtime: params.runtime,
231
+ sandboxId: params.sandboxId,
232
+ files: [{ path: resourcePath, contentBase64: file.contentBase64 }],
233
+ });
234
+ return { fileName, resourcePath };
235
+ }
236
+ function resolveDatasetSandboxId(state, _targetDatasetId) {
237
+ const sandboxId = String(state.sandboxId ?? "").trim();
238
+ if (sandboxId)
239
+ return sandboxId;
240
+ throw new Error("dataset_sandbox_required");
241
+ }
242
+ export async function resolveDatasetAgentDurable(requestedDurable) {
243
+ if (!requestedDurable)
244
+ return false;
245
+ try {
246
+ const { getWorkflowMetadata } = await import("workflow");
247
+ const workflowRunId = getWorkflowMetadata?.()?.workflowRunId;
248
+ if (workflowRunId)
249
+ return false;
250
+ }
251
+ catch {
252
+ // Outside Workflow runtime there is no active metadata, so honor the caller.
253
+ }
254
+ return true;
255
+ }
256
+ export async function initializeDatasetStep(params) {
257
+ "use step";
258
+ await createOrUpdateDatasetMetadata(params.runtime, {
259
+ datasetId: params.datasetId,
260
+ sandboxId: params.sandboxId,
261
+ title: params.title ?? params.datasetId,
262
+ instructions: params.instructions,
263
+ contextId: params.contextId,
42
264
  schema: params.schema,
43
- inferSchema: !params.schema,
44
- first: params.first,
265
+ status: "building",
266
+ });
267
+ return {
268
+ datasetId: params.datasetId,
269
+ sandboxId: params.sandboxId,
270
+ };
271
+ }
272
+ export async function prepareDatasetResourcesStep(params) {
273
+ "use step";
274
+ if (params.kind === "file") {
275
+ const fileId = params.resource.kind === "file"
276
+ ? params.resource.fileId
277
+ : await uploadInlineTextResource(params.runtime, params.datasetId, params.resource);
278
+ return {
279
+ kind: "file",
280
+ datasetId: params.datasetId,
281
+ sandboxId: params.sandboxId,
282
+ fileId,
283
+ sandboxState: { initialized: false, filePath: "" },
284
+ filePreview: undefined,
285
+ schema: params.schema ?? null,
286
+ filename: params.resource.kind === "file" ? params.resource.filename : params.resource.name,
287
+ mediaType: params.resource.kind === "file" ? params.resource.mediaType : params.resource.mimeType,
288
+ };
289
+ }
290
+ return {
291
+ kind: "transform",
292
+ datasetId: params.datasetId,
293
+ sandboxId: params.sandboxId,
294
+ inputDatasetIds: params.inputDatasetIds,
295
+ outputSchema: params.outputSchema,
296
+ sandboxState: { initialized: false, inputPaths: [] },
297
+ inputPreviews: undefined,
298
+ };
299
+ }
300
+ export async function initializeDatasetContextStep(params) {
301
+ "use step";
302
+ if (params.prepared.kind === "file") {
303
+ return {
304
+ ...params.prepared,
305
+ instructions: params.instructions ?? buildFileDefaultInstructions(params.outputSchema),
306
+ prompt: "generate a dataset for this file",
307
+ };
308
+ }
309
+ return {
310
+ ...params.prepared,
311
+ instructions: params.instructions,
312
+ prompt: params.prepared.inputDatasetIds.length === 1
313
+ ? "Transform the input dataset into a new dataset matching the provided output schema"
314
+ : `Transform ${params.prepared.inputDatasetIds.length} input datasets into a new dataset matching the provided output schema`,
315
+ };
316
+ }
317
+ export async function completeDatasetStep(params) {
318
+ "use step";
319
+ let datasetResult = await datasetGetByIdStep({
320
+ runtime: params.runtime,
321
+ datasetId: params.datasetId,
322
+ });
323
+ if (!datasetResult.ok)
324
+ throw new Error(datasetResult.error);
325
+ if (!params.schema && !datasetResult.data?.schema) {
326
+ await datasetInferAndUpdateSchemaStep({
327
+ runtime: params.runtime,
328
+ datasetId: params.datasetId,
329
+ title: `${params.datasetId}Row`,
330
+ description: "One dataset row",
331
+ });
332
+ datasetResult = await datasetGetByIdStep({
333
+ runtime: params.runtime,
334
+ datasetId: params.datasetId,
335
+ });
336
+ if (!datasetResult.ok)
337
+ throw new Error(datasetResult.error);
338
+ }
339
+ const previewResult = await datasetPreviewRowsStep({
340
+ runtime: params.runtime,
341
+ datasetId: params.datasetId,
342
+ limit: 20,
343
+ });
344
+ if (!params.first) {
345
+ return {
346
+ datasetId: params.datasetId,
347
+ dataset: datasetResult.data,
348
+ previewRows: previewResult.rows,
349
+ firstRow: undefined,
350
+ };
351
+ }
352
+ const firstResult = await datasetReadOneStep({
353
+ runtime: params.runtime,
354
+ datasetId: params.datasetId,
45
355
  });
356
+ return {
357
+ datasetId: params.datasetId,
358
+ dataset: datasetResult.data,
359
+ previewRows: previewResult.rows,
360
+ firstRow: firstResult.row,
361
+ };
46
362
  }
47
- async function materializeSingleFileLikeSource(state, source, targetDatasetId) {
363
+ export async function materializeSingleFileLikeResource(state, resource, targetDatasetId) {
364
+ if (resource.kind === "file" && !state.outputSchema) {
365
+ const materializedPdf = await tryMaterializeRawPdfFileResource(state, resource, targetDatasetId);
366
+ if (materializedPdf)
367
+ return materializedPdf;
368
+ }
369
+ const sandboxId = resolveDatasetSandboxId(state, targetDatasetId);
48
370
  if (!state.reactor) {
49
371
  throw new Error("dataset_reactor_required");
50
372
  }
51
- if (!state.sandboxId) {
52
- throw new Error("dataset_sandbox_required");
53
- }
54
- const fileId = source.kind === "file"
55
- ? source.fileId
56
- : await (0, persistence_1.uploadInlineTextSource)(state.runtime, targetDatasetId, source);
57
- await (0, persistence_1.createOrUpdateDatasetMetadata)(state.runtime, {
373
+ await initializeDatasetStep({
374
+ runtime: state.runtime,
58
375
  datasetId: targetDatasetId,
59
- sandboxId: state.sandboxId,
376
+ sandboxId,
60
377
  title: state.title ?? targetDatasetId,
61
378
  instructions: state.instructions,
62
- sources: [
63
- source.kind === "file"
64
- ? { kind: "file", fileId: source.fileId, description: source.description }
65
- : {
66
- kind: "text",
67
- mimeType: source.mimeType,
68
- name: source.name,
69
- description: source.description,
70
- },
71
- ],
72
- sourceKinds: [source.kind],
379
+ contextId: state.contextId ?? "",
73
380
  schema: state.outputSchema,
74
- status: "building",
75
381
  });
76
- const parseStory = (0, file_dataset_agent_1.createFileParseStory)(fileId, {
382
+ const prepared = await prepareDatasetResourcesStep({
383
+ kind: "file",
384
+ runtime: state.runtime,
77
385
  datasetId: targetDatasetId,
78
- instructions: state.instructions ?? (0, instructions_1.buildFileDefaultInstructions)(state.outputSchema),
79
- reactor: state.reactor,
80
- sandboxId: state.sandboxId,
386
+ sandboxId,
387
+ resource,
388
+ schema: state.outputSchema,
81
389
  });
82
- await parseStory.parse(state.env);
83
- if (!state.outputSchema) {
84
- const db = await (0, persistence_1.getDatasetDb)(state.runtime);
85
- const service = new service_1.DatasetService(db);
86
- const readResult = await service.readRows({ datasetId: targetDatasetId, cursor: 0, limit: 1000 });
87
- if (!readResult.ok) {
88
- throw new Error(readResult.error);
89
- }
90
- const inferred = (0, schemaInference_1.inferDatasetSchema)(readResult.data.rows, `${targetDatasetId}Row`, "One dataset row");
91
- const updateResult = await service.updateDatasetSchema({
92
- datasetId: targetDatasetId,
93
- schema: inferred,
94
- status: "completed",
95
- });
96
- if (!updateResult.ok) {
97
- throw new Error(updateResult.error);
98
- }
390
+ if (prepared.kind !== "file") {
391
+ throw new Error("dataset_context_kind_mismatch:file");
99
392
  }
100
- if (state.first) {
101
- const db = await (0, persistence_1.getDatasetDb)(state.runtime);
102
- const service = new service_1.DatasetService(db);
103
- const firstResult = await service.readOne(targetDatasetId);
104
- if (!firstResult.ok) {
105
- throw new Error(firstResult.error);
106
- }
393
+ const preparedFile = await writePreparedFileResourceToSandbox({
394
+ runtime: state.runtime,
395
+ sandboxId,
396
+ datasetId: targetDatasetId,
397
+ fileId: prepared.fileId,
398
+ filename: prepared.filename,
399
+ });
400
+ const context = await initializeDatasetContextStep({
401
+ prepared: {
402
+ ...prepared,
403
+ filename: prepared.filename ?? preparedFile.fileName,
404
+ },
405
+ instructions: state.instructions,
406
+ outputSchema: state.outputSchema,
407
+ });
408
+ if (context.kind !== "file") {
409
+ throw new Error("dataset_context_kind_mismatch:file");
107
410
  }
411
+ const parseContext = createFileParseContext(context.fileId, {
412
+ datasetId: context.datasetId,
413
+ instructions: context.instructions,
414
+ reactor: state.reactor,
415
+ sandboxId: context.sandboxId,
416
+ sandboxState: context.sandboxState,
417
+ filePreview: context.filePreview,
418
+ schema: context.schema,
419
+ filename: context.filename,
420
+ mediaType: context.mediaType,
421
+ });
422
+ await parseContext.parse(state.runtime, {
423
+ durable: await resolveDatasetAgentDurable(state.durable),
424
+ prompt: context.prompt,
425
+ initialContent: {
426
+ datasetId: context.datasetId,
427
+ fileId: context.fileId,
428
+ instructions: context.instructions ?? "",
429
+ sandboxId: context.sandboxId,
430
+ sandboxState: context.sandboxState,
431
+ filePreview: context.filePreview,
432
+ schema: context.schema,
433
+ filename: context.filename,
434
+ mediaType: context.mediaType,
435
+ },
436
+ });
108
437
  return targetDatasetId;
109
438
  }
110
- async function normalizeSourceToDatasetId(state, source, targetDatasetId, sourceIndex) {
111
- if (source.kind === "dataset") {
112
- return source.datasetId;
439
+ async function normalizeResourceToDatasetId(state, resource, targetDatasetId, resourceIndex) {
440
+ if (resource.kind === "dataset") {
441
+ return resource.datasetId;
113
442
  }
114
- const intermediateDatasetId = makeIntermediateDatasetId(targetDatasetId, source.kind, sourceIndex);
115
- if (source.kind === "query") {
116
- await materializeQuerySource(state.runtime, source, {
443
+ const intermediateDatasetId = makeIntermediateDatasetId(targetDatasetId, resource.kind, resourceIndex);
444
+ if (resource.kind === "query") {
445
+ await materializeQueryResource(state.runtime, resource, {
117
446
  datasetId: intermediateDatasetId,
118
447
  sandboxId: state.sandboxId,
119
- title: source.title,
448
+ title: resource.title,
120
449
  first: false,
450
+ contextId: state.contextId ?? "",
121
451
  });
122
452
  return intermediateDatasetId;
123
453
  }
124
- await materializeSingleFileLikeSource({
454
+ if (resource.kind === "text") {
455
+ await materializeRawTextResource({
456
+ ...state,
457
+ outputSchema: undefined,
458
+ first: false,
459
+ instructions: buildRawResourceInstructions(resource.kind),
460
+ title: resource.name ?? state.title,
461
+ }, resource, intermediateDatasetId);
462
+ return intermediateDatasetId;
463
+ }
464
+ if (resource.kind === "context") {
465
+ throw new Error("dataset_context_resource_must_be_resolved_before_materialization");
466
+ }
467
+ await materializeSingleFileLikeResource({
125
468
  ...state,
126
469
  outputSchema: undefined,
127
470
  first: false,
128
- instructions: (0, instructions_1.buildRawSourceInstructions)(source.kind),
129
- }, source, intermediateDatasetId);
471
+ instructions: buildRawResourceInstructions(resource.kind),
472
+ }, resource, intermediateDatasetId);
130
473
  return intermediateDatasetId;
131
474
  }
132
- async function materializeDerivedDataset(state, targetDatasetId) {
475
+ export async function materializeDerivedDataset(state, targetDatasetId) {
133
476
  if (!state.reactor) {
134
477
  throw new Error("dataset_reactor_required");
135
478
  }
136
- if (!state.sandboxId) {
137
- throw new Error("dataset_sandbox_required");
138
- }
139
- const normalizedSources = [];
140
- for (let index = 0; index < state.sources.length; index++) {
141
- normalizedSources.push(await normalizeSourceToDatasetId(state, state.sources[index], targetDatasetId, index));
142
- }
143
- const transformSchema = state.outputSchema ??
479
+ const sandboxId = resolveDatasetSandboxId(state, targetDatasetId);
480
+ const stateWithSandbox = { ...state, sandboxId };
481
+ const inputDatasetIds = (stateWithSandbox.contextResources ?? []).map((resource, index) => String(resource.datasetId ?? resource.key ?? `resource_${index + 1}`));
482
+ const transformSchema = stateWithSandbox.outputSchema ??
144
483
  {
145
484
  title: "DatasetRow",
146
485
  description: "One dataset row",
@@ -150,56 +489,60 @@ async function materializeDerivedDataset(state, targetDatasetId) {
150
489
  properties: {},
151
490
  },
152
491
  };
153
- await (0, persistence_1.createOrUpdateDatasetMetadata)(state.runtime, {
492
+ await initializeDatasetStep({
493
+ runtime: stateWithSandbox.runtime,
154
494
  datasetId: targetDatasetId,
155
- sandboxId: state.sandboxId,
156
- title: state.title ?? targetDatasetId,
157
- instructions: state.instructions,
158
- sources: state.sources.map((source) => source.kind === "query"
159
- ? {
160
- kind: "query",
161
- query: source.query,
162
- title: source.title,
163
- explanation: source.explanation,
164
- ...(0, sourceRows_1.getDomainDescriptor)(source.domain),
165
- }
166
- : source),
167
- sourceKinds: state.sources.map((source) => source.kind),
495
+ sandboxId,
496
+ title: stateWithSandbox.title ?? targetDatasetId,
497
+ instructions: stateWithSandbox.instructions,
498
+ contextId: stateWithSandbox.contextId ?? "",
168
499
  schema: transformSchema,
169
- status: "building",
170
500
  });
171
- const transformStory = (0, transform_dataset_agent_1.createTransformDatasetStory)({
172
- sourceDatasetIds: normalizedSources,
173
- outputSchema: transformSchema,
174
- instructions: (0, instructions_1.buildTransformInstructions)(normalizedSources.length, state.instructions, state.outputSchema),
501
+ const prepared = {
502
+ kind: "transform",
175
503
  datasetId: targetDatasetId,
176
- reactor: state.reactor,
177
- sandboxId: state.sandboxId,
504
+ sandboxId,
505
+ inputDatasetIds,
506
+ outputSchema: transformSchema,
507
+ sandboxState: { initialized: false, inputPaths: [] },
508
+ inputPreviews: undefined,
509
+ };
510
+ const context = await initializeDatasetContextStep({
511
+ prepared,
512
+ instructions: buildTransformInstructions(inputDatasetIds.length, stateWithSandbox.instructions, stateWithSandbox.outputSchema),
513
+ outputSchema: transformSchema,
178
514
  });
179
- await transformStory.transform(state.env);
180
- const db = await (0, persistence_1.getDatasetDb)(state.runtime);
181
- const service = new service_1.DatasetService(db);
182
- if (!state.outputSchema) {
183
- const readResult = await service.readRows({ datasetId: targetDatasetId, cursor: 0, limit: 1000 });
184
- if (!readResult.ok) {
185
- throw new Error(readResult.error);
186
- }
187
- const inferred = (0, schemaInference_1.inferDatasetSchema)(readResult.data.rows, `${targetDatasetId}Row`, "One dataset row");
188
- const updateResult = await service.updateDatasetSchema({
189
- datasetId: targetDatasetId,
190
- schema: inferred,
191
- status: "completed",
192
- });
193
- if (!updateResult.ok) {
194
- throw new Error(updateResult.error);
195
- }
196
- }
197
- if (state.first) {
198
- const firstResult = await service.readOne(targetDatasetId);
199
- if (!firstResult.ok) {
200
- throw new Error(firstResult.error);
201
- }
515
+ if (context.kind !== "transform") {
516
+ throw new Error("dataset_context_kind_mismatch:transform");
202
517
  }
518
+ const transformContext = createTransformDatasetContext({
519
+ inputDatasetIds: context.inputDatasetIds,
520
+ outputSchema: context.outputSchema,
521
+ instructions: context.instructions,
522
+ datasetId: context.datasetId,
523
+ reactor: stateWithSandbox.reactor,
524
+ sandboxId: context.sandboxId,
525
+ sandboxState: context.sandboxState,
526
+ inputPreviews: context.inputPreviews,
527
+ contextResources: stateWithSandbox.contextResources ?? [],
528
+ });
529
+ await transformContext.transform(stateWithSandbox.runtime, {
530
+ durable: await resolveDatasetAgentDurable(stateWithSandbox.durable),
531
+ prompt: context.prompt,
532
+ initialContent: {
533
+ datasetId: context.datasetId,
534
+ inputDatasetIds: context.inputDatasetIds,
535
+ outputSchema: context.outputSchema,
536
+ instructions: context.instructions,
537
+ sandboxId: context.sandboxId,
538
+ sandboxState: context.sandboxState,
539
+ inputPreviews: context.inputPreviews,
540
+ contextResources: stateWithSandbox.contextResources ?? [],
541
+ },
542
+ });
203
543
  return targetDatasetId;
204
544
  }
205
- //# sourceMappingURL=materialize.js.map
545
+ registerDatasetAgentMaterializers({
546
+ materializeSingleFileLikeResource,
547
+ materializeDerivedDataset,
548
+ });