@ekairos/structure 1.21.74-beta.0 → 1.21.77-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -63,6 +63,7 @@ export async function structurePatchContextContentStep(params) {
63
63
  }
64
64
  export async function structureUploadRowsOutputJsonlStep(params) {
65
65
  "use step";
66
+ const startedAt = Date.now();
66
67
  try {
67
68
  const { resolveStoryRuntime } = await import("@ekairos/story/runtime");
68
69
  const runtime = await resolveStoryRuntime(params.env);
@@ -76,6 +77,7 @@ export async function structureUploadRowsOutputJsonlStep(params) {
76
77
  const fileId = uploadResult?.data?.id;
77
78
  if (!fileId)
78
79
  return { ok: false, error: "Failed to upload file to storage" };
80
+ console.log(`[structure:upload-jsonl] structureId=${params.structureId} bytes=${fileBuffer.byteLength} elapsedMs=${Date.now() - startedAt}`);
79
81
  return { ok: true, data: { fileId, storagePath } };
80
82
  }
81
83
  catch (error) {
@@ -85,6 +87,7 @@ export async function structureUploadRowsOutputJsonlStep(params) {
85
87
  }
86
88
  export async function structureLinkRowsOutputFileToContextStep(params) {
87
89
  "use step";
90
+ const startedAt = Date.now();
88
91
  try {
89
92
  const { resolveStoryRuntime } = await import("@ekairos/story/runtime");
90
93
  const runtime = await resolveStoryRuntime(params.env);
@@ -95,6 +98,7 @@ export async function structureLinkRowsOutputFileToContextStep(params) {
95
98
  if (!ctxId)
96
99
  return { ok: false, error: "Context not found" };
97
100
  await db.transact([db.tx.context_contexts[ctxId].link({ structure_output_file: params.fileId })]);
101
+ console.log(`[structure:link-jsonl] contextKey=${params.contextKey} fileId=${params.fileId} elapsedMs=${Date.now() - startedAt}`);
98
102
  return { ok: true };
99
103
  }
100
104
  catch (error) {
@@ -145,6 +149,7 @@ export async function structureGetContextWithRowsOutputFileStep(params) {
145
149
  }
146
150
  export async function structureReadRowsOutputJsonlStep(params) {
147
151
  "use step";
152
+ const startedAt = Date.now();
148
153
  try {
149
154
  const contextKey = `structure:${params.structureId}`;
150
155
  const { resolveStoryRuntime } = await import("@ekairos/story/runtime");
@@ -164,6 +169,7 @@ export async function structureReadRowsOutputJsonlStep(params) {
164
169
  if (!url)
165
170
  return { ok: false, error: "Rows output file not found" };
166
171
  const fileBuffer = await fetchArrayBufferWithRetry(url, { attempts: 4, timeoutMs: 90000 });
172
+ console.log(`[structure:read-jsonl] structureId=${params.structureId} bytes=${fileBuffer.byteLength} elapsedMs=${Date.now() - startedAt}`);
167
173
  return { ok: true, data: { contentBase64: Buffer.from(fileBuffer).toString("base64") } };
168
174
  }
169
175
  catch (error) {
@@ -1,5 +1,11 @@
1
1
  export declare const DATASET_OUTPUT_FILE_NAME = "output.jsonl";
2
2
  export declare const DATASET_OUTPUT_SCHEMA_FILE_NAME = "output_schema.json";
3
+ export declare const DEFAULT_DAYTONA_VOLUME_MOUNT_PATH = "/home/daytona/.ekairos";
4
+ export declare const DEFAULT_DAYTONA_VOLUME_NAME = "ekairos-structure";
5
+ export declare const DEFAULT_DATASET_WORKDIR_BASE = "/home/daytona/.ekairos/datasets";
6
+ export declare function getDaytonaVolumeMountPath(): string;
7
+ export declare function getDaytonaVolumeName(): string;
8
+ export declare function getDatasetWorkdirBase(): string;
3
9
  export declare function getDatasetWorkstation(datasetId: string): string;
4
10
  export declare function getDatasetOutputPath(datasetId: string): string;
5
11
  export declare function getDatasetOutputSchemaPath(datasetId: string): string;
@@ -1,7 +1,31 @@
1
1
  export const DATASET_OUTPUT_FILE_NAME = "output.jsonl";
2
2
  export const DATASET_OUTPUT_SCHEMA_FILE_NAME = "output_schema.json";
3
+ export const DEFAULT_DAYTONA_VOLUME_MOUNT_PATH = "/home/daytona/.ekairos";
4
+ export const DEFAULT_DAYTONA_VOLUME_NAME = "ekairos-structure";
5
+ export const DEFAULT_DATASET_WORKDIR_BASE = `${DEFAULT_DAYTONA_VOLUME_MOUNT_PATH}/datasets`;
6
/**
 * Removes every trailing "/" from a path-like string.
 *
 * Callers append "/<segment>" to the result, so leaving a stray slash behind
 * (e.g. "a//" -> "a/") would produce paths with a doubled separator.
 * A value made only of slashes (including "/") collapses to "".
 *
 * @param {string} value Path-like string to normalise.
 * @returns {string} The value without trailing slashes.
 */
function trimTrailingSlash(value) {
    let end = value.length;
    // 47 is the char code of "/". A manual scan avoids regex backtracking.
    while (end > 0 && value.charCodeAt(end - 1) === 47) {
        end -= 1;
    }
    return value.slice(0, end);
}
9
/**
 * Resolves the path at which the Daytona volume is mounted.
 *
 * Reads STRUCTURE_DAYTONA_VOLUME_MOUNT_PATH; when it is set to a non-blank
 * value, that value is returned after passing through trimTrailingSlash.
 * Otherwise DEFAULT_DAYTONA_VOLUME_MOUNT_PATH is returned.
 *
 * @returns {string} The mount path to use.
 */
export function getDaytonaVolumeMountPath() {
    const configured = String(process.env.STRUCTURE_DAYTONA_VOLUME_MOUNT_PATH ?? "").trim();
    return configured
        ? trimTrailingSlash(configured)
        : DEFAULT_DAYTONA_VOLUME_MOUNT_PATH;
}
15
/**
 * Resolves the name of the Daytona volume.
 *
 * Reads STRUCTURE_DAYTONA_VOLUME_NAME (trimmed); a blank or unset variable
 * falls back to DEFAULT_DAYTONA_VOLUME_NAME.
 *
 * @returns {string} The volume name to use.
 */
export function getDaytonaVolumeName() {
    const configured = String(process.env.STRUCTURE_DAYTONA_VOLUME_NAME ?? "").trim();
    return configured ? configured : DEFAULT_DAYTONA_VOLUME_NAME;
}
21
/**
 * Resolves the directory under which per-dataset working directories live.
 *
 * STRUCTURE_SANDBOX_WORKDIR_BASE wins when set to a non-blank value (passed
 * through trimTrailingSlash); otherwise the base is "<volume mount path>/datasets".
 *
 * @returns {string} Base directory for dataset workstations.
 */
export function getDatasetWorkdirBase() {
    const configured = String(process.env.STRUCTURE_SANDBOX_WORKDIR_BASE ?? "").trim();
    return configured
        ? trimTrailingSlash(configured)
        : `${getDaytonaVolumeMountPath()}/datasets`;
}
3
27
  export function getDatasetWorkstation(datasetId) {
4
- return `/vercel/sandbox/datasets/${datasetId}`;
28
+ return `${getDatasetWorkdirBase()}/${datasetId}`;
5
29
  }
6
30
  export function getDatasetOutputPath(datasetId) {
7
31
  return `${getDatasetWorkstation(datasetId)}/${DATASET_OUTPUT_FILE_NAME}`;
package/dist/prompts.d.ts CHANGED
@@ -13,5 +13,12 @@ export type StructurePromptContext = {
13
13
  sources: PreparedSource[];
14
14
  workstation: string;
15
15
  outputPath: string;
16
+ sandboxProvider?: string;
17
+ sandboxRuntime?: string;
18
+ sandboxEphemeral?: boolean;
19
+ sandboxVolumeName?: string;
20
+ sandboxVolumeMountPath?: string;
21
+ sandboxSnapshot?: string;
22
+ sandboxImage?: string;
16
23
  };
17
24
  export declare function buildStructurePrompt(ctx: StructurePromptContext): string;
package/dist/prompts.js CHANGED
@@ -24,6 +24,19 @@ export function buildStructurePrompt(ctx) {
24
24
  }
25
25
  lines.push("");
26
26
  lines.push("## SANDBOX");
27
+ if (ctx.sandboxProvider)
28
+ lines.push(`Provider: ${ctx.sandboxProvider}`);
29
+ if (ctx.sandboxRuntime)
30
+ lines.push(`Runtime: ${ctx.sandboxRuntime}`);
31
+ if (ctx.sandboxEphemeral !== undefined)
32
+ lines.push(`Ephemeral: ${ctx.sandboxEphemeral ? "true" : "false"}`);
33
+ if (ctx.sandboxVolumeName || ctx.sandboxVolumeMountPath) {
34
+ lines.push(`Volume: ${ctx.sandboxVolumeName ?? "unknown"} -> ${ctx.sandboxVolumeMountPath ?? "unknown"}`);
35
+ }
36
+ if (ctx.sandboxSnapshot)
37
+ lines.push(`Snapshot: ${ctx.sandboxSnapshot}`);
38
+ if (ctx.sandboxImage)
39
+ lines.push(`Image: ${ctx.sandboxImage}`);
27
40
  lines.push(`Workstation: ${ctx.workstation}`);
28
41
  lines.push(`OutputPath: ${ctx.outputPath}`);
29
42
  lines.push("");
@@ -28,6 +28,15 @@ export async function structureDownloadRowsOutputToSandboxStep(params) {
28
28
  cmd: "mkdir",
29
29
  args: ["-p", workstation],
30
30
  });
31
+ const exists = await runDatasetSandboxCommandStep({
32
+ env: params.env,
33
+ sandboxId,
34
+ cmd: "test",
35
+ args: ["-f", localPath],
36
+ });
37
+ if (exists.exitCode === 0) {
38
+ return { sandboxId, localPath };
39
+ }
31
40
  const storyRuntime = await getStoryRuntime(params.env);
32
41
  const db = storyRuntime.db;
33
42
  const contextKey = `structure:${params.structureId}`;
@@ -1,13 +1,8 @@
1
+ import type { SandboxConfig } from "@ekairos/sandbox";
1
2
  export type DatasetSandboxId = string;
2
- export type CreateDatasetSandboxParams = {
3
- runtime?: string;
4
- timeoutMs?: number;
5
- ports?: number[];
6
- resources?: {
7
- vcpus?: number;
8
- };
9
- purpose?: string;
10
- params?: Record<string, any>;
3
+ export type CreateDatasetSandboxParams = Omit<SandboxConfig, "provider" | "daytona"> & {
4
+ provider?: SandboxConfig["provider"];
5
+ daytona?: SandboxConfig["daytona"];
11
6
  };
12
7
  export type DatasetSandboxRunCommandResult = {
13
8
  exitCode: number;
@@ -1,16 +1,186 @@
1
+ import { getDatasetWorkdirBase, getDaytonaVolumeMountPath, getDaytonaVolumeName } from "../datasetFiles";
2
/**
 * Parses an optional numeric setting (typically an environment variable).
 *
 * `Number("")`, `Number("   ")` and `Number(null)` all evaluate to 0, so
 * missing and blank inputs are rejected up front; otherwise an empty variable
 * would be read as an explicit 0 rather than "not configured".
 *
 * @param {unknown} value Raw value to parse.
 * @returns {number | undefined} The finite number, or undefined when the value
 *   is missing, blank, or not a finite number.
 */
function parseOptionalNumber(value) {
    if (value === undefined || value === null)
        return undefined;
    if (typeof value === "string" && value.trim() === "")
        return undefined;
    const parsed = Number(value);
    return Number.isFinite(parsed) ? parsed : undefined;
}
8
/**
 * Parses an optional boolean flag (typically an environment variable).
 *
 * The value is stringified, trimmed and lower-cased. "1", "true", "yes", "y"
 * and "on" map to true; "0", "false", "no", "n" and "off" map to false.
 *
 * @param {unknown} value Raw value to parse.
 * @returns {boolean | undefined} The parsed flag, or undefined when the value
 *   is missing, blank, or not one of the recognised tokens.
 */
function parseOptionalBoolean(value) {
    const token = String(value ?? "").trim().toLowerCase();
    switch (token) {
        case "1":
        case "true":
        case "yes":
        case "y":
        case "on":
            return true;
        case "0":
        case "false":
        case "no":
        case "n":
        case "off":
            return false;
        default:
            return undefined;
    }
}
18
/**
 * Tells whether verbose Daytona diagnostics should be logged.
 *
 * Controlled by STRUCTURE_DAYTONA_LOG_RESOURCES, parsed with
 * parseOptionalBoolean; logging is off when the variable is unset or
 * not recognised.
 *
 * @returns {boolean} True when diagnostics are enabled.
 */
function shouldLogDaytonaResources() {
    return parseOptionalBoolean(process.env.STRUCTURE_DAYTONA_LOG_RESOURCES) ?? false;
}
22
/**
 * Builds a Daytona client configuration from environment variables.
 *
 * The API URL comes from DAYTONA_API_URL, falling back to DAYTONA_SERVER_URL.
 * Credentials are either DAYTONA_API_KEY, or DAYTONA_JWT_TOKEN together with
 * DAYTONA_ORGANIZATION_ID. DAYTONA_TARGET is optional.
 *
 * @returns {{apiUrl: string, apiKey?: string, jwtToken?: string,
 *   organizationId?: string, target?: string} | null} The configuration, or
 *   null when the URL or a usable credential is missing. Blank optional
 *   settings are reported as undefined.
 */
function getDaytonaConfigFromEnv() {
    const readEnv = (name) => String(process.env[name] ?? "").trim();
    const orUndefined = (text) => text || undefined;
    const apiUrl = readEnv("DAYTONA_API_URL") || readEnv("DAYTONA_SERVER_URL");
    if (!apiUrl) {
        return null;
    }
    const apiKey = readEnv("DAYTONA_API_KEY");
    const jwtToken = readEnv("DAYTONA_JWT_TOKEN");
    const organizationId = readEnv("DAYTONA_ORGANIZATION_ID");
    // JWT auth only counts when the organization id accompanies the token.
    const hasJwtAuth = Boolean(jwtToken && organizationId);
    if (!apiKey && !hasJwtAuth) {
        return null;
    }
    return {
        apiUrl,
        apiKey: orUndefined(apiKey),
        jwtToken: orUndefined(jwtToken),
        organizationId: orUndefined(organizationId),
        target: orUndefined(readEnv("DAYTONA_TARGET")),
    };
}
41
/**
 * Best-effort diagnostic dump of Daytona sandboxes and volumes.
 *
 * Does nothing unless shouldLogDaytonaResources() is true. Failures are
 * logged and never thrown, so callers can await this without guarding it.
 *
 * @param {string} label Tag embedded in every log line (e.g. "before_create").
 * @returns {Promise<void>}
 */
async function logDaytonaResources(label) {
    if (!shouldLogDaytonaResources()) {
        return;
    }
    const clientConfig = getDaytonaConfigFromEnv();
    if (!clientConfig) {
        console.log(`[daytona:${label}] missing Daytona env config`);
        return;
    }
    const describeError = (err) => (err instanceof Error ? err.message : String(err));
    try {
        // The SDK is loaded through a runtime-built import() call; presumably this
        // keeps bundlers from resolving the optional dependency statically.
        const sdkSpecifier = "@daytonaio/sdk";
        const dynamicImport = new Function("m", "return import(m)");
        const { Daytona } = (await dynamicImport(sdkSpecifier));
        const client = new Daytona(clientConfig);
        const listing = await client.list(undefined, 1, 50);
        const total = listing?.total ?? listing?.items?.length ?? 0;
        console.log(`[daytona:${label}] sandboxes total=${total} page=${listing?.page ?? 1}`);
        const sandboxes = Array.isArray(listing?.items) ? listing.items : [];
        // Only the first 25 sandboxes are printed.
        sandboxes.slice(0, 25).forEach((sb) => {
            console.log(`[daytona:${label}] sandbox id=${sb?.id} state=${sb?.state} disk=${sb?.disk} cpu=${sb?.cpu} memory=${sb?.memory} autoDelete=${sb?.autoDeleteInterval} autoStop=${sb?.autoStopInterval} snapshot=${sb?.snapshot ?? ""} createdAt=${sb?.createdAt ?? ""}`);
        });
        // Volume listing has its own guard so a failure here is reported separately.
        try {
            const volumes = await client.volume.list();
            const names = volumes.map((volume) => volume?.name).filter(Boolean);
            console.log(`[daytona:${label}] volumes count=${volumes.length} names=${names.slice(0, 20).join(",")}`);
        }
        catch (volumeError) {
            console.log(`[daytona:${label}] volumes list error: ${describeError(volumeError)}`);
        }
    }
    catch (listError) {
        console.log(`[daytona:${label}] list error: ${describeError(listError)}`);
    }
}
74
/**
 * Collects the Daytona sandbox defaults configured through STRUCTURE_DAYTONA_*
 * environment variables.
 *
 * - snapshot / image: taken verbatim from STRUCTURE_DAYTONA_SNAPSHOT and
 *   STRUCTURE_DAYTONA_IMAGE; blank values become undefined.
 * - STRUCTURE_DAYTONA_DECLARATIVE_IMAGE: when no explicit image is set and the
 *   flag is enabled, image is the literal "declarative". Explicit false-like
 *   values ("0", "false", "no", "n", "off") disable it; any other non-blank
 *   value keeps enabling it.
 * - ephemeral: STRUCTURE_DAYTONA_EPHEMERAL, defaulting to true.
 * - autoStop/autoArchive/autoDelete intervals: parsed with parseOptionalNumber.
 * - volumes: a single mount built from getDaytonaVolumeName() and
 *   getDaytonaVolumeMountPath(), or an empty list if either is empty.
 *
 * @returns {object} Default `daytona` section for a sandbox configuration.
 */
function getStructureDaytonaDefaults() {
    const readEnv = (name) => String(process.env[name] ?? "").trim();
    const snapshot = readEnv("STRUCTURE_DAYTONA_SNAPSHOT");
    const image = readEnv("STRUCTURE_DAYTONA_IMAGE");
    const declarativeRaw = readEnv("STRUCTURE_DAYTONA_DECLARATIVE_IMAGE");
    // Honour explicit booleans so "false"/"0" no longer turn the flag on.
    const declarative = parseOptionalBoolean(declarativeRaw) ?? Boolean(declarativeRaw);
    const ephemeral = parseOptionalBoolean(process.env.STRUCTURE_DAYTONA_EPHEMERAL) ?? true;
    const autoStopIntervalMin = parseOptionalNumber(process.env.STRUCTURE_DAYTONA_AUTO_STOP_MIN);
    const autoArchiveIntervalMin = parseOptionalNumber(process.env.STRUCTURE_DAYTONA_AUTO_ARCHIVE_MIN);
    const autoDeleteIntervalMin = parseOptionalNumber(process.env.STRUCTURE_DAYTONA_AUTO_DELETE_MIN);
    const volumeName = getDaytonaVolumeName();
    const mountPath = getDaytonaVolumeMountPath();
    const volumes = volumeName && mountPath ? [{ volumeName, mountPath }] : [];
    return {
        snapshot: snapshot || undefined,
        image: image || (declarative ? "declarative" : undefined),
        ephemeral,
        autoStopIntervalMin,
        autoArchiveIntervalMin,
        autoDeleteIntervalMin,
        volumes,
    };
}
1
103
  export async function createDatasetSandboxStep(params) {
2
104
  "use step";
105
+ const startedAt = Date.now();
106
+ const { env, ...configInput } = params;
3
107
  const { resolveStoryRuntime } = await import("@ekairos/story/runtime");
4
- const db = (await resolveStoryRuntime(params.env)).db;
108
+ const db = (await resolveStoryRuntime(env)).db;
5
109
  const { SandboxService } = (await import("@ekairos/sandbox"));
6
110
  const service = new SandboxService(db);
7
- const created = await service.createSandbox(params);
8
- if (!created.ok)
111
+ const daytonaDefaults = getStructureDaytonaDefaults();
112
+ const explicitVolumes = configInput.daytona?.volumes;
113
+ const mergedDaytona = {
114
+ ...daytonaDefaults,
115
+ ...(configInput.daytona ?? {}),
116
+ volumes: Array.isArray(explicitVolumes) ? explicitVolumes : daytonaDefaults.volumes,
117
+ };
118
+ const vcpusOverride = parseOptionalNumber(process.env.STRUCTURE_DAYTONA_VCPUS);
119
+ const mergedResources = configInput.resources ?? (vcpusOverride ? { vcpus: vcpusOverride } : undefined);
120
+ const config = {
121
+ ...configInput,
122
+ provider: "daytona",
123
+ resources: mergedResources,
124
+ daytona: mergedDaytona,
125
+ };
126
+ if (shouldLogDaytonaResources()) {
127
+ console.log(`[daytona:create] config runtime=${config.runtime ?? ""} purpose=${config.purpose ?? ""} params=${JSON.stringify(config.params ?? {})} snapshot=${config.daytona?.snapshot ?? ""} image=${config.daytona?.image ?? ""} ephemeral=${config.daytona?.ephemeral} autoStop=${config.daytona?.autoStopIntervalMin ?? ""} autoDelete=${config.daytona?.autoDeleteIntervalMin ?? ""} volumes=${JSON.stringify(config.daytona?.volumes ?? [])}`);
128
+ console.log(`[daytona:create] ts=${new Date(startedAt).toISOString()} startMs=${startedAt}`);
129
+ }
130
+ await logDaytonaResources("before_create");
131
+ const created = await service.createSandbox(config);
132
+ if (!created.ok) {
133
+ await logDaytonaResources("create_failed");
9
134
  throw new Error(created.error);
135
+ }
136
+ await logDaytonaResources("after_create");
137
+ if (shouldLogDaytonaResources()) {
138
+ const elapsedMs = Date.now() - startedAt;
139
+ console.log(`[daytona:create] doneMs=${Date.now()} elapsedMs=${elapsedMs}`);
140
+ }
141
+ if (shouldLogDaytonaResources()) {
142
+ try {
143
+ const info = await service.reconnectToSandbox(created.data.sandboxId);
144
+ if (info.ok && !info.data.sandbox.sandboxId) {
145
+ const sb = info.data.sandbox;
146
+ console.log(`[daytona:after_create] sandbox id=${sb?.id} state=${sb?.state} disk=${sb?.disk} cpu=${sb?.cpu} memory=${sb?.memory} autoDelete=${sb?.autoDeleteInterval} autoStop=${sb?.autoStopInterval}`);
147
+ }
148
+ }
149
+ catch (e) {
150
+ console.log(`[daytona:after_create] reconnect error: ${e instanceof Error ? e.message : String(e)}`);
151
+ }
152
+ try {
153
+ const df = await service.runCommand(created.data.sandboxId, "df", ["-h"]);
154
+ if (df.ok) {
155
+ console.log(`[sandbox:${created.data.sandboxId}] df -h\n${df.data.output}`);
156
+ }
157
+ else {
158
+ console.log(`[sandbox:${created.data.sandboxId}] df error: ${df.error}`);
159
+ }
160
+ }
161
+ catch (e) {
162
+ console.log(`[sandbox:${created.data.sandboxId}] df error: ${e instanceof Error ? e.message : String(e)}`);
163
+ }
164
+ try {
165
+ const mountPath = getDaytonaVolumeMountPath();
166
+ const basePath = getDatasetWorkdirBase();
167
+ const du = await service.runCommand(created.data.sandboxId, "du", ["-sh", mountPath, basePath]);
168
+ if (du.ok) {
169
+ console.log(`[sandbox:${created.data.sandboxId}] du -sh\n${du.data.output}`);
170
+ }
171
+ else {
172
+ console.log(`[sandbox:${created.data.sandboxId}] du error: ${du.error}`);
173
+ }
174
+ }
175
+ catch (e) {
176
+ console.log(`[sandbox:${created.data.sandboxId}] du error: ${e instanceof Error ? e.message : String(e)}`);
177
+ }
178
+ }
10
179
  return { sandboxId: created.data.sandboxId };
11
180
  }
12
181
  export async function runDatasetSandboxCommandStep(params) {
13
182
  "use step";
183
+ const startedAt = Date.now();
14
184
  const { resolveStoryRuntime } = await import("@ekairos/story/runtime");
15
185
  const db = (await resolveStoryRuntime(params.env)).db;
16
186
  const { SandboxService } = (await import("@ekairos/sandbox"));
@@ -23,10 +193,15 @@ export async function runDatasetSandboxCommandStep(params) {
23
193
  stdout: result.data.output ?? "",
24
194
  stderr: result.data.error ?? "",
25
195
  };
196
+ if (shouldLogDaytonaResources()) {
197
+ const elapsedMs = Date.now() - startedAt;
198
+ console.log(`[daytona:cmd] sandboxId=${params.sandboxId} cmd=${params.cmd} args=${JSON.stringify(params.args ?? [])} elapsedMs=${elapsedMs}`);
199
+ }
26
200
  return normalized;
27
201
  }
28
202
  export async function writeDatasetSandboxFilesStep(params) {
29
203
  "use step";
204
+ const startedAt = Date.now();
30
205
  const { resolveStoryRuntime } = await import("@ekairos/story/runtime");
31
206
  const db = (await resolveStoryRuntime(params.env)).db;
32
207
  const { SandboxService } = (await import("@ekairos/sandbox"));
@@ -34,6 +209,10 @@ export async function writeDatasetSandboxFilesStep(params) {
34
209
  const result = await service.writeFiles(params.sandboxId, params.files);
35
210
  if (!result.ok)
36
211
  throw new Error(result.error);
212
+ if (shouldLogDaytonaResources()) {
213
+ const elapsedMs = Date.now() - startedAt;
214
+ console.log(`[daytona:write] sandboxId=${params.sandboxId} files=${params.files.length} elapsedMs=${elapsedMs}`);
215
+ }
37
216
  }
38
217
  /**
39
218
  * Workflow-safe helper:
@@ -53,6 +232,7 @@ export async function writeDatasetSandboxTextFileStep(params) {
53
232
  }
54
233
  export async function readDatasetSandboxFileStep(params) {
55
234
  "use step";
235
+ const startedAt = Date.now();
56
236
  const { resolveStoryRuntime } = await import("@ekairos/story/runtime");
57
237
  const db = (await resolveStoryRuntime(params.env)).db;
58
238
  const { SandboxService } = (await import("@ekairos/sandbox"));
@@ -60,6 +240,10 @@ export async function readDatasetSandboxFileStep(params) {
60
240
  const result = await service.readFile(params.sandboxId, params.path);
61
241
  if (!result.ok)
62
242
  throw new Error(result.error);
243
+ if (shouldLogDaytonaResources()) {
244
+ const elapsedMs = Date.now() - startedAt;
245
+ console.log(`[daytona:read] sandboxId=${params.sandboxId} path=${params.path} bytes=${result.data.contentBase64?.length ?? 0} elapsedMs=${elapsedMs}`);
246
+ }
63
247
  return result.data;
64
248
  }
65
249
  /**
@@ -77,6 +261,7 @@ export async function readDatasetSandboxTextFileStep(params) {
77
261
  }
78
262
  export async function stopDatasetSandboxStep(params) {
79
263
  "use step";
264
+ const startedAt = Date.now();
80
265
  const { resolveStoryRuntime } = await import("@ekairos/story/runtime");
81
266
  const db = (await resolveStoryRuntime(params.env)).db;
82
267
  const { SandboxService } = (await import("@ekairos/sandbox"));
@@ -84,4 +269,8 @@ export async function stopDatasetSandboxStep(params) {
84
269
  const result = await service.stopSandbox(params.sandboxId);
85
270
  if (!result.ok)
86
271
  throw new Error(result.error);
272
+ if (shouldLogDaytonaResources()) {
273
+ const elapsedMs = Date.now() - startedAt;
274
+ console.log(`[daytona:stop] sandboxId=${params.sandboxId} elapsedMs=${elapsedMs}`);
275
+ }
87
276
  }
@@ -1,4 +1,5 @@
1
1
  import { type StructureRowsOutputPagingCursor } from "./rowsOutputPaging";
2
+ import type { SandboxConfig } from "@ekairos/sandbox";
2
3
  export type StructureSource = {
3
4
  kind: "file";
4
5
  fileId: string;
@@ -69,6 +70,7 @@ export declare function structure<Env extends {
69
70
  orgId: string;
70
71
  }>(env: Env, opts?: {
71
72
  datasetId?: string;
73
+ sandboxConfig?: SandboxConfig;
72
74
  }): {
73
75
  datasetId: string;
74
76
  from(...src: StructureSource[]): /*elided*/ any;
package/dist/structure.js CHANGED
@@ -1,5 +1,5 @@
1
1
  import { createStory, didToolExecute, USER_MESSAGE_TYPE, WEB_CHANNEL } from "@ekairos/story";
2
- import { getDatasetOutputPath, getDatasetOutputSchemaPath, getDatasetWorkstation } from "./datasetFiles";
2
+ import { getDatasetOutputPath, getDatasetOutputSchemaPath, getDatasetWorkstation, getDaytonaVolumeMountPath, getDaytonaVolumeName, } from "./datasetFiles";
3
3
  import { structureDownloadRowsOutputToSandboxStep, structureReadRowsOutputPageFromSandboxStep, } from "./rowsOutputPaging";
4
4
  import { structureSplitRowsOutputToDatasetStep } from "./rowsOutputSplit";
5
5
  import { createDatasetSandboxStep, readDatasetSandboxFileStep, readDatasetSandboxTextFileStep, runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep, writeDatasetSandboxTextFileStep, } from "./sandbox/steps";
@@ -50,6 +50,88 @@ function guessTextFileExtension(mimeType, name) {
50
50
  return ".yaml";
51
51
  return ".txt";
52
52
  }
53
/**
 * Decides whether the up-front `pip install` in the sandbox can be skipped.
 *
 * Skipped when STRUCTURE_DAYTONA_SKIP_PIP_INSTALL or
 * STRUCTURE_DAYTONA_DECLARATIVE_IMAGE is "1", "true" or "yes"
 * (case-insensitive, trimmed), or when STRUCTURE_DAYTONA_SNAPSHOT is non-blank.
 *
 * @returns {boolean} True when the install step should not run.
 */
function shouldSkipPipInstall() {
    const isAffirmative = (name) => {
        const flag = String(process.env[name] ?? "").trim().toLowerCase();
        return flag === "1" || flag === "true" || flag === "yes";
    };
    if (isAffirmative("STRUCTURE_DAYTONA_SKIP_PIP_INSTALL") ||
        isAffirmative("STRUCTURE_DAYTONA_DECLARATIVE_IMAGE")) {
        return true;
    }
    return String(process.env.STRUCTURE_DAYTONA_SNAPSHOT ?? "").trim() !== "";
}
63
/**
 * Builds the baseline sandbox configuration for a dataset.
 *
 * The result targets the "daytona" provider with the "python3.13" runtime, a
 * ten-minute timeout, purpose "structure.dataset", an ephemeral sandbox with a
 * 5-minute auto-stop interval, and a single volume mount when both the volume
 * name and mount path resolve to non-empty strings.
 *
 * @param {string} datasetId Dataset identifier recorded under `params`.
 * @returns {object} The default sandbox configuration.
 */
function getDefaultSandboxConfig(datasetId) {
    const volumeName = getDaytonaVolumeName();
    const mountPath = getDaytonaVolumeMountPath();
    const volumes = [];
    if (volumeName && mountPath) {
        volumes.push({ volumeName, mountPath });
    }
    const tenMinutesMs = 10 * 60 * 1000;
    return {
        provider: "daytona",
        runtime: "python3.13",
        timeoutMs: tenMinutesMs,
        purpose: "structure.dataset",
        params: { datasetId },
        daytona: {
            ephemeral: true,
            autoStopIntervalMin: 5,
            volumes,
        },
    };
}
87
/**
 * Layers a caller-supplied sandbox configuration over a base configuration.
 *
 * Top-level keys from `override` replace those in `base`, except `params` and
 * `daytona`, which are merged one level deep (override keys win). An explicit
 * `volumes` key in `override.daytona` always replaces the base volumes, even
 * when its value is undefined.
 *
 * @param {object} base Base configuration.
 * @param {object} [override] Optional overrides.
 * @returns {object} `base` itself when there is no override, otherwise a new
 *   merged object.
 */
function mergeSandboxConfig(base, override) {
    if (!override) {
        return base;
    }
    const params = { ...(base.params ?? {}), ...(override.params ?? {}) };
    const daytona = { ...(base.daytona ?? {}), ...(override.daytona ?? {}) };
    const overrideDaytona = override.daytona;
    if (overrideDaytona && "volumes" in overrideDaytona) {
        daytona.volumes = overrideDaytona.volumes;
    }
    return { ...base, ...override, params, daytona };
}
108
/**
 * Checks whether a regular file exists inside the sandbox.
 *
 * Runs `test -f <path>` through runDatasetSandboxCommandStep and treats exit
 * code 0 as "exists".
 *
 * @param {object} env Environment forwarded to the sandbox step.
 * @param {string} sandboxId Target sandbox.
 * @param {string} path Path to probe.
 * @returns {Promise<boolean>} True when the command exits with code 0.
 */
async function sandboxFileExists(env, sandboxId, path) {
    const { exitCode } = await runDatasetSandboxCommandStep({
        env,
        sandboxId,
        cmd: "test",
        args: ["-f", path],
    });
    return exitCode === 0;
}
117
/**
 * Returns the first sandbox path matching a glob pattern, if any.
 *
 * The lookup runs a short Python script inside the sandbox. `glob.glob` yields
 * matches in arbitrary order, so they are sorted before the first one is taken;
 * this keeps the chosen path stable when several files match.
 *
 * @param {object} env Environment forwarded to the sandbox step.
 * @param {string} sandboxId Target sandbox.
 * @param {string} pattern Glob pattern, passed to the script as argv[1].
 * @returns {Promise<string | null>} The lexicographically first match, or null
 *   when nothing matches or the command exits with a non-zero code.
 */
async function sandboxFindFirstMatch(env, sandboxId, pattern) {
    const script = [
        "import sys, glob",
        "pattern = sys.argv[1]",
        "matches = sorted(glob.glob(pattern))",
        "print(matches[0] if matches else '')",
    ].join("\n");
    const res = await runDatasetSandboxCommandStep({
        env,
        sandboxId,
        cmd: "python",
        args: ["-c", script, pattern],
    });
    if (res.exitCode !== 0) {
        return null;
    }
    const firstMatch = String(res.stdout ?? "").trim();
    return firstMatch || null;
}
53
135
  async function ensureSandboxPrepared(params) {
54
136
  const { env, datasetId, sandboxId, sources, state } = params;
55
137
  const workstation = getDatasetWorkstation(datasetId);
@@ -60,25 +142,33 @@ async function ensureSandboxPrepared(params) {
60
142
  const mkdirRes = await runDatasetSandboxCommandStep({ env, sandboxId, cmd: "mkdir", args: ["-p", workstation] });
61
143
  // Align with dataset sandbox behavior: install python deps up-front (once per dataset sandbox).
62
144
  // This avoids tool-level "install if used" heuristics and ensures scripts can import pandas.
63
- const pipInstall = await runDatasetSandboxCommandStep({
64
- env,
65
- sandboxId,
66
- cmd: "python",
67
- // NOTE: pandas needs openpyxl to read .xlsx files.
68
- args: ["-m", "pip", "install", "pandas", "openpyxl", "--quiet", "--upgrade"],
69
- });
70
- const installStderr = pipInstall.stderr ?? "";
71
- if (installStderr && (installStderr.includes("ERROR") || installStderr.includes("FAILED"))) {
72
- throw new Error(`pip install failed: ${installStderr.substring(0, 300)}`);
145
+ if (!shouldSkipPipInstall()) {
146
+ const pipInstall = await runDatasetSandboxCommandStep({
147
+ env,
148
+ sandboxId,
149
+ cmd: "python",
150
+ // NOTE: pandas needs openpyxl to read .xlsx files.
151
+ args: ["-m", "pip", "install", "pandas", "openpyxl", "--quiet", "--upgrade"],
152
+ });
153
+ const installStderr = pipInstall.stderr ?? "";
154
+ if (installStderr && (installStderr.includes("ERROR") || installStderr.includes("FAILED"))) {
155
+ throw new Error(`pip install failed: ${installStderr.substring(0, 300)}`);
156
+ }
73
157
  }
74
158
  const prepared = [];
75
159
  for (let i = 0; i < sources.length; i++) {
76
160
  const src = sources[i];
77
161
  if (src.kind === "file") {
162
+ const basePath = `${workstation}/file_${i}_${src.fileId}`;
163
+ const existingPath = await sandboxFindFirstMatch(env, sandboxId, `${basePath}*`);
164
+ if (existingPath) {
165
+ prepared.push({ kind: "file", id: src.fileId, path: existingPath });
166
+ continue;
167
+ }
78
168
  const file = await readInstantFileStep({ env, fileId: src.fileId });
79
169
  const fileName = String(file.contentDisposition ?? "");
80
170
  const ext = fileName.includes(".") ? fileName.substring(fileName.lastIndexOf(".")) : "";
81
- const path = `${workstation}/file_${i}_${src.fileId}${ext}`;
171
+ const path = `${basePath}${ext}`;
82
172
  await writeDatasetSandboxFilesStep({
83
173
  env,
84
174
  sandboxId,
@@ -88,16 +178,19 @@ async function ensureSandboxPrepared(params) {
88
178
  continue;
89
179
  }
90
180
  if (src.kind === "dataset") {
91
- const content = await structureReadRowsOutputJsonlStep({ env, structureId: src.datasetId });
92
- if (!content.ok) {
93
- throw new Error(content.error);
94
- }
95
181
  const path = `${workstation}/dataset_${src.datasetId}.jsonl`;
96
- await writeDatasetSandboxFilesStep({
97
- env,
98
- sandboxId,
99
- files: [{ path, contentBase64: content.data.contentBase64 }],
100
- });
182
+ const exists = await sandboxFileExists(env, sandboxId, path);
183
+ if (!exists) {
184
+ const content = await structureReadRowsOutputJsonlStep({ env, structureId: src.datasetId });
185
+ if (!content.ok) {
186
+ throw new Error(content.error);
187
+ }
188
+ await writeDatasetSandboxFilesStep({
189
+ env,
190
+ sandboxId,
191
+ files: [{ path, contentBase64: content.data.contentBase64 }],
192
+ });
193
+ }
101
194
  prepared.push({ kind: "dataset", id: src.datasetId, path });
102
195
  continue;
103
196
  }
@@ -105,7 +198,10 @@ async function ensureSandboxPrepared(params) {
105
198
  const ext = guessTextFileExtension(src.mimeType, src.name);
106
199
  const textId = `text_${i}`;
107
200
  const path = `${workstation}/${textId}${ext}`;
108
- await writeDatasetSandboxTextFileStep({ env, sandboxId, path, text: String(src.text ?? "") });
201
+ const exists = await sandboxFileExists(env, sandboxId, path);
202
+ if (!exists) {
203
+ await writeDatasetSandboxTextFileStep({ env, sandboxId, path, text: String(src.text ?? "") });
204
+ }
109
205
  prepared.push({ kind: "text", id: textId, path, name: src.name, mimeType: src.mimeType });
110
206
  continue;
111
207
  }
@@ -149,6 +245,8 @@ async function readSchemaFromSandboxIfPresent(params) {
149
245
  function createStructureStoryDefinition(config) {
150
246
  const datasetId = config.datasetId;
151
247
  const model = config.model ?? "openai/gpt-5.2";
248
+ const defaultSandboxConfig = getDefaultSandboxConfig(datasetId);
249
+ const resolvedSandboxConfig = mergeSandboxConfig(defaultSandboxConfig, config.sandboxConfig);
152
250
  const story = createStory("ekairos.structure")
153
251
  .context(async (stored, env) => {
154
252
  const prev = stored?.content ?? {};
@@ -156,7 +254,10 @@ function createStructureStoryDefinition(config) {
156
254
  const existingSandboxId = prev.sandboxId ?? config.sandboxId ?? "";
157
255
  let sandboxId = existingSandboxId;
158
256
  if (!sandboxId) {
159
- const created = await createDatasetSandboxStep({ env, runtime: "python3.13", timeoutMs: 10 * 60 * 1000 });
257
+ const created = await createDatasetSandboxStep({
258
+ env,
259
+ ...resolvedSandboxConfig,
260
+ });
160
261
  sandboxId = created.sandboxId;
161
262
  }
162
263
  const { preparedSources, workstation, outputPath } = await ensureSandboxPrepared({
@@ -176,6 +277,13 @@ function createStructureStoryDefinition(config) {
176
277
  sources: preparedSources,
177
278
  workstation,
178
279
  outputPath,
280
+ sandboxProvider: resolvedSandboxConfig.provider ?? "daytona",
281
+ sandboxRuntime: resolvedSandboxConfig.runtime ?? "python3.13",
282
+ sandboxEphemeral: resolvedSandboxConfig.daytona?.ephemeral ?? true,
283
+ sandboxVolumeName: resolvedSandboxConfig.daytona?.volumes?.[0]?.volumeName,
284
+ sandboxVolumeMountPath: resolvedSandboxConfig.daytona?.volumes?.[0]?.mountPath,
285
+ sandboxSnapshot: resolvedSandboxConfig.daytona?.snapshot,
286
+ sandboxImage: resolvedSandboxConfig.daytona?.image,
179
287
  };
180
288
  const contextKey = `structure:${datasetId}`;
181
289
  // IMPORTANT:
@@ -274,6 +382,7 @@ export function structure(env, opts) {
274
382
  let mode = "auto";
275
383
  let output = "rows";
276
384
  let outputSchema;
385
+ const sandboxConfig = opts?.sandboxConfig;
277
386
  const api = {
278
387
  datasetId,
279
388
  from(...src) {
@@ -314,6 +423,7 @@ export function structure(env, opts) {
314
423
  mode,
315
424
  output,
316
425
  outputSchema,
426
+ sandboxConfig,
317
427
  };
318
428
  const { story } = createStructureStoryDefinition(storyConfig);
319
429
  function makeUserMessageEvent(text) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ekairos/structure",
3
- "version": "1.21.74-beta.0",
3
+ "version": "1.21.77-beta.0",
4
4
  "description": "Ekairos Structure - Unified structured extraction (rows or object) from file/text/dataset inputs",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -36,7 +36,7 @@
36
36
  "typecheck": "tsc --noEmit"
37
37
  },
38
38
  "dependencies": {
39
- "@ekairos/domain": "^1.21.74-beta.0",
39
+ "@ekairos/domain": "^1.21.77-beta.0",
40
40
  "@ekairos/sandbox": "^1.21.60-beta.0",
41
41
  "@instantdb/admin": "^0.22.13",
42
42
  "@instantdb/core": "^0.22.13",
@@ -1,16 +0,0 @@
1
- import { type StructureRowsOutputPagingCursor, type StructureRowsOutputSandboxRef } from "./rowsOutputPaging";
2
- export declare function datasetReader(env: any, opts: {
3
- datasetId: string;
4
- sandboxId?: string;
5
- runtime?: string;
6
- timeoutMs?: number;
7
- }): {
8
- datasetId: string;
9
- download(): Promise<StructureRowsOutputSandboxRef>;
10
- readPage(params: {
11
- sandboxId: string;
12
- localPath: string;
13
- cursor?: Partial<StructureRowsOutputPagingCursor>;
14
- limit: number;
15
- }): Promise<import("./rowsOutputPaging").StructureReadRowsOutputPageFromSandboxResult>;
16
- };
@@ -1,25 +0,0 @@
1
- import { structureDownloadRowsOutputToSandboxStep, structureReadRowsOutputPageFromSandboxStep, } from "./rowsOutputPaging";
2
- export function datasetReader(env, opts) {
3
- const datasetId = opts.datasetId;
4
- return {
5
- datasetId,
6
- async download() {
7
- return await structureDownloadRowsOutputToSandboxStep({
8
- env,
9
- structureId: datasetId,
10
- sandboxId: opts.sandboxId,
11
- runtime: opts.runtime,
12
- timeoutMs: opts.timeoutMs,
13
- });
14
- },
15
- async readPage(params) {
16
- return await structureReadRowsOutputPageFromSandboxStep({
17
- env,
18
- sandboxId: params.sandboxId,
19
- localPath: params.localPath,
20
- cursor: params.cursor,
21
- limit: params.limit,
22
- });
23
- },
24
- };
25
- }
@@ -1,37 +0,0 @@
1
- export type RowsOutputSandboxDownloadResult = {
2
- sandboxId: string;
3
- localPath: string;
4
- };
5
- export type RowsOutputChunkResult<T = any> = {
6
- rows: T[];
7
- nextByteOffset: number;
8
- nextRowOffset: number;
9
- done: boolean;
10
- };
11
- /**
12
- * Step 1/2:
13
- * Download the rows output.jsonl from Instant storage into a sandbox file.
14
- *
15
- * This isolates network flakiness (e.g. undici `TypeError: terminated`) into a single step
16
- * and makes subsequent reads purely sandbox-local.
17
- */
18
- export declare function structureDownloadRowsOutputToSandboxStep(params: {
19
- env: any;
20
- structureId: string;
21
- runtime?: string;
22
- timeoutMs?: number;
23
- }): Promise<RowsOutputSandboxDownloadResult>;
24
- /**
25
- * Step 2/2:
26
- * Read the next chunk of ROW records from the sandbox-local output.jsonl, bounded by `limit`.
27
- *
28
- * Pagination state is passed explicitly via `{ byteOffset, rowOffset }` and returned as next offsets.
29
- */
30
- export declare function structureReadRowsOutputChunkStep<T = any>(params: {
31
- env: any;
32
- sandboxId: string;
33
- localPath: string;
34
- byteOffset: number;
35
- rowOffset: number;
36
- limit: number;
37
- }): Promise<RowsOutputChunkResult<T>>;
@@ -1,125 +0,0 @@
1
- import { getDatasetOutputPath, getDatasetWorkstation } from "./datasetFiles";
2
- import { createDatasetSandboxStep, runDatasetSandboxCommandStep } from "./sandbox/steps";
3
- import { getStoryRuntime } from "./runtime";
4
- /**
5
- * Step 1/2:
6
- * Download the rows output.jsonl from Instant storage into a sandbox file.
7
- *
8
- * This isolates network flakiness (e.g. undici `TypeError: terminated`) into a single step
9
- * and makes subsequent reads purely sandbox-local.
10
- */
11
- export async function structureDownloadRowsOutputToSandboxStep(params) {
12
- "use step";
13
- const runtime = params.runtime ?? "python3.13";
14
- const timeoutMs = params.timeoutMs ?? 10 * 60 * 1000;
15
- const { sandboxId } = await createDatasetSandboxStep({
16
- env: params.env,
17
- runtime,
18
- timeoutMs,
19
- purpose: "structure.rows-output.reader",
20
- params: { structureId: params.structureId },
21
- });
22
- const workstation = getDatasetWorkstation(params.structureId);
23
- const localPath = getDatasetOutputPath(params.structureId);
24
- await runDatasetSandboxCommandStep({
25
- env: params.env,
26
- sandboxId,
27
- cmd: "mkdir",
28
- args: ["-p", workstation],
29
- });
30
- const storyRuntime = await getStoryRuntime(params.env);
31
- const db = storyRuntime.db;
32
- const contextKey = `structure:${params.structureId}`;
33
- const query = (await db.query({
34
- context_contexts: {
35
- $: { where: { key: contextKey }, limit: 1 },
36
- structure_output_file: {},
37
- },
38
- }));
39
- const ctx = query.context_contexts?.[0];
40
- const linked = Array.isArray(ctx?.structure_output_file) ? ctx.structure_output_file[0] : ctx.structure_output_file;
41
- const url = linked?.url;
42
- if (!url) {
43
- throw new Error("Rows output file not found");
44
- }
45
- const py = [
46
- "import sys, urllib.request",
47
- "url = sys.argv[1]",
48
- "out_path = sys.argv[2]",
49
- "with urllib.request.urlopen(url) as r:",
50
- " data = r.read()",
51
- "with open(out_path, 'wb') as f:",
52
- " f.write(data)",
53
- "print('ok', len(data))",
54
- ].join("\n");
55
- const res = await runDatasetSandboxCommandStep({
56
- env: params.env,
57
- sandboxId,
58
- cmd: "python",
59
- args: ["-c", py, String(url), localPath],
60
- });
61
- if (res.exitCode !== 0) {
62
- throw new Error(res.stderr || "Failed to download rows output to sandbox");
63
- }
64
- return { sandboxId, localPath };
65
- }
66
- /**
67
- * Step 2/2:
68
- * Read the next chunk of ROW records from the sandbox-local output.jsonl, bounded by `limit`.
69
- *
70
- * Pagination state is passed explicitly via `{ byteOffset, rowOffset }` and returned as next offsets.
71
- */
72
- export async function structureReadRowsOutputChunkStep(params) {
73
- "use step";
74
- const py = [
75
- "import sys, json",
76
- "path = sys.argv[1]",
77
- "byte_offset = int(sys.argv[2])",
78
- "row_offset = int(sys.argv[3])",
79
- "limit = int(sys.argv[4])",
80
- "rows = []",
81
- "next_byte = byte_offset",
82
- "next_row = row_offset",
83
- "with open(path, 'rb') as f:",
84
- " f.seek(byte_offset)",
85
- " while len(rows) < limit:",
86
- " line = f.readline()",
87
- " if not line:",
88
- " break",
89
- " next_byte = f.tell()",
90
- " try:",
91
- " obj = json.loads(line.decode('utf-8'))",
92
- " except Exception:",
93
- " continue",
94
- " if obj.get('type') != 'row':",
95
- " continue",
96
- " rows.append(obj.get('data'))",
97
- " next_row += 1",
98
- "done = len(rows) < limit",
99
- "print(json.dumps({",
100
- " 'rows': rows,",
101
- " 'nextByteOffset': next_byte,",
102
- " 'nextRowOffset': next_row,",
103
- " 'done': done,",
104
- "}))",
105
- ].join("\n");
106
- const res = await runDatasetSandboxCommandStep({
107
- env: params.env,
108
- sandboxId: params.sandboxId,
109
- cmd: "python",
110
- args: [
111
- "-c",
112
- py,
113
- params.localPath,
114
- String(params.byteOffset ?? 0),
115
- String(params.rowOffset ?? 0),
116
- String(params.limit),
117
- ],
118
- });
119
- if (res.exitCode !== 0) {
120
- throw new Error(res.stderr || "Failed to read rows chunk from sandbox");
121
- }
122
- const out = String(res.stdout ?? "").trim();
123
- const parsed = JSON.parse(out);
124
- return parsed;
125
- }
@@ -1,59 +0,0 @@
1
- import { type DatasetSandboxId } from "./sandbox/steps";
2
- /**
3
- * Step 1: Download Structure rows output file (output.jsonl) into a sandbox.
4
- *
5
- * This enables pagination by reading chunks from the local sandbox filesystem.
6
- */
7
- export declare function downloadStructureRowsOutputToSandboxStep(params: {
8
- env: any;
9
- sandboxId: DatasetSandboxId;
10
- structureId: string;
11
- }): Promise<{
12
- filePath: string;
13
- }>;
14
- /**
15
- * @deprecated Prefer `downloadStructureRowsOutputToSandboxStep` (kept for symmetry with chunk naming).
16
- *
17
- * Note: The name includes `RowsOutputJsonl` to be explicit about the file format.
18
- */
19
- export declare function structureDownloadRowsOutputJsonlToSandboxStep(params: {
20
- env: any;
21
- sandboxId: DatasetSandboxId;
22
- structureId: string;
23
- }): Promise<{
24
- filePath: string;
25
- }>;
26
- /**
27
- * Step 2: Read a chunk/page of JSONL records from the downloaded sandbox file.
28
- *
29
- * Naming (consistent):
30
- * - `structureReadRowsOutputJsonlStep` reads the whole file (base64)
31
- * - `structureReadRowsOutputJsonlChunkStep` reads a paginated chunk from sandbox
32
- *
33
- * Offset/limit are line-based (0-indexed).
34
- */
35
- export declare function structureReadRowsOutputJsonlChunkStep(params: {
36
- env: any;
37
- sandboxId: DatasetSandboxId;
38
- structureId: string;
39
- offset: number;
40
- limit: number;
41
- }): Promise<{
42
- records: any[];
43
- nextOffset: number;
44
- done: boolean;
45
- }>;
46
- /**
47
- * @deprecated Use `structureReadRowsOutputJsonlChunkStep` instead (naming consistency).
48
- */
49
- export declare function readStructureRowsChunkFromSandboxStep(params: {
50
- env: any;
51
- sandboxId: DatasetSandboxId;
52
- structureId: string;
53
- offset: number;
54
- limit: number;
55
- }): Promise<{
56
- records: any[];
57
- nextOffset: number;
58
- done: boolean;
59
- }>;
@@ -1,190 +0,0 @@
1
- import { getDatasetWorkstation } from "./datasetFiles";
2
- import { runDatasetSandboxCommandStep, writeDatasetSandboxTextFileStep, } from "./sandbox/steps";
3
- async function getRowsOutputUrl(params) {
4
- const { resolveStoryRuntime } = await import("@ekairos/story/runtime");
5
- const runtime = (await resolveStoryRuntime(params.env));
6
- const db = runtime.db;
7
- const contextKey = `structure:${params.structureId}`;
8
- const query = (await db.query({
9
- context_contexts: {
10
- $: { where: { key: contextKey }, limit: 1 },
11
- structure_output_file: {},
12
- },
13
- }));
14
- const ctx = query.context_contexts?.[0];
15
- const linked = Array.isArray(ctx?.structure_output_file) ? ctx.structure_output_file[0] : ctx?.structure_output_file;
16
- const url = linked?.url;
17
- if (!url) {
18
- throw new Error("Rows output file not found");
19
- }
20
- return String(url);
21
- }
22
- /**
23
- * Step 1: Download Structure rows output file (output.jsonl) into a sandbox.
24
- *
25
- * This enables pagination by reading chunks from the local sandbox filesystem.
26
- */
27
- export async function downloadStructureRowsOutputToSandboxStep(params) {
28
- "use step";
29
- const workstation = getDatasetWorkstation(params.structureId);
30
- const filePath = `${workstation}/rows_output.jsonl`;
31
- const scriptPath = `${workstation}/download_rows_output.py`;
32
- // Ensure directory exists
33
- await runDatasetSandboxCommandStep({
34
- env: params.env,
35
- sandboxId: params.sandboxId,
36
- cmd: "mkdir",
37
- args: ["-p", workstation],
38
- });
39
- const url = await getRowsOutputUrl({ env: params.env, structureId: params.structureId });
40
- // Write a deterministic script to the sandbox and run it (no external deps).
41
- await writeDatasetSandboxTextFileStep({
42
- env: params.env,
43
- sandboxId: params.sandboxId,
44
- path: scriptPath,
45
- text: [
46
- "import argparse",
47
- "import urllib.request",
48
- "",
49
- "def main():",
50
- " p = argparse.ArgumentParser()",
51
- " p.add_argument('--url', required=True)",
52
- " p.add_argument('--out', required=True)",
53
- " args = p.parse_args()",
54
- "",
55
- " # Download to local sandbox file",
56
- " with urllib.request.urlopen(args.url, timeout=60) as r:",
57
- " data = r.read()",
58
- " with open(args.out, 'wb') as f:",
59
- " f.write(data)",
60
- "",
61
- " print('ok')",
62
- " print('bytes', len(data))",
63
- "",
64
- "if __name__ == '__main__':",
65
- " main()",
66
- "",
67
- ].join("\n"),
68
- });
69
- const res = await runDatasetSandboxCommandStep({
70
- env: params.env,
71
- sandboxId: params.sandboxId,
72
- cmd: "python",
73
- args: [scriptPath, "--url", url, "--out", filePath],
74
- });
75
- if (res.exitCode !== 0) {
76
- throw new Error(res.stderr || "Failed to download rows output into sandbox");
77
- }
78
- return { filePath };
79
- }
80
- /**
81
- * @deprecated Prefer `downloadStructureRowsOutputToSandboxStep` (kept for symmetry with chunk naming).
82
- *
83
- * Note: The name includes `RowsOutputJsonl` to be explicit about the file format.
84
- */
85
- export async function structureDownloadRowsOutputJsonlToSandboxStep(params) {
86
- "use step";
87
- return await downloadStructureRowsOutputToSandboxStep(params);
88
- }
89
- /**
90
- * Step 2: Read a chunk/page of JSONL records from the downloaded sandbox file.
91
- *
92
- * Naming (consistent):
93
- * - `structureReadRowsOutputJsonlStep` reads the whole file (base64)
94
- * - `structureReadRowsOutputJsonlChunkStep` reads a paginated chunk from sandbox
95
- *
96
- * Offset/limit are line-based (0-indexed).
97
- */
98
- export async function structureReadRowsOutputJsonlChunkStep(params) {
99
- "use step";
100
- const workstation = getDatasetWorkstation(params.structureId);
101
- const filePath = `${workstation}/rows_output.jsonl`;
102
- const scriptPath = `${workstation}/read_rows_chunk.py`;
103
- await writeDatasetSandboxTextFileStep({
104
- env: params.env,
105
- sandboxId: params.sandboxId,
106
- path: scriptPath,
107
- text: [
108
- "import argparse",
109
- "import json",
110
- "",
111
- "def main():",
112
- " p = argparse.ArgumentParser()",
113
- " p.add_argument('--path', required=True)",
114
- " p.add_argument('--offset', type=int, required=True)",
115
- " p.add_argument('--limit', type=int, required=True)",
116
- " args = p.parse_args()",
117
- "",
118
- " records = []",
119
- " current = 0",
120
- " taken = 0",
121
- " done = True",
122
- "",
123
- " with open(args.path, 'r', encoding='utf-8', errors='replace') as f:",
124
- " for line in f:",
125
- " if current < args.offset:",
126
- " current += 1",
127
- " continue",
128
- " if taken >= args.limit:",
129
- " done = False",
130
- " break",
131
- " line = line.strip()",
132
- " if not line:",
133
- " current += 1",
134
- " continue",
135
- " try:",
136
- " records.append(json.loads(line))",
137
- " except Exception:",
138
- " # Skip invalid JSON line",
139
- " pass",
140
- " taken += 1",
141
- " current += 1",
142
- "",
143
- " out = {",
144
- " 'records': records,",
145
- " 'nextOffset': args.offset + taken,",
146
- " 'done': done,",
147
- " }",
148
- " print(json.dumps(out, ensure_ascii=False))",
149
- "",
150
- "if __name__ == '__main__':",
151
- " main()",
152
- "",
153
- ].join("\n"),
154
- });
155
- const res = await runDatasetSandboxCommandStep({
156
- env: params.env,
157
- sandboxId: params.sandboxId,
158
- cmd: "python",
159
- args: [
160
- scriptPath,
161
- "--path",
162
- filePath,
163
- "--offset",
164
- String(Math.max(0, params.offset ?? 0)),
165
- "--limit",
166
- String(Math.max(1, params.limit ?? 1)),
167
- ],
168
- });
169
- if (res.exitCode !== 0) {
170
- throw new Error(res.stderr || "Failed to read rows chunk from sandbox");
171
- }
172
- const text = (res.stdout ?? "").trim();
173
- if (!text) {
174
- return { records: [], nextOffset: params.offset, done: true };
175
- }
176
- // The script prints a single JSON object.
177
- const parsed = JSON.parse(text);
178
- return {
179
- records: Array.isArray(parsed?.records) ? parsed.records : [],
180
- nextOffset: Number(parsed?.nextOffset ?? params.offset),
181
- done: Boolean(parsed?.done),
182
- };
183
- }
184
- /**
185
- * @deprecated Use `structureReadRowsOutputJsonlChunkStep` instead (naming consistency).
186
- */
187
- export async function readStructureRowsChunkFromSandboxStep(params) {
188
- "use step";
189
- return await structureReadRowsOutputJsonlChunkStep(params);
190
- }