@ekairos/structure 1.21.62-beta.0 → 1.21.67-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,25 @@
+ import type { StructureRowsOutputPagingCursor } from "./rowsOutputPaging";
+ export type StructureSplitRowsOutputToDatasetResult = {
+     datasetId?: string;
+     rowsWritten: number;
+     nextCursor: StructureRowsOutputPagingCursor;
+     done: boolean;
+ };
+ /**
+  * Step:
+  * Split a sandbox-local `output.jsonl` into a child dataset (also `output.jsonl`) of up to `limit` ROW entries.
+  *
+  * Key property:
+  * - Does NOT return rows; it persists a child dataset and returns only `{ datasetId, nextCursor, done }`.
+  *
+  * This is useful for workflows where you want to batch work (e.g. 300 rows) without moving large payloads
+  * through workflow/step params.
+  */
+ export declare function structureSplitRowsOutputToDatasetStep(params: {
+     env: any;
+     sandboxId: string;
+     localPath: string;
+     cursor?: Partial<StructureRowsOutputPagingCursor>;
+     limit: number;
+     childDatasetId: string;
+ }): Promise<StructureSplitRowsOutputToDatasetResult>;
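
A minimal usage sketch of the declaration above (not part of the package): it assumes a workflow/step context where `env`, a provisioned `sandboxId`, the sandbox-local `localPath` of the parent `output.jsonl`, and a caller-chosen `childDatasetId` already exist.

// Hypothetical caller; env, sandboxId, localPath and childDatasetId are assumed to be in scope.
const res = await structureSplitRowsOutputToDatasetStep({
    env,
    sandboxId,
    localPath,                              // parent output.jsonl inside the sandbox
    cursor: { byteOffset: 0, rowOffset: 0 },
    limit: 300,                             // at most 300 ROW entries in the child dataset
    childDatasetId,
});
// res is { datasetId?, rowsWritten, nextCursor, done }; no row payloads are returned.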
@@ -0,0 +1,131 @@
+ import { getDatasetOutputPath, getDatasetWorkstation } from "./datasetFiles";
+ import { readDatasetSandboxFileStep, runDatasetSandboxCommandStep } from "./sandbox/steps";
+ import { getStoryRuntime } from "./runtime";
+ /**
+  * Step:
+  * Split a sandbox-local `output.jsonl` into a child dataset (also `output.jsonl`) of up to `limit` ROW entries.
+  *
+  * Key property:
+  * - Does NOT return rows; it persists a child dataset and returns only `{ datasetId, nextCursor, done }`.
+  *
+  * This is useful for workflows where you want to batch work (e.g. 300 rows) without moving large payloads
+  * through workflow/step params.
+  */
+ export async function structureSplitRowsOutputToDatasetStep(params) {
+     "use step";
+     const byteOffset = params.cursor?.byteOffset ?? 0;
+     const rowOffset = params.cursor?.rowOffset ?? 0;
+     const workstation = getDatasetWorkstation(params.childDatasetId);
+     const outPath = getDatasetOutputPath(params.childDatasetId);
+     await runDatasetSandboxCommandStep({
+         env: params.env,
+         sandboxId: params.sandboxId,
+         cmd: "mkdir",
+         args: ["-p", workstation],
+     });
+     // Read from parent jsonl and write a child jsonl containing only ROW records, preserving `{ type, data }` lines.
+     const py = [
+         "import sys, json",
+         "in_path = sys.argv[1]",
+         "out_path = sys.argv[2]",
+         "byte_offset = int(sys.argv[3])",
+         "row_offset = int(sys.argv[4])",
+         "limit = int(sys.argv[5])",
+         "rows_written = 0",
+         "next_byte = byte_offset",
+         "next_row = row_offset",
+         "with open(in_path, 'rb') as f_in:",
+         "    f_in.seek(byte_offset)",
+         "    with open(out_path, 'wb') as f_out:",
+         "        while rows_written < limit:",
+         "            line = f_in.readline()",
+         "            if not line:",
+         "                break",
+         "            next_byte = f_in.tell()",
+         "            try:",
+         "                obj = json.loads(line.decode('utf-8'))",
+         "            except Exception:",
+         "                continue",
+         "            if obj.get('type') != 'row':",
+         "                continue",
+         "            f_out.write(line if line.endswith(b'\\n') else (line + b'\\n'))",
+         "            rows_written += 1",
+         "            next_row += 1",
+         "done = rows_written < limit",
+         "print(json.dumps({",
+         "    'rowsWritten': rows_written,",
+         "    'nextByteOffset': next_byte,",
+         "    'nextRowOffset': next_row,",
+         "    'done': done,",
+         "}))",
+     ].join("\n");
+     const res = await runDatasetSandboxCommandStep({
+         env: params.env,
+         sandboxId: params.sandboxId,
+         cmd: "python",
+         args: ["-c", py, params.localPath, outPath, String(byteOffset), String(rowOffset), String(params.limit)],
+     });
+     if (res.exitCode !== 0) {
+         throw new Error(res.stderr || "Failed to split rows output to dataset");
+     }
+     const parsed = JSON.parse(String(res.stdout ?? "").trim());
+     const rowsWritten = Number(parsed?.rowsWritten ?? 0);
+     const nextCursor = {
+         byteOffset: Number(parsed?.nextByteOffset ?? byteOffset),
+         rowOffset: Number(parsed?.nextRowOffset ?? rowOffset),
+     };
+     const done = Boolean(parsed?.done);
+     // No work to persist: return only paging state.
+     if (rowsWritten <= 0) {
+         return { datasetId: undefined, rowsWritten: 0, nextCursor, done: true };
+     }
+     const fileRes = await readDatasetSandboxFileStep({
+         env: params.env,
+         sandboxId: params.sandboxId,
+         path: outPath,
+     });
+     const storyRuntime = await getStoryRuntime(params.env);
+     const db = storyRuntime.db;
+     const store = storyRuntime.store;
+     const storagePath = `/structure/${params.childDatasetId}/output.jsonl`;
+     const fileBuffer = Buffer.from(fileRes.contentBase64 ?? "", "base64");
+     const uploadResult = await db.storage.uploadFile(storagePath, fileBuffer, {
+         contentType: "application/x-ndjson",
+         contentDisposition: "output.jsonl",
+     });
+     const fileId = uploadResult?.data?.id;
+     if (!fileId)
+         throw new Error("Failed to upload child dataset output file to storage");
+     const contextKey = `structure:${params.childDatasetId}`;
+     const ctx = await store.getOrCreateContext({ key: contextKey });
+     const ctxId = ctx?.id;
+     if (!ctxId)
+         throw new Error("Failed to create child dataset context");
+     // Link the output file to the context (used by DatasetService.readRecordsFromFile).
+     await db.transact([db.tx.context_contexts[ctxId].link({ structure_output_file: fileId })]);
+     // Patch metadata under `structure` namespace (never clobber Story runtime keys).
+     const existingContent = (ctx?.content ?? {});
+     const existingStructure = (existingContent?.structure ?? {});
+     const updatedAt = Date.now();
+     await store.updateContextContent({ key: contextKey }, {
+         ...existingContent,
+         structure: {
+             ...existingStructure,
+             kind: "ekairos.structure",
+             version: Number(existingStructure?.version ?? 1),
+             structureId: params.childDatasetId,
+             output: "rows",
+             updatedAt,
+             outputs: {
+                 ...(existingStructure?.outputs ?? {}),
+                 rows: {
+                     format: "jsonl",
+                     fileId,
+                     storagePath,
+                     rowCount: rowsWritten,
+                 },
+             },
+         },
+     });
+     return { datasetId: params.childDatasetId, rowsWritten, nextCursor, done };
+ }
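
For orientation, a sketch (illustrative only, inferred from the filter and the metadata patch above) of the record shape the step consumes and the metadata it writes onto the child context; the example values and the `ChildStructureMetadata` name are assumptions.

// Hypothetical parent output.jsonl content (record shape inferred from obj.get('type') != 'row'):
//   {"type":"row","data":{"name":"Ada"}}     <- copied into the child dataset
//   {"type":"other","data":{}}               <- any non-"row" type is skipped
//   {"type":"row","data":{"name":"Grace"}}   <- copied into the child dataset
// Sketch of the `structure` namespace the step patches onto the child context:
type ChildStructureMetadata = {
    kind: "ekairos.structure";
    version: number;
    structureId: string;          // childDatasetId
    output: "rows";
    updatedAt: number;
    outputs: {
        rows: {
            format: "jsonl";
            fileId: string;       // id of the uploaded output.jsonl
            storagePath: string;  // `/structure/${childDatasetId}/output.jsonl`
            rowCount: number;     // rowsWritten
        };
    };
};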
@@ -18,6 +18,16 @@ export type StructureRowsReadResult = {
      cursor: StructureRowsOutputPagingCursor;
      done: boolean;
  };
+ export type StructureRowsSplitResult = {
+     /**
+      * Child datasetId containing a JSONL `output.jsonl` with up to `limit` ROW entries.
+      * Omitted when there are no more rows to split.
+      */
+     datasetId?: string;
+     rowsWritten: number;
+     cursor: StructureRowsOutputPagingCursor;
+     done: boolean;
+ };
  export type StructureRowsReader = {
      /**
       * Workflow-friendly rows reader.
@@ -33,6 +43,18 @@ export type StructureRowsReader = {
          cursor?: Partial<StructureRowsOutputPagingCursor>;
          limit?: number;
      }): Promise<StructureRowsReadResult>;
+     /**
+      * Split the rows output into a child dataset (jsonl) and return paging state.
+      *
+      * Unlike `read()`, this does not return `rows[]` (avoids moving payloads through params/results).
+      */
+     split(): Promise<StructureRowsSplitResult>;
+     split(cursor?: Partial<StructureRowsOutputPagingCursor>, limit?: number): Promise<StructureRowsSplitResult>;
+     split(params?: {
+         cursor?: Partial<StructureRowsOutputPagingCursor>;
+         limit?: number;
+         datasetId?: string;
+     }): Promise<StructureRowsSplitResult>;
  };
  export type StructureBuildResult = {
      datasetId: string;
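
A brief sketch of the three overloads declared above, assuming a StructureRowsReader instance named `reader`; the cursor values and the child dataset id are illustrative assumptions.

// 1. Defaults: start at the beginning with the default limit.
const a = await reader.split();
// 2. Positional cursor plus limit.
const b = await reader.split({ byteOffset: a.cursor.byteOffset, rowOffset: a.cursor.rowOffset }, 300);
// 3. Params object, optionally pinning the child datasetId (hypothetical id shown).
const c = await reader.split({ cursor: b.cursor, limit: 300, datasetId: "my-child-dataset-id" });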
package/dist/structure.js CHANGED
@@ -1,9 +1,11 @@
  import { createStory, didToolExecute, USER_MESSAGE_TYPE, WEB_CHANNEL } from "@ekairos/story";
  import { getDatasetOutputPath, getDatasetOutputSchemaPath, getDatasetWorkstation } from "./datasetFiles";
  import { structureDownloadRowsOutputToSandboxStep, structureReadRowsOutputPageFromSandboxStep, } from "./rowsOutputPaging";
+ import { structureSplitRowsOutputToDatasetStep } from "./rowsOutputSplit";
  import { createDatasetSandboxStep, readDatasetSandboxFileStep, readDatasetSandboxTextFileStep, runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep, writeDatasetSandboxTextFileStep, } from "./sandbox/steps";
  import { readInstantFileStep } from "./file/steps";
  import { structureGetContextStep, structureGetContextWithRowsOutputFileStep, structureReadRowsOutputJsonlStep, } from "./dataset/steps";
+ import { getWorkflowMetadata } from "workflow";
  import { buildStructurePrompt } from "./prompts";
  import { createExecuteCommandTool } from "./executeCommand.tool";
  import { createGenerateSchemaTool } from "./generateSchema.tool";
@@ -20,6 +22,21 @@ function createUuidV4() {
          return v.toString(16);
      });
  }
+ function assertRunningInsideWorkflow(params) {
+     try {
+         const meta = getWorkflowMetadata();
+         const runId = meta?.workflowRunId;
+         if (!runId) {
+             throw new Error("Missing workflowRunId");
+         }
+         return meta;
+     }
+     catch (e) {
+         const msg = e instanceof Error ? e.message : String(e);
+         throw new Error(`@ekairos/structure: structure().build() must be called from a "use workflow" function. ` +
+             `datasetId=${params.datasetId}. getWorkflowMetadata failed: ${msg}`);
+     }
+ }
  function guessTextFileExtension(mimeType, name) {
      const n = String(name ?? "").toLowerCase();
      if (n.includes("."))
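
To illustrate what the new guardrail expects (a sketch only: the workflow name and the build/opts shapes are assumptions, not the package's documented API), build() now has to execute inside a "use workflow" function so that getWorkflowMetadata() can resolve a workflowRunId.

// Hypothetical caller. The only requirement the guardrail adds is that this body runs
// in the workflow runtime, i.e. getWorkflowMetadata() yields a workflowRunId.
export async function buildStructureWorkflow(env, opts) {
    "use workflow";
    const { datasetId, reader } = await structure(env, opts).build();
    return datasetId;
}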
@@ -324,6 +341,9 @@ export function structure(env, opts) {
      console.log("[ekairos/structure] structure.ts build mode", mode);
      console.log("[ekairos/structure] structure.ts build output", output);
      console.log("[ekairos/structure] structure.ts build sourcesCount", sources.length);
+     // Guardrail: structure build MUST run inside workflow runtime ("use workflow").
+     const workflowMeta = assertRunningInsideWorkflow({ datasetId });
+     void workflowMeta?.workflowRunId;
      const contextKey = `structure:${datasetId}`;
      const storyConfig = {
          datasetId,
@@ -464,6 +484,39 @@ export function structure(env, opts) {
              done: page.done,
          };
      },
+     split: async (cursorOrParams, limit) => {
+         if (output !== "rows") {
+             throw new Error("reader.split() is only supported for output=rows");
+         }
+         if (!rowsSandboxRef) {
+             rowsSandboxRef = await structureDownloadRowsOutputToSandboxStep({
+                 env,
+                 structureId: datasetId,
+             });
+         }
+         const params = cursorOrParams && typeof cursorOrParams === "object" && ("cursor" in cursorOrParams || "limit" in cursorOrParams)
+             ? cursorOrParams
+             : {
+                 cursor: cursorOrParams,
+                 limit,
+                 datasetId: undefined,
+             };
+         const childDatasetId = params?.datasetId ?? createUuidV4();
+         const res = await structureSplitRowsOutputToDatasetStep({
+             env,
+             sandboxId: rowsSandboxRef.sandboxId,
+             localPath: rowsSandboxRef.localPath,
+             cursor: params?.cursor,
+             limit: params?.limit ?? 300,
+             childDatasetId,
+         });
+         return {
+             datasetId: res.datasetId,
+             rowsWritten: res.rowsWritten,
+             cursor: res.nextCursor,
+             done: res.done,
+         };
+     },
  };
  console.log("[ekairos/structure] structure.ts build ok");
  return output === "object" ? { datasetId, reader, dataset: ctx } : { datasetId, reader };
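
A batching loop over reader.split() might look like the following sketch, assuming an output=rows build and a hypothetical processChildDataset callback; the 300-row limit mirrors the implementation's default.

// Drive split() until the parent output.jsonl is exhausted, collecting child dataset ids.
const childDatasetIds = [];
let cursor;
let done = false;
while (!done) {
    const batch = await reader.split({ cursor, limit: 300 });
    if (batch.datasetId) {
        childDatasetIds.push(batch.datasetId);
        await processChildDataset(batch.datasetId); // hypothetical per-batch work
    }
    cursor = batch.cursor;
    done = batch.done;
}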
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@ekairos/structure",
-   "version": "1.21.62-beta.0",
+   "version": "1.21.67-beta.0",
    "description": "Ekairos Structure - Unified structured extraction (rows or object) from file/text/dataset inputs",
    "type": "module",
    "main": "dist/index.js",
@@ -36,12 +36,13 @@
      "typecheck": "tsc --noEmit"
    },
    "dependencies": {
-     "@ekairos/domain": "^1.21.62-beta.0",
+     "@ekairos/domain": "^1.21.67-beta.0",
      "@ekairos/sandbox": "^1.21.60-beta.0",
      "@instantdb/admin": "^0.22.13",
      "@instantdb/core": "^0.22.13",
      "ai": "^5.0.95",
      "ajv": "^8.17.1",
+     "workflow": "4.0.1-beta.41",
      "zod": "^4.1.8"
    },
    "peerDependencies": {