@ekairos/structure 1.21.60-beta.0 → 1.21.67-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/rowsOutputSplit.d.ts +25 -0
- package/dist/rowsOutputSplit.js +131 -0
- package/dist/structure.d.ts +22 -0
- package/dist/structure.js +53 -0
- package/package.json +3 -2
package/dist/rowsOutputSplit.d.ts
ADDED
@@ -0,0 +1,25 @@
+import type { StructureRowsOutputPagingCursor } from "./rowsOutputPaging";
+export type StructureSplitRowsOutputToDatasetResult = {
+    datasetId?: string;
+    rowsWritten: number;
+    nextCursor: StructureRowsOutputPagingCursor;
+    done: boolean;
+};
+/**
+ * Step:
+ * Split a sandbox-local `output.jsonl` into a child dataset (also `output.jsonl`) of up to `limit` ROW entries.
+ *
+ * Key property:
+ * - Does NOT return rows; it persists a child dataset and returns only `{ datasetId, nextCursor, done }`.
+ *
+ * This is useful for workflows where you want to batch work (e.g. 300 rows) without moving large payloads
+ * through workflow/step params.
+ */
+export declare function structureSplitRowsOutputToDatasetStep(params: {
+    env: any;
+    sandboxId: string;
+    localPath: string;
+    cursor?: Partial<StructureRowsOutputPagingCursor>;
+    limit: number;
+    childDatasetId: string;
+}): Promise<StructureSplitRowsOutputToDatasetResult>;
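A minimal usage sketch for the new step, assuming it runs inside a workflow that already has a sandbox holding the parent `output.jsonl`; `env`, `sandboxId`, `localPath`, and `newChildDatasetId()` are placeholders and imports are omitted:

    // Illustrative only: split the parent output.jsonl into ~300-row child datasets
    // until the cursor reports done. Only ids and cursors cross step boundaries.
    let cursor: Partial<StructureRowsOutputPagingCursor> | undefined;
    let done = false;
    while (!done) {
        const res = await structureSplitRowsOutputToDatasetStep({
            env,
            sandboxId,
            localPath,
            cursor,
            limit: 300,
            childDatasetId: newChildDatasetId(), // placeholder id factory
        });
        if (res.datasetId) {
            // hand res.datasetId to downstream batch processing
        }
        cursor = res.nextCursor;
        done = res.done;
    }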
package/dist/rowsOutputSplit.js
ADDED
@@ -0,0 +1,131 @@
+import { getDatasetOutputPath, getDatasetWorkstation } from "./datasetFiles";
+import { readDatasetSandboxFileStep, runDatasetSandboxCommandStep } from "./sandbox/steps";
+import { getStoryRuntime } from "./runtime";
+/**
+ * Step:
+ * Split a sandbox-local `output.jsonl` into a child dataset (also `output.jsonl`) of up to `limit` ROW entries.
+ *
+ * Key property:
+ * - Does NOT return rows; it persists a child dataset and returns only `{ datasetId, nextCursor, done }`.
+ *
+ * This is useful for workflows where you want to batch work (e.g. 300 rows) without moving large payloads
+ * through workflow/step params.
+ */
+export async function structureSplitRowsOutputToDatasetStep(params) {
+    "use step";
+    const byteOffset = params.cursor?.byteOffset ?? 0;
+    const rowOffset = params.cursor?.rowOffset ?? 0;
+    const workstation = getDatasetWorkstation(params.childDatasetId);
+    const outPath = getDatasetOutputPath(params.childDatasetId);
+    await runDatasetSandboxCommandStep({
+        env: params.env,
+        sandboxId: params.sandboxId,
+        cmd: "mkdir",
+        args: ["-p", workstation],
+    });
+    // Read from parent jsonl and write a child jsonl containing only ROW records, preserving `{ type, data }` lines.
+    const py = [
+        "import sys, json",
+        "in_path = sys.argv[1]",
+        "out_path = sys.argv[2]",
+        "byte_offset = int(sys.argv[3])",
+        "row_offset = int(sys.argv[4])",
+        "limit = int(sys.argv[5])",
+        "rows_written = 0",
+        "next_byte = byte_offset",
+        "next_row = row_offset",
+        "with open(in_path, 'rb') as f_in:",
+        "    f_in.seek(byte_offset)",
+        "    with open(out_path, 'wb') as f_out:",
+        "        while rows_written < limit:",
+        "            line = f_in.readline()",
+        "            if not line:",
+        "                break",
+        "            next_byte = f_in.tell()",
+        "            try:",
+        "                obj = json.loads(line.decode('utf-8'))",
+        "            except Exception:",
+        "                continue",
+        "            if obj.get('type') != 'row':",
+        "                continue",
+        "            f_out.write(line if line.endswith(b'\\n') else (line + b'\\n'))",
+        "            rows_written += 1",
+        "            next_row += 1",
+        "done = rows_written < limit",
+        "print(json.dumps({",
+        "    'rowsWritten': rows_written,",
+        "    'nextByteOffset': next_byte,",
+        "    'nextRowOffset': next_row,",
+        "    'done': done,",
+        "}))",
+    ].join("\n");
+    const res = await runDatasetSandboxCommandStep({
+        env: params.env,
+        sandboxId: params.sandboxId,
+        cmd: "python",
+        args: ["-c", py, params.localPath, outPath, String(byteOffset), String(rowOffset), String(params.limit)],
+    });
+    if (res.exitCode !== 0) {
+        throw new Error(res.stderr || "Failed to split rows output to dataset");
+    }
+    const parsed = JSON.parse(String(res.stdout ?? "").trim());
+    const rowsWritten = Number(parsed?.rowsWritten ?? 0);
+    const nextCursor = {
+        byteOffset: Number(parsed?.nextByteOffset ?? byteOffset),
+        rowOffset: Number(parsed?.nextRowOffset ?? rowOffset),
+    };
+    const done = Boolean(parsed?.done);
+    // No work to persist: return only paging state.
+    if (rowsWritten <= 0) {
+        return { datasetId: undefined, rowsWritten: 0, nextCursor, done: true };
+    }
+    const fileRes = await readDatasetSandboxFileStep({
+        env: params.env,
+        sandboxId: params.sandboxId,
+        path: outPath,
+    });
+    const storyRuntime = await getStoryRuntime(params.env);
+    const db = storyRuntime.db;
+    const store = storyRuntime.store;
+    const storagePath = `/structure/${params.childDatasetId}/output.jsonl`;
+    const fileBuffer = Buffer.from(fileRes.contentBase64 ?? "", "base64");
+    const uploadResult = await db.storage.uploadFile(storagePath, fileBuffer, {
+        contentType: "application/x-ndjson",
+        contentDisposition: "output.jsonl",
+    });
+    const fileId = uploadResult?.data?.id;
+    if (!fileId)
+        throw new Error("Failed to upload child dataset output file to storage");
+    const contextKey = `structure:${params.childDatasetId}`;
+    const ctx = await store.getOrCreateContext({ key: contextKey });
+    const ctxId = ctx?.id;
+    if (!ctxId)
+        throw new Error("Failed to create child dataset context");
+    // Link the output file to the context (used by DatasetService.readRecordsFromFile).
+    await db.transact([db.tx.context_contexts[ctxId].link({ structure_output_file: fileId })]);
+    // Patch metadata under `structure` namespace (never clobber Story runtime keys).
+    const existingContent = (ctx?.content ?? {});
+    const existingStructure = (existingContent?.structure ?? {});
+    const updatedAt = Date.now();
+    await store.updateContextContent({ key: contextKey }, {
+        ...existingContent,
+        structure: {
+            ...existingStructure,
+            kind: "ekairos.structure",
+            version: Number(existingStructure?.version ?? 1),
+            structureId: params.childDatasetId,
+            output: "rows",
+            updatedAt,
+            outputs: {
+                ...(existingStructure?.outputs ?? {}),
+                rows: {
+                    format: "jsonl",
+                    fileId,
+                    storagePath,
+                    rowCount: rowsWritten,
+                },
+            },
+        },
+    });
+    return { datasetId: params.childDatasetId, rowsWritten, nextCursor, done };
+}
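For reference, the child-dataset context metadata written by the step above has roughly the shape below. The field names come straight from the code; the type itself is illustrative and is not exported by the package:

    // Illustrative shape of ctx.content.structure after the split step runs.
    type ChildDatasetStructureMetadata = {
        kind: "ekairos.structure";
        version: number;          // preserved from any existing metadata, else 1
        structureId: string;      // the childDatasetId
        output: "rows";
        updatedAt: number;        // Date.now() at write time
        outputs: {
            rows: {
                format: "jsonl";
                fileId: string;       // storage id of the uploaded output.jsonl
                storagePath: string;  // `/structure/${childDatasetId}/output.jsonl`
                rowCount: number;     // rowsWritten
            };
        };
    };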
package/dist/structure.d.ts
CHANGED
@@ -18,6 +18,16 @@ export type StructureRowsReadResult = {
     cursor: StructureRowsOutputPagingCursor;
     done: boolean;
 };
+export type StructureRowsSplitResult = {
+    /**
+     * Child datasetId containing a JSONL `output.jsonl` with up to `limit` ROW entries.
+     * Omitted when there are no more rows to split.
+     */
+    datasetId?: string;
+    rowsWritten: number;
+    cursor: StructureRowsOutputPagingCursor;
+    done: boolean;
+};
 export type StructureRowsReader = {
     /**
      * Workflow-friendly rows reader.
@@ -33,6 +43,18 @@ export type StructureRowsReader = {
         cursor?: Partial<StructureRowsOutputPagingCursor>;
         limit?: number;
     }): Promise<StructureRowsReadResult>;
+    /**
+     * Split the rows output into a child dataset (jsonl) and return paging state.
+     *
+     * Unlike `read()`, this does not return `rows[]` (avoids moving payloads through params/results).
+     */
+    split(): Promise<StructureRowsSplitResult>;
+    split(cursor?: Partial<StructureRowsOutputPagingCursor>, limit?: number): Promise<StructureRowsSplitResult>;
+    split(params?: {
+        cursor?: Partial<StructureRowsOutputPagingCursor>;
+        limit?: number;
+        datasetId?: string;
+    }): Promise<StructureRowsSplitResult>;
 };
 export type StructureBuildResult = {
     datasetId: string;
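A hedged sketch of how the new `split()` overloads might be driven from a workflow, assuming a rows reader obtained from a structure build with `output: "rows"`; everything other than `reader.split()` and the result fields is a placeholder:

    let cursor: Partial<StructureRowsOutputPagingCursor> | undefined;
    let done = false;
    while (!done) {
        // Object form; the positional form reader.split(cursor, 300) is equivalent here.
        const batch = await reader.split({ cursor, limit: 300 });
        if (batch.datasetId) {
            // enqueue downstream work keyed by batch.datasetId; only the id is passed around
        }
        cursor = batch.cursor;
        done = batch.done;
    }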
package/dist/structure.js
CHANGED
@@ -1,9 +1,11 @@
 import { createStory, didToolExecute, USER_MESSAGE_TYPE, WEB_CHANNEL } from "@ekairos/story";
 import { getDatasetOutputPath, getDatasetOutputSchemaPath, getDatasetWorkstation } from "./datasetFiles";
 import { structureDownloadRowsOutputToSandboxStep, structureReadRowsOutputPageFromSandboxStep, } from "./rowsOutputPaging";
+import { structureSplitRowsOutputToDatasetStep } from "./rowsOutputSplit";
 import { createDatasetSandboxStep, readDatasetSandboxFileStep, readDatasetSandboxTextFileStep, runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep, writeDatasetSandboxTextFileStep, } from "./sandbox/steps";
 import { readInstantFileStep } from "./file/steps";
 import { structureGetContextStep, structureGetContextWithRowsOutputFileStep, structureReadRowsOutputJsonlStep, } from "./dataset/steps";
+import { getWorkflowMetadata } from "workflow";
 import { buildStructurePrompt } from "./prompts";
 import { createExecuteCommandTool } from "./executeCommand.tool";
 import { createGenerateSchemaTool } from "./generateSchema.tool";
@@ -20,6 +22,21 @@ function createUuidV4() {
         return v.toString(16);
     });
 }
+function assertRunningInsideWorkflow(params) {
+    try {
+        const meta = getWorkflowMetadata();
+        const runId = meta?.workflowRunId;
+        if (!runId) {
+            throw new Error("Missing workflowRunId");
+        }
+        return meta;
+    }
+    catch (e) {
+        const msg = e instanceof Error ? e.message : String(e);
+        throw new Error(`@ekairos/structure: structure().build() must be called from a "use workflow" function. ` +
+            `datasetId=${params.datasetId}. getWorkflowMetadata failed: ${msg}`);
+    }
+}
 function guessTextFileExtension(mimeType, name) {
     const n = String(name ?? "").toLowerCase();
     if (n.includes("."))
@@ -324,6 +341,9 @@ export function structure(env, opts) {
     console.log("[ekairos/structure] structure.ts build mode", mode);
     console.log("[ekairos/structure] structure.ts build output", output);
     console.log("[ekairos/structure] structure.ts build sourcesCount", sources.length);
+    // Guardrail: structure build MUST run inside workflow runtime ("use workflow").
+    const workflowMeta = assertRunningInsideWorkflow({ datasetId });
+    void workflowMeta?.workflowRunId;
     const contextKey = `structure:${datasetId}`;
     const storyConfig = {
         datasetId,
@@ -464,6 +484,39 @@ export function structure(env, opts) {
                 done: page.done,
             };
         },
+        split: async (cursorOrParams, limit) => {
+            if (output !== "rows") {
+                throw new Error("reader.split() is only supported for output=rows");
+            }
+            if (!rowsSandboxRef) {
+                rowsSandboxRef = await structureDownloadRowsOutputToSandboxStep({
+                    env,
+                    structureId: datasetId,
+                });
+            }
+            const params = cursorOrParams && typeof cursorOrParams === "object" && ("cursor" in cursorOrParams || "limit" in cursorOrParams)
+                ? cursorOrParams
+                : {
+                    cursor: cursorOrParams,
+                    limit,
+                    datasetId: undefined,
+                };
+            const childDatasetId = params?.datasetId ?? createUuidV4();
+            const res = await structureSplitRowsOutputToDatasetStep({
+                env,
+                sandboxId: rowsSandboxRef.sandboxId,
+                localPath: rowsSandboxRef.localPath,
+                cursor: params?.cursor,
+                limit: params?.limit ?? 300,
+                childDatasetId,
+            });
+            return {
+                datasetId: res.datasetId,
+                rowsWritten: res.rowsWritten,
+                cursor: res.nextCursor,
+                done: res.done,
+            };
+        },
     };
     console.log("[ekairos/structure] structure.ts build ok");
     return output === "object" ? { datasetId, reader, dataset: ctx } : { datasetId, reader };
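Because of the new guardrail, a build is expected to originate from a `"use workflow"` function. A minimal sketch under that assumption, with the builder options elided and every name other than `structure`, `"use workflow"`, and the returned `{ datasetId, reader }` treated as a placeholder:

    // Hypothetical workflow entry point; build options are elided, exact structure()/build()
    // signatures are assumptions, not taken from the package docs.
    export async function buildAndSplit(env: unknown) {
        "use workflow";
        // The guardrail passes here because getWorkflowMetadata() yields a workflowRunId.
        // Calling build() from a plain function now throws:
        //   @ekairos/structure: structure().build() must be called from a "use workflow" function. ...
        const { datasetId, reader } = await structure(env, /* opts */ {}).build(/* build params elided */);
        const firstBatch = await reader.split({ limit: 300 });
        return { datasetId, firstChildDatasetId: firstBatch.datasetId };
    }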
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@ekairos/structure",
-  "version": "1.21.60-beta.0",
+  "version": "1.21.67-beta.0",
   "description": "Ekairos Structure - Unified structured extraction (rows or object) from file/text/dataset inputs",
   "type": "module",
   "main": "dist/index.js",
@@ -36,12 +36,13 @@
     "typecheck": "tsc --noEmit"
   },
   "dependencies": {
-    "@ekairos/domain": "^1.21.
+    "@ekairos/domain": "^1.21.67-beta.0",
     "@ekairos/sandbox": "^1.21.60-beta.0",
     "@instantdb/admin": "^0.22.13",
     "@instantdb/core": "^0.22.13",
     "ai": "^5.0.95",
     "ajv": "^8.17.1",
+    "workflow": "4.0.1-beta.41",
     "zod": "^4.1.8"
   },
   "peerDependencies": {