@ekairos/structure 1.21.60-beta.0 → 1.21.67-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/rowsOutputSplit.d.ts +25 -0
- package/dist/rowsOutputSplit.js +131 -0
- package/dist/structure.d.ts +22 -0
- package/dist/structure.js +53 -0
- package/package.json +3 -2
package/dist/rowsOutputSplit.d.ts
ADDED
@@ -0,0 +1,25 @@
+import type { StructureRowsOutputPagingCursor } from "./rowsOutputPaging";
+export type StructureSplitRowsOutputToDatasetResult = {
+    datasetId?: string;
+    rowsWritten: number;
+    nextCursor: StructureRowsOutputPagingCursor;
+    done: boolean;
+};
+/**
+ * Step:
+ * Split a sandbox-local `output.jsonl` into a child dataset (also `output.jsonl`) of up to `limit` ROW entries.
+ *
+ * Key property:
+ * - Does NOT return rows; it persists a child dataset and returns only `{ datasetId, nextCursor, done }`.
+ *
+ * This is useful for workflows where you want to batch work (e.g. 300 rows) without moving large payloads
+ * through workflow/step params.
+ */
+export declare function structureSplitRowsOutputToDatasetStep(params: {
+    env: any;
+    sandboxId: string;
+    localPath: string;
+    cursor?: Partial<StructureRowsOutputPagingCursor>;
+    limit: number;
+    childDatasetId: string;
+}): Promise<StructureSplitRowsOutputToDatasetResult>;
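A minimal usage sketch for the new step, assuming it runs inside a workflow that already has a sandbox holding the parent `output.jsonl`; `env`, `sandboxId`, `localPath`, and `newChildDatasetId()` are placeholders and imports are omitted:

    // Illustrative only: split the parent output.jsonl into ~300-row child datasets
    // until the cursor reports done. Only ids and cursors cross step boundaries.
    let cursor: Partial<StructureRowsOutputPagingCursor> | undefined;
    let done = false;
    while (!done) {
        const res = await structureSplitRowsOutputToDatasetStep({
            env,
            sandboxId,
            localPath,
            cursor,
            limit: 300,
            childDatasetId: newChildDatasetId(), // placeholder id factory
        });
        if (res.datasetId) {
            // hand res.datasetId to downstream batch processing
        }
        cursor = res.nextCursor;
        done = res.done;
    }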
package/dist/rowsOutputSplit.js
ADDED
@@ -0,0 +1,131 @@
+import { getDatasetOutputPath, getDatasetWorkstation } from "./datasetFiles";
+import { readDatasetSandboxFileStep, runDatasetSandboxCommandStep } from "./sandbox/steps";
+import { getStoryRuntime } from "./runtime";
+/**
+ * Step:
+ * Split a sandbox-local `output.jsonl` into a child dataset (also `output.jsonl`) of up to `limit` ROW entries.
+ *
+ * Key property:
+ * - Does NOT return rows; it persists a child dataset and returns only `{ datasetId, nextCursor, done }`.
+ *
+ * This is useful for workflows where you want to batch work (e.g. 300 rows) without moving large payloads
+ * through workflow/step params.
+ */
+export async function structureSplitRowsOutputToDatasetStep(params) {
+    "use step";
+    const byteOffset = params.cursor?.byteOffset ?? 0;
+    const rowOffset = params.cursor?.rowOffset ?? 0;
+    const workstation = getDatasetWorkstation(params.childDatasetId);
+    const outPath = getDatasetOutputPath(params.childDatasetId);
+    await runDatasetSandboxCommandStep({
+        env: params.env,
+        sandboxId: params.sandboxId,
+        cmd: "mkdir",
+        args: ["-p", workstation],
+    });
+    // Read from parent jsonl and write a child jsonl containing only ROW records, preserving `{ type, data }` lines.
+    const py = [
+        "import sys, json",
+        "in_path = sys.argv[1]",
+        "out_path = sys.argv[2]",
+        "byte_offset = int(sys.argv[3])",
+        "row_offset = int(sys.argv[4])",
+        "limit = int(sys.argv[5])",
+        "rows_written = 0",
+        "next_byte = byte_offset",
+        "next_row = row_offset",
+        "with open(in_path, 'rb') as f_in:",
+        "    f_in.seek(byte_offset)",
+        "    with open(out_path, 'wb') as f_out:",
+        "        while rows_written < limit:",
+        "            line = f_in.readline()",
+        "            if not line:",
+        "                break",
+        "            next_byte = f_in.tell()",
+        "            try:",
+        "                obj = json.loads(line.decode('utf-8'))",
+        "            except Exception:",
+        "                continue",
+        "            if obj.get('type') != 'row':",
+        "                continue",
+        "            f_out.write(line if line.endswith(b'\\n') else (line + b'\\n'))",
+        "            rows_written += 1",
+        "            next_row += 1",
+        "done = rows_written < limit",
+        "print(json.dumps({",
+        "    'rowsWritten': rows_written,",
+        "    'nextByteOffset': next_byte,",
+        "    'nextRowOffset': next_row,",
+        "    'done': done,",
+        "}))",
+    ].join("\n");
+    const res = await runDatasetSandboxCommandStep({
+        env: params.env,
+        sandboxId: params.sandboxId,
+        cmd: "python",
+        args: ["-c", py, params.localPath, outPath, String(byteOffset), String(rowOffset), String(params.limit)],
+    });
+    if (res.exitCode !== 0) {
+        throw new Error(res.stderr || "Failed to split rows output to dataset");
+    }
+    const parsed = JSON.parse(String(res.stdout ?? "").trim());
+    const rowsWritten = Number(parsed?.rowsWritten ?? 0);
+    const nextCursor = {
+        byteOffset: Number(parsed?.nextByteOffset ?? byteOffset),
+        rowOffset: Number(parsed?.nextRowOffset ?? rowOffset),
+    };
+    const done = Boolean(parsed?.done);
+    // No work to persist: return only paging state.
+    if (rowsWritten <= 0) {
+        return { datasetId: undefined, rowsWritten: 0, nextCursor, done: true };
+    }
+    const fileRes = await readDatasetSandboxFileStep({
+        env: params.env,
+        sandboxId: params.sandboxId,
+        path: outPath,
+    });
+    const storyRuntime = await getStoryRuntime(params.env);
+    const db = storyRuntime.db;
+    const store = storyRuntime.store;
+    const storagePath = `/structure/${params.childDatasetId}/output.jsonl`;
+    const fileBuffer = Buffer.from(fileRes.contentBase64 ?? "", "base64");
+    const uploadResult = await db.storage.uploadFile(storagePath, fileBuffer, {
+        contentType: "application/x-ndjson",
+        contentDisposition: "output.jsonl",
+    });
+    const fileId = uploadResult?.data?.id;
+    if (!fileId)
+        throw new Error("Failed to upload child dataset output file to storage");
+    const contextKey = `structure:${params.childDatasetId}`;
+    const ctx = await store.getOrCreateContext({ key: contextKey });
+    const ctxId = ctx?.id;
+    if (!ctxId)
+        throw new Error("Failed to create child dataset context");
+    // Link the output file to the context (used by DatasetService.readRecordsFromFile).
+    await db.transact([db.tx.context_contexts[ctxId].link({ structure_output_file: fileId })]);
+    // Patch metadata under `structure` namespace (never clobber Story runtime keys).
+    const existingContent = (ctx?.content ?? {});
+    const existingStructure = (existingContent?.structure ?? {});
+    const updatedAt = Date.now();
+    await store.updateContextContent({ key: contextKey }, {
+        ...existingContent,
+        structure: {
+            ...existingStructure,
+            kind: "ekairos.structure",
+            version: Number(existingStructure?.version ?? 1),
+            structureId: params.childDatasetId,
+            output: "rows",
+            updatedAt,
+            outputs: {
+                ...(existingStructure?.outputs ?? {}),
+                rows: {
+                    format: "jsonl",
+                    fileId,
+                    storagePath,
+                    rowCount: rowsWritten,
+                },
+            },
+        },
+    });
+    return { datasetId: params.childDatasetId, rowsWritten, nextCursor, done };
+}
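For reference, the child-dataset context metadata written by the step above has roughly the shape below. The field names come straight from the code; the type itself is illustrative and is not exported by the package:

    // Illustrative shape of ctx.content.structure after the split step runs.
    type ChildDatasetStructureMetadata = {
        kind: "ekairos.structure";
        version: number;          // preserved from any existing metadata, else 1
        structureId: string;      // the childDatasetId
        output: "rows";
        updatedAt: number;        // Date.now() at write time
        outputs: {
            rows: {
                format: "jsonl";
                fileId: string;       // storage id of the uploaded output.jsonl
                storagePath: string;  // `/structure/${childDatasetId}/output.jsonl`
                rowCount: number;     // rowsWritten
            };
        };
    };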
package/dist/structure.d.ts
CHANGED
@@ -18,6 +18,16 @@ export type StructureRowsReadResult = {
     cursor: StructureRowsOutputPagingCursor;
     done: boolean;
 };
+export type StructureRowsSplitResult = {
+    /**
+     * Child datasetId containing a JSONL `output.jsonl` with up to `limit` ROW entries.
+     * Omitted when there are no more rows to split.
+     */
+    datasetId?: string;
+    rowsWritten: number;
+    cursor: StructureRowsOutputPagingCursor;
+    done: boolean;
+};
 export type StructureRowsReader = {
     /**
      * Workflow-friendly rows reader.
@@ -33,6 +43,18 @@ export type StructureRowsReader = {
         cursor?: Partial<StructureRowsOutputPagingCursor>;
         limit?: number;
     }): Promise<StructureRowsReadResult>;
+    /**
+     * Split the rows output into a child dataset (jsonl) and return paging state.
+     *
+     * Unlike `read()`, this does not return `rows[]` (avoids moving payloads through params/results).
+     */
+    split(): Promise<StructureRowsSplitResult>;
+    split(cursor?: Partial<StructureRowsOutputPagingCursor>, limit?: number): Promise<StructureRowsSplitResult>;
+    split(params?: {
+        cursor?: Partial<StructureRowsOutputPagingCursor>;
+        limit?: number;
+        datasetId?: string;
+    }): Promise<StructureRowsSplitResult>;
 };
 export type StructureBuildResult = {
     datasetId: string;
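A hedged sketch of how the new `split()` overloads might be driven from a workflow, assuming a rows reader obtained from a structure build with `output: "rows"`; everything other than `reader.split()` and the result fields is a placeholder:

    let cursor: Partial<StructureRowsOutputPagingCursor> | undefined;
    let done = false;
    while (!done) {
        // Object form; the positional form reader.split(cursor, 300) is equivalent here.
        const batch = await reader.split({ cursor, limit: 300 });
        if (batch.datasetId) {
            // enqueue downstream work keyed by batch.datasetId; only the id is passed around
        }
        cursor = batch.cursor;
        done = batch.done;
    }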
package/dist/structure.js
CHANGED
@@ -1,9 +1,11 @@
 import { createStory, didToolExecute, USER_MESSAGE_TYPE, WEB_CHANNEL } from "@ekairos/story";
 import { getDatasetOutputPath, getDatasetOutputSchemaPath, getDatasetWorkstation } from "./datasetFiles";
 import { structureDownloadRowsOutputToSandboxStep, structureReadRowsOutputPageFromSandboxStep, } from "./rowsOutputPaging";
+import { structureSplitRowsOutputToDatasetStep } from "./rowsOutputSplit";
 import { createDatasetSandboxStep, readDatasetSandboxFileStep, readDatasetSandboxTextFileStep, runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep, writeDatasetSandboxTextFileStep, } from "./sandbox/steps";
 import { readInstantFileStep } from "./file/steps";
 import { structureGetContextStep, structureGetContextWithRowsOutputFileStep, structureReadRowsOutputJsonlStep, } from "./dataset/steps";
+import { getWorkflowMetadata } from "workflow";
 import { buildStructurePrompt } from "./prompts";
 import { createExecuteCommandTool } from "./executeCommand.tool";
 import { createGenerateSchemaTool } from "./generateSchema.tool";
@@ -20,6 +22,21 @@ function createUuidV4() {
         return v.toString(16);
     });
 }
+function assertRunningInsideWorkflow(params) {
+    try {
+        const meta = getWorkflowMetadata();
+        const runId = meta?.workflowRunId;
+        if (!runId) {
+            throw new Error("Missing workflowRunId");
+        }
+        return meta;
+    }
+    catch (e) {
+        const msg = e instanceof Error ? e.message : String(e);
+        throw new Error(`@ekairos/structure: structure().build() must be called from a "use workflow" function. ` +
+            `datasetId=${params.datasetId}. getWorkflowMetadata failed: ${msg}`);
+    }
+}
 function guessTextFileExtension(mimeType, name) {
     const n = String(name ?? "").toLowerCase();
     if (n.includes("."))
@@ -324,6 +341,9 @@ export function structure(env, opts) {
     console.log("[ekairos/structure] structure.ts build mode", mode);
     console.log("[ekairos/structure] structure.ts build output", output);
     console.log("[ekairos/structure] structure.ts build sourcesCount", sources.length);
+    // Guardrail: structure build MUST run inside workflow runtime ("use workflow").
+    const workflowMeta = assertRunningInsideWorkflow({ datasetId });
+    void workflowMeta?.workflowRunId;
     const contextKey = `structure:${datasetId}`;
     const storyConfig = {
         datasetId,
@@ -464,6 +484,39 @@ export function structure(env, opts) {
                 done: page.done,
             };
         },
+        split: async (cursorOrParams, limit) => {
+            if (output !== "rows") {
+                throw new Error("reader.split() is only supported for output=rows");
+            }
+            if (!rowsSandboxRef) {
+                rowsSandboxRef = await structureDownloadRowsOutputToSandboxStep({
+                    env,
+                    structureId: datasetId,
+                });
+            }
+            const params = cursorOrParams && typeof cursorOrParams === "object" && ("cursor" in cursorOrParams || "limit" in cursorOrParams)
+                ? cursorOrParams
+                : {
+                    cursor: cursorOrParams,
+                    limit,
+                    datasetId: undefined,
+                };
+            const childDatasetId = params?.datasetId ?? createUuidV4();
+            const res = await structureSplitRowsOutputToDatasetStep({
+                env,
+                sandboxId: rowsSandboxRef.sandboxId,
+                localPath: rowsSandboxRef.localPath,
+                cursor: params?.cursor,
+                limit: params?.limit ?? 300,
+                childDatasetId,
+            });
+            return {
+                datasetId: res.datasetId,
+                rowsWritten: res.rowsWritten,
+                cursor: res.nextCursor,
+                done: res.done,
+            };
+        },
     };
     console.log("[ekairos/structure] structure.ts build ok");
     return output === "object" ? { datasetId, reader, dataset: ctx } : { datasetId, reader };
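Because of the new guardrail, a build is expected to originate from a `"use workflow"` function. A minimal sketch under that assumption, with the builder options elided and every name other than `structure`, `"use workflow"`, and the returned `{ datasetId, reader }` treated as a placeholder:

    // Hypothetical workflow entry point; build options are elided, exact structure()/build()
    // signatures are assumptions, not taken from the package docs.
    export async function buildAndSplit(env: unknown) {
        "use workflow";
        // The guardrail passes here because getWorkflowMetadata() yields a workflowRunId.
        // Calling build() from a plain function now throws:
        //   @ekairos/structure: structure().build() must be called from a "use workflow" function. ...
        const { datasetId, reader } = await structure(env, /* opts */ {}).build(/* build params elided */);
        const firstBatch = await reader.split({ limit: 300 });
        return { datasetId, firstChildDatasetId: firstBatch.datasetId };
    }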
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@ekairos/structure",
-  "version": "1.21.60-beta.0",
+  "version": "1.21.67-beta.0",
   "description": "Ekairos Structure - Unified structured extraction (rows or object) from file/text/dataset inputs",
   "type": "module",
   "main": "dist/index.js",
@@ -36,12 +36,13 @@
     "typecheck": "tsc --noEmit"
   },
   "dependencies": {
-    "@ekairos/domain": "^1.21.
+    "@ekairos/domain": "^1.21.67-beta.0",
     "@ekairos/sandbox": "^1.21.60-beta.0",
     "@instantdb/admin": "^0.22.13",
     "@instantdb/core": "^0.22.13",
     "ai": "^5.0.95",
     "ajv": "^8.17.1",
+    "workflow": "4.0.1-beta.41",
     "zod": "^4.1.8"
   },
   "peerDependencies": {