@ekairos/dataset 1.22.34-beta.development.0 → 1.22.35
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +347 -0
- package/dist/agents.d.ts +8 -0
- package/dist/agents.js +8 -0
- package/dist/builder/agentMaterializers.d.ts +9 -0
- package/dist/builder/agentMaterializers.js +10 -0
- package/dist/builder/context.d.ts +15 -0
- package/dist/builder/context.js +251 -0
- package/dist/builder/instructions.d.ts +5 -0
- package/dist/builder/instructions.js +40 -0
- package/dist/builder/materialize.d.ts +83 -0
- package/dist/builder/materialize.js +548 -0
- package/dist/builder/materializeQuery.d.ts +12 -0
- package/dist/builder/materializeQuery.js +31 -0
- package/dist/builder/persistence.d.ts +22 -0
- package/dist/builder/persistence.js +153 -0
- package/dist/builder/rows.d.ts +7 -0
- package/dist/builder/rows.js +56 -0
- package/dist/builder/schemaInference.d.ts +3 -0
- package/dist/builder/schemaInference.js +61 -0
- package/dist/builder/types.d.ts +140 -0
- package/dist/builder/types.js +1 -0
- package/dist/clearDataset.tool.d.ts +2 -3
- package/dist/clearDataset.tool.js +13 -17
- package/dist/completeDataset.steps.d.ts +117 -0
- package/dist/completeDataset.steps.js +487 -0
- package/dist/completeDataset.tool.d.ts +132 -7
- package/dist/completeDataset.tool.js +46 -192
- package/dist/contextResources.d.ts +31 -0
- package/dist/contextResources.js +151 -0
- package/dist/contextWorkspace.d.ts +79 -0
- package/dist/contextWorkspace.js +234 -0
- package/dist/dataset/steps.d.ts +39 -15
- package/dist/dataset/steps.js +96 -39
- package/dist/dataset.d.ts +3 -67
- package/dist/dataset.js +129 -520
- package/dist/datasetFiles.d.ts +5 -1
- package/dist/datasetFiles.js +29 -27
- package/dist/domain.d.ts +1 -2
- package/dist/domain.js +1 -6
- package/dist/executeCommand.tool.d.ts +2 -30
- package/dist/executeCommand.tool.js +165 -39
- package/dist/file/file-dataset.agent.d.ts +19 -56
- package/dist/file/file-dataset.agent.js +176 -132
- package/dist/file/file-dataset.steps.d.ts +27 -0
- package/dist/file/file-dataset.steps.js +47 -0
- package/dist/file/file-dataset.types.d.ts +64 -0
- package/dist/file/file-dataset.types.js +1 -0
- package/dist/file/filepreview.d.ts +5 -35
- package/dist/file/filepreview.js +60 -107
- package/dist/file/filepreview.types.d.ts +31 -0
- package/dist/file/filepreview.types.js +1 -0
- package/dist/file/generateSchema.tool.d.ts +2 -3
- package/dist/file/generateSchema.tool.js +11 -15
- package/dist/file/index.d.ts +1 -2
- package/dist/file/index.js +1 -18
- package/dist/file/prompts.d.ts +2 -3
- package/dist/file/prompts.js +134 -27
- package/dist/file/scripts.generated.d.ts +1 -0
- package/dist/file/scripts.generated.js +11 -0
- package/dist/file/steps.d.ts +1 -2
- package/dist/file/steps.js +9 -7
- package/dist/id.d.ts +1 -0
- package/dist/id.js +10 -0
- package/dist/index.d.ts +8 -7
- package/dist/index.js +8 -23
- package/dist/materializeDataset.tool.d.ts +52 -32
- package/dist/materializeDataset.tool.js +81 -65
- package/dist/query/index.d.ts +1 -2
- package/dist/query/index.js +1 -18
- package/dist/query/queryDomain.d.ts +3 -4
- package/dist/query/queryDomain.js +3 -40
- package/dist/query/queryDomain.step.d.ts +1 -1
- package/dist/query/queryDomain.step.js +13 -13
- package/dist/sandbox/steps.d.ts +23 -15
- package/dist/sandbox/steps.js +73 -76
- package/dist/sandbox.steps.d.ts +1 -2
- package/dist/sandbox.steps.js +1 -18
- package/dist/schema.d.ts +13 -13
- package/dist/schema.js +25 -37
- package/dist/service.d.ts +8 -5
- package/dist/service.js +70 -15
- package/dist/skill.d.ts +0 -1
- package/dist/skill.js +12 -17
- package/dist/transform/filepreview.d.ts +2 -3
- package/dist/transform/filepreview.js +9 -26
- package/dist/transform/index.d.ts +2 -3
- package/dist/transform/index.js +2 -8
- package/dist/transform/prompts.d.ts +1 -34
- package/dist/transform/prompts.js +58 -43
- package/dist/transform/transform-dataset.agent.d.ts +20 -45
- package/dist/transform/transform-dataset.agent.js +146 -89
- package/dist/transform/transform-dataset.steps.d.ts +30 -0
- package/dist/transform/transform-dataset.steps.js +61 -0
- package/dist/transform/transform-dataset.types.d.ts +95 -0
- package/dist/transform/transform-dataset.types.js +1 -0
- package/dist/transform/transformDataset.d.ts +3 -3
- package/dist/transform/transformDataset.js +15 -18
- package/dist/writeDatasetRows.tool.d.ts +188 -0
- package/dist/writeDatasetRows.tool.js +258 -0
- package/package.json +36 -11
- package/dist/clearDataset.tool.d.ts.map +0 -1
- package/dist/clearDataset.tool.js.map +0 -1
- package/dist/completeDataset.tool.d.ts.map +0 -1
- package/dist/completeDataset.tool.js.map +0 -1
- package/dist/dataset/steps.d.ts.map +0 -1
- package/dist/dataset/steps.js.map +0 -1
- package/dist/dataset.d.ts.map +0 -1
- package/dist/dataset.js.map +0 -1
- package/dist/datasetFiles.d.ts.map +0 -1
- package/dist/datasetFiles.js.map +0 -1
- package/dist/domain.d.ts.map +0 -1
- package/dist/domain.js.map +0 -1
- package/dist/executeCommand.tool.d.ts.map +0 -1
- package/dist/executeCommand.tool.js.map +0 -1
- package/dist/file/file-dataset.agent.d.ts.map +0 -1
- package/dist/file/file-dataset.agent.js.map +0 -1
- package/dist/file/filepreview.d.ts.map +0 -1
- package/dist/file/filepreview.js.map +0 -1
- package/dist/file/generateSchema.tool.d.ts.map +0 -1
- package/dist/file/generateSchema.tool.js.map +0 -1
- package/dist/file/index.d.ts.map +0 -1
- package/dist/file/index.js.map +0 -1
- package/dist/file/prompts.d.ts.map +0 -1
- package/dist/file/prompts.js.map +0 -1
- package/dist/file/steps.d.ts.map +0 -1
- package/dist/file/steps.js.map +0 -1
- package/dist/index.d.ts.map +0 -1
- package/dist/index.js.map +0 -1
- package/dist/materializeDataset.tool.d.ts.map +0 -1
- package/dist/materializeDataset.tool.js.map +0 -1
- package/dist/query/index.d.ts.map +0 -1
- package/dist/query/index.js.map +0 -1
- package/dist/query/queryDomain.d.ts.map +0 -1
- package/dist/query/queryDomain.js.map +0 -1
- package/dist/query/queryDomain.step.d.ts.map +0 -1
- package/dist/query/queryDomain.step.js.map +0 -1
- package/dist/sandbox/steps.d.ts.map +0 -1
- package/dist/sandbox/steps.js.map +0 -1
- package/dist/sandbox.steps.d.ts.map +0 -1
- package/dist/sandbox.steps.js.map +0 -1
- package/dist/schema.d.ts.map +0 -1
- package/dist/schema.js.map +0 -1
- package/dist/service.d.ts.map +0 -1
- package/dist/service.js.map +0 -1
- package/dist/skill.d.ts.map +0 -1
- package/dist/skill.js.map +0 -1
- package/dist/transform/filepreview.d.ts.map +0 -1
- package/dist/transform/filepreview.js.map +0 -1
- package/dist/transform/index.d.ts.map +0 -1
- package/dist/transform/index.js.map +0 -1
- package/dist/transform/prompts.d.ts.map +0 -1
- package/dist/transform/prompts.js.map +0 -1
- package/dist/transform/transform-dataset.agent.d.ts.map +0 -1
- package/dist/transform/transform-dataset.agent.js.map +0 -1
- package/dist/transform/transformDataset.d.ts.map +0 -1
- package/dist/transform/transformDataset.js.map +0 -1
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/**
 * Picks the default agent instructions for building a dataset from a file resource.
 *
 * @param schema - Optional output schema; any truthy value selects the schema-bound wording.
 * @returns The instruction sentence to hand to the dataset agent.
 */
export function buildFileDefaultInstructions(schema) {
    return schema
        ? "Create a dataset from the resource file and ensure each output row matches the provided dataset schema exactly."
        : "Create a dataset representing the resource content as structured rows.";
}
|
|
7
|
+
/**
 * Builds the instructions used when a resource is materialized "raw",
 * i.e. without business transformations.
 *
 * @param resourceKind - Resource kind; only `"text"` gets the text-specific wording,
 *   every other value gets the file wording.
 * @returns The instruction sentence for the raw materialization.
 */
export function buildRawResourceInstructions(resourceKind) {
    const isInlineText = resourceKind === "text";
    return isInlineText
        ? "Create a dataset representing the raw text content as structured rows without applying business transformations."
        : "Create a dataset representing the raw file content as structured rows without applying business transformations.";
}
|
|
13
|
+
/**
 * Resolves the instructions for a transform run.
 *
 * Non-blank user instructions always win (trimmed). Otherwise a default is chosen
 * from two independent switches: more than one input resource ("combine" vs
 * "transform") and whether an output schema was supplied.
 *
 * @param resourceCount - Number of input resources.
 * @param userInstructions - Caller-provided instructions; nullish or blank means "none".
 * @param schema - Optional output schema; any truthy value selects the schema-bound wording.
 * @returns The instruction text for the transform agent.
 */
export function buildTransformInstructions(resourceCount, userInstructions, schema) {
    const custom = String(userInstructions ?? "").trim();
    if (custom.length > 0) {
        return custom;
    }
    const hasSchema = Boolean(schema);
    if (resourceCount > 1) {
        return hasSchema
            ? "Combine the input datasets into a new dataset that matches the provided output schema exactly."
            : "Combine the input datasets into one coherent dataset.";
    }
    return hasSchema
        ? "Transform the input dataset into a new dataset that matches the provided output schema exactly."
        : "Transform the input dataset into a new useful dataset.";
}
|
|
28
|
+
/**
 * Appends the fixed "object output mode" contract to the caller's instructions.
 *
 * @param userInstructions - Caller-provided instructions; nullish or blank means "none".
 * @returns The contract alone when there are no user instructions, otherwise the
 *   trimmed user instructions, a blank line, and then the contract.
 */
export function buildObjectOutputInstructions(userInstructions) {
    const contractLines = [
        "Output mode is object.",
        "Produce exactly one final object.",
        "completeObject({ data: <the final object>, summary }) is available to complete the dataset directly.",
        "If you use output.jsonl instead, produce exactly one row: {\"type\":\"row\",\"data\":<the final object>}.",
        "Do not emit multiple rows, headers, summaries, or metadata rows.",
    ];
    const contract = contractLines.join("\n");
    const preamble = String(userInstructions ?? "").trim();
    return preamble ? `${preamble}\n\n${contract}` : contract;
}
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
import type { AnyDatasetRuntime, DatasetBuilderState, DatasetSchemaInput, InternalDatasetResource } from "./types.js";
|
|
2
|
+
import type { SandboxState } from "../file/file-dataset.types.js";
|
|
3
|
+
import type { FilePreviewContext } from "../file/filepreview.types.js";
|
|
4
|
+
import type { TransformSandboxState, TransformInputPreviewContext } from "../transform/transform-dataset.types.js";
|
|
5
|
+
/**
 * Resolves the effective "durable" flag for a dataset agent run.
 *
 * @param requestedDurable - Durable mode requested by the caller; optional.
 * @returns A promise for the durable flag that should actually be used.
 */
export declare function resolveDatasetAgentDurable(requestedDurable?: boolean): Promise<boolean>;
|
|
6
|
+
/**
 * Prepared context for building a dataset from a single file-backed resource.
 * Discriminated from the transform variant by `kind: "file"`.
 */
type PreparedFileDatasetContext = {
    kind: "file";
    datasetId: string;
    sandboxId: string;
    /** Id of the stored file the dataset is built from. */
    fileId: string;
    sandboxState: SandboxState;
    filePreview?: FilePreviewContext;
    /** Optional output schema; `null` is also accepted. */
    schema?: DatasetSchemaInput | null;
    filename?: string;
    mediaType?: string;
};
|
|
17
|
+
/**
 * Prepared context for deriving a dataset from one or more input datasets.
 * Discriminated from the file variant by `kind: "transform"`.
 */
type PreparedTransformDatasetContext = {
    kind: "transform";
    datasetId: string;
    sandboxId: string;
    /** Ids of the datasets used as transform inputs. */
    inputDatasetIds: string[];
    /** Schema the transform output must follow (required for transforms). */
    outputSchema: DatasetSchemaInput;
    sandboxState: TransformSandboxState;
    /** Optional per-input previews, each tagged with the dataset id it belongs to. */
    inputPreviews?: Array<{
        datasetId: string;
        preview: TransformInputPreviewContext;
    }>;
};
|
|
29
|
+
/** Either prepared context variant; narrow on the `kind` discriminant. */
type PreparedDatasetContext = PreparedFileDatasetContext | PreparedTransformDatasetContext;
|
|
30
|
+
/**
 * A prepared dataset context extended with the agent prompt and the
 * (optional) instructions to run it with.
 */
type DatasetContextInitialization = PreparedDatasetContext & {
    prompt: string;
    instructions?: string;
};
|
|
34
|
+
/**
 * Step that initializes the metadata record of a dataset that is about to be built.
 *
 * @param params - Runtime plus the dataset's identity (`datasetId`, `sandboxId`,
 *   `contextId`) and optional `title`, `instructions` and `schema`.
 * @returns A promise for the `datasetId` / `sandboxId` pair of the initialized dataset.
 */
export declare function initializeDatasetStep<Runtime extends AnyDatasetRuntime>(params: {
    runtime: Runtime;
    datasetId: string;
    sandboxId: string;
    title?: string;
    instructions?: string;
    contextId: string;
    schema?: DatasetSchemaInput;
}): Promise<{
    datasetId: string;
    sandboxId: string;
}>;
|
|
46
|
+
/**
 * Step that turns build inputs into a prepared dataset context.
 *
 * The parameter is a union discriminated by `kind`:
 * - `"file"`: a single `file` or `text` resource, with an optional schema.
 * - `"transform"`: a list of input dataset ids plus the required output schema.
 *
 * @returns A promise for the prepared context of the matching kind.
 */
export declare function prepareDatasetResourcesStep<Runtime extends AnyDatasetRuntime>(params: {
    kind: "file";
    runtime: Runtime;
    datasetId: string;
    sandboxId: string;
    resource: Extract<InternalDatasetResource, {
        kind: "file" | "text";
    }>;
    schema?: DatasetSchemaInput;
} | {
    kind: "transform";
    runtime: Runtime;
    datasetId: string;
    sandboxId: string;
    inputDatasetIds: string[];
    outputSchema: DatasetSchemaInput;
}): Promise<PreparedDatasetContext>;
|
|
63
|
+
/**
 * Step that completes a prepared context with the prompt and instructions
 * the dataset agent will run with.
 *
 * @param params - The prepared context plus optional caller `instructions`
 *   and optional `outputSchema`.
 * @returns A promise for the prepared context extended with `prompt` and `instructions`.
 */
export declare function initializeDatasetContextStep(params: {
    prepared: PreparedDatasetContext;
    instructions?: string;
    outputSchema?: DatasetSchemaInput;
}): Promise<DatasetContextInitialization>;
|
|
68
|
+
/**
 * Step that collects the final view of a built dataset.
 *
 * @param params - Runtime, the dataset id, an optional `schema`, and the
 *   `first` flag (presumably controls whether `firstRow` is populated — confirm
 *   against the implementation).
 * @returns A promise for the dataset record, its preview rows and the first row.
 */
export declare function completeDatasetStep<Runtime extends AnyDatasetRuntime>(params: {
    runtime: Runtime;
    datasetId: string;
    schema?: DatasetSchemaInput;
    first: boolean;
}): Promise<{
    datasetId: string;
    dataset: any;
    previewRows: any[];
    firstRow: any;
}>;
|
|
79
|
+
/**
 * Materializes one `file` or `text` resource into the dataset identified by
 * `targetDatasetId`.
 *
 * @param state - Builder state for the current dataset build.
 * @param resource - The `file` / `text` resource to materialize.
 * @param targetDatasetId - Id of the dataset to write into.
 * @returns A promise for a dataset id string.
 */
export declare function materializeSingleFileLikeResource<Runtime extends AnyDatasetRuntime>(state: DatasetBuilderState<Runtime>, resource: Extract<InternalDatasetResource, {
    kind: "file" | "text";
}>, targetDatasetId: string): Promise<string>;
|
|
82
|
+
/**
 * Materializes a derived dataset from the resources held in the builder state.
 *
 * @param state - Builder state for the current dataset build.
 * @param targetDatasetId - Id of the dataset to produce.
 * @returns A promise for a string — presumably the materialized dataset id; TODO confirm.
 */
export declare function materializeDerivedDataset<Runtime extends AnyDatasetRuntime>(state: DatasetBuilderState<Runtime>, targetDatasetId: string): Promise<string>;
|
|
83
|
+
export {};
|
|
@@ -0,0 +1,548 @@
|
|
|
1
|
+
import { createFileParseContext } from "../file/file-dataset.agent.js";
|
|
2
|
+
import { readInstantFileStep } from "../file/steps.js";
|
|
3
|
+
import { createTransformDatasetContext } from "../transform/transform-dataset.agent.js";
|
|
4
|
+
import { datasetGetByIdStep, datasetInferAndUpdateSchemaStep, datasetPreviewRowsStep, datasetReadOneStep, } from "../dataset/steps.js";
|
|
5
|
+
import { getDatasetOutputPath, getDatasetScriptsDir, getDatasetResourcesDir, getDatasetStandardDirs, } from "../datasetFiles.js";
|
|
6
|
+
import { registerDatasetAgentMaterializers } from "./agentMaterializers.js";
|
|
7
|
+
import { buildFileDefaultInstructions, buildRawResourceInstructions, buildTransformInstructions, } from "./instructions.js";
|
|
8
|
+
import { createOrUpdateDatasetMetadata, materializeRowsToDataset, uploadInlineTextResource, } from "./persistence.js";
|
|
9
|
+
import { materializeQueryResource } from "./materializeQuery.js";
|
|
10
|
+
import { readDatasetSandboxTextFileStep, runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep, writeDatasetSandboxTextFilesStep, } from "../sandbox/steps.js";
|
|
11
|
+
/**
 * Derives the id of an intermediate dataset created while normalizing a resource.
 *
 * @param targetDatasetId - Id of the dataset being built.
 * @param resourceKind - Kind of the resource being normalized.
 * @param index - Position of the resource among the build inputs.
 * @returns `<targetDatasetId>__<resourceKind>_<index>`.
 */
function makeIntermediateDatasetId(targetDatasetId, resourceKind, index) {
    return "".concat(targetDatasetId, "__", resourceKind, "_", index);
}
|
|
14
|
+
/**
 * Coerces an arbitrary parsed JSON value into a list of row objects.
 *
 * - Arrays map element-wise: object-typed elements are kept as-is, everything
 *   else (primitives, `null`) is wrapped as `{ value }`.
 * - A single object becomes a one-row list.
 * - Any other value becomes `[{ value }]`.
 *
 * @param value - The parsed JSON value.
 * @returns The normalized rows.
 */
function normalizeParsedTextRows(value) {
    const isObjectLike = (candidate) => Boolean(candidate) && typeof candidate === "object";
    const toRow = (candidate) => (isObjectLike(candidate) ? candidate : { value: candidate });
    if (!Array.isArray(value)) {
        return [toRow(value)];
    }
    return value.map((entry) => toRow(entry));
}
|
|
22
|
+
/**
 * Turns an inline text resource into raw dataset rows.
 *
 * JSON-looking resources (mime type containing "json", or a `.json` / `.jsonl` /
 * `.ndjson` name) are parsed and normalized through `normalizeParsedTextRows`.
 * Anything else, and any JSON that cannot be parsed, becomes a single
 * `{ text }` row, so this function never throws on malformed content.
 *
 * Line-delimited JSON is recognized by a `.jsonl` or `.ndjson` name, or by a mime
 * type mentioning "jsonl" / "ndjson" (e.g. `application/x-ndjson`). When the
 * line-by-line parse fails, the text is retried as one JSON document so a
 * pretty-printed document mislabelled as JSON Lines is still parsed.
 *
 * @param resource - Text resource; reads `text`, `mimeType` and `name`.
 * @returns The raw rows for the resource.
 */
function materializeRawTextRows(resource) {
    const text = String(resource.text ?? "");
    const mimeType = String(resource.mimeType ?? "").toLowerCase();
    const name = String(resource.name ?? "").toLowerCase();
    const isLineDelimited = name.endsWith(".jsonl") ||
        name.endsWith(".ndjson") ||
        mimeType.includes("jsonl") ||
        mimeType.includes("ndjson");
    const shouldParseJson = isLineDelimited || mimeType.includes("json") || name.endsWith(".json");
    if (!shouldParseJson) {
        return [{ text }];
    }
    if (isLineDelimited) {
        try {
            return text
                .split(/\r?\n/g)
                .map((line) => line.trim())
                .filter(Boolean)
                .flatMap((line) => normalizeParsedTextRows(JSON.parse(line)));
        }
        catch {
            // Not valid JSON Lines; fall through and try it as one JSON document.
        }
    }
    try {
        return normalizeParsedTextRows(JSON.parse(text));
    }
    catch {
        // Best effort: keep unparseable content as a single raw text row.
        return [{ text }];
    }
}
|
|
45
|
+
/**
 * Extracts the file name from a `Content-Disposition` header value.
 *
 * Lookup order:
 * 1. The extended parameter `filename*=UTF-8'<lang>'<percent-encoded>`; the
 *    language tag may be empty or present (RFC 5987). If percent-decoding
 *    fails, the raw captured value is returned.
 * 2. The quoted parameter `filename="..."`.
 * 3. The unquoted parameter `filename=...` up to the next `;`. Stray surrounding
 *    double quotes are stripped, so `filename=""` yields an empty name rather
 *    than the two quote characters.
 *
 * @param value - Header value; nullish is treated as an empty string.
 * @returns The trimmed file name, or `""` when none is present.
 */
function parseContentDispositionFileName(value) {
    const text = String(value ?? "");
    const extendedMatch = /filename\*=UTF-8'[^']*'([^;]+)/i.exec(text);
    if (extendedMatch?.[1]) {
        try {
            return decodeURIComponent(extendedMatch[1]).trim();
        }
        catch {
            // Malformed percent-encoding: keep the raw value rather than failing.
            return extendedMatch[1].trim();
        }
    }
    const quotedMatch = /filename="([^"]+)"/i.exec(text);
    if (quotedMatch?.[1])
        return quotedMatch[1].trim();
    const plainMatch = /filename=([^;]+)/i.exec(text);
    if (plainMatch?.[1]) {
        // Only reached for unquoted or malformed values; drop leftover quotes.
        return plainMatch[1].trim().replace(/^"+|"+$/g, "").trim();
    }
    return "";
}
|
|
64
|
+
/**
 * Heuristically decides whether a content-disposition value refers to a PDF.
 *
 * @param value - Header value; nullish is treated as an empty string.
 * @returns `true` when the lower-cased value contains `application/pdf` or `.pdf`.
 */
function isPdfContentDisposition(value) {
    const haystack = String(value ?? "").toLowerCase();
    const pdfMarkers = ["application/pdf", ".pdf"];
    return pdfMarkers.some((marker) => haystack.includes(marker));
}
|
|
68
|
+
/**
 * Produces a file name that is safe to use as the last segment of a sandbox path.
 *
 * The candidate (or `fallback` when the candidate is blank) has runs of path and
 * shell-reserved characters, plus non-whitespace control characters, replaced by
 * `_`. Whitespace runs are collapsed to `_` and the result is capped at 120
 * characters.
 *
 * Names made only of dots (`.`, `..`, ...) are rejected in favour of `fallback`:
 * they would resolve to a directory instead of a file once appended to a
 * directory path, and the candidate may come from an untrusted header.
 *
 * @param value - Candidate file name; nullish or blank selects `fallback`.
 * @param fallback - Name used when the candidate is unusable.
 * @returns The sanitized file name.
 */
function sanitizeResourceFileName(value, fallback) {
    const name = String(value ?? "").trim() || fallback;
    const cleaned = name
        // Whitespace control characters are left to the next pass so whitespace runs still collapse to one "_".
        .replace(/[\\/:"*?<>|\x00-\x08\x0e-\x1f\x7f]+/g, "_")
        .replace(/\s+/g, "_")
        .slice(0, 120);
    if (!cleaned || /^\.+$/.test(cleaned)) {
        return fallback;
    }
    return cleaned;
}
|
|
73
|
+
/**
 * Sanitizes a file name and guarantees it carries a `.pdf` extension.
 *
 * @param value - Candidate file name.
 * @param fallback - Name used when the candidate is unusable.
 * @returns The sanitized name, with `.pdf` appended unless it already ends
 *   with it (case-insensitively).
 */
function sanitizePdfFileName(value, fallback) {
    const safeName = sanitizeResourceFileName(value, fallback);
    const hasPdfExtension = /\.pdf$/i.test(safeName);
    return hasPdfExtension ? safeName : safeName + ".pdf";
}
|
|
77
|
+
/**
 * Schema descriptor for rows produced by the raw PDF text extraction:
 * one object per page with `fileId`, `fileName`, `pageNumber` and `text`.
 *
 * @returns A freshly built schema descriptor (safe for callers to mutate).
 */
function pdfTextRowsSchema() {
    const stringField = () => ({ type: "string" });
    const properties = {
        fileId: stringField(),
        fileName: stringField(),
        pageNumber: { type: "number" },
        text: stringField(),
    };
    const schema = {
        type: "object",
        additionalProperties: false,
        required: ["fileId", "fileName", "pageNumber", "text"],
        properties,
    };
    return {
        title: "PdfTextPage",
        description: "Extracted PDF page text",
        schema,
    };
}
|
|
94
|
+
/**
 * Reads the `data` payloads out of a JSON Lines document.
 *
 * Blank lines are skipped. Each remaining line is parsed as JSON and its `data`
 * property is kept only when it is a non-array object; every other record is
 * dropped.
 *
 * @param content - JSON Lines text; nullish is treated as empty.
 * @returns The extracted row objects, in file order.
 * @throws SyntaxError when a non-blank line is not valid JSON.
 */
function parseJsonlDataRows(content) {
    const rows = [];
    for (const rawLine of String(content ?? "").split(/\r?\n/g)) {
        const line = rawLine.trim();
        if (!line) {
            continue;
        }
        const payload = JSON.parse(line)?.data;
        if (payload && typeof payload === "object" && !Array.isArray(payload)) {
            rows.push(payload);
        }
    }
    return rows;
}
|
|
103
|
+
/**
 * Fast path for raw PDF resources: extracts per-page text inside the sandbox
 * and stores it as dataset rows, without involving the parsing agent.
 *
 * Steps, in order:
 * 1. Read the stored file. If its content disposition does not look like a PDF,
 *    return `null` so the caller can fall back to the generic path.
 * 2. Create the dataset's standard directories and write the PDF into the
 *    resources directory.
 * 3. Install `pypdf` in the sandbox, write the extraction script and run it.
 *    The script writes one `{"type":"row","data":{...}}` line per non-empty page
 *    and, when no page has text, a single placeholder row with `pageNumber` 0.
 * 4. Read the output, parse the rows and materialize them with the fixed
 *    PDF page schema.
 *
 * @param state - Builder state; reads `runtime`, `sandboxId`, `title`,
 *   `instructions`, `contextId` and `first`.
 * @param resource - File resource; reads `fileId`.
 * @param targetDatasetId - Id of the dataset to write into.
 * @returns `targetDatasetId` on success, or `null` when the file is not a PDF.
 * @throws Error `dataset_sandbox_required` when the state has no sandbox id,
 *   `dataset_pdf_dependency_install_failed:*`, `dataset_pdf_text_extraction_failed:*`
 *   or `dataset_pdf_text_extraction_empty` when the matching phase fails.
 */
async function tryMaterializeRawPdfFileResource(state, resource, targetDatasetId) {
    const file = await readInstantFileStep({ runtime: state.runtime, fileId: resource.fileId });
    if (!isPdfContentDisposition(file.contentDisposition))
        return null;
    const sandboxId = resolveDatasetSandboxId(state, targetDatasetId);
    const outputPath = getDatasetOutputPath(targetDatasetId);
    const fileName = sanitizePdfFileName(parseContentDispositionFileName(file.contentDisposition), `${resource.fileId}.pdf`);
    const resourcePath = `${getDatasetResourcesDir(targetDatasetId)}/${fileName}`;
    const scriptPath = `${getDatasetScriptsDir(targetDatasetId)}/extract_pdf_text.py`;
    // Python is indentation-sensitive: nested lines must keep their 4-space steps.
    const extractionScript = [
        "from pathlib import Path",
        "import json",
        "import sys",
        "from pypdf import PdfReader",
        "",
        "resource_path = Path(sys.argv[1])",
        "output_path = Path(sys.argv[2])",
        "file_id = sys.argv[3]",
        "file_name = sys.argv[4]",
        "reader = PdfReader(str(resource_path))",
        "rows = 0",
        "with output_path.open('w', encoding='utf-8') as out:",
        "    for index, page in enumerate(reader.pages, start=1):",
        "        text = page.extract_text() or ''",
        "        text = text.replace('\\x00', '').strip()",
        "        if not text:",
        "            continue",
        "        data = {",
        "            'fileId': file_id,",
        "            'fileName': file_name,",
        "            'pageNumber': index,",
        "            'text': text,",
        "        }",
        "        out.write(json.dumps({'type': 'row', 'data': data}, ensure_ascii=False) + '\\n')",
        "        rows += 1",
        "    if rows == 0:",
        "        data = {'fileId': file_id, 'fileName': file_name, 'pageNumber': 0, 'text': ''}",
        "        out.write(json.dumps({'type': 'row', 'data': data}, ensure_ascii=False) + '\\n')",
        "        rows = 1",
        "print(f'extracted_pdf_pages={len(reader.pages)} rows={rows} output={output_path}')",
        "",
    ].join("\n");
    await runDatasetSandboxCommandStep({
        runtime: state.runtime,
        sandboxId,
        cmd: "mkdir",
        args: ["-p", ...getDatasetStandardDirs(targetDatasetId)],
    });
    await writeDatasetSandboxFilesStep({
        runtime: state.runtime,
        sandboxId,
        files: [{ path: resourcePath, contentBase64: file.contentBase64 }],
    });
    const install = await runDatasetSandboxCommandStep({
        runtime: state.runtime,
        sandboxId,
        cmd: "python",
        args: ["-m", "pip", "install", "pypdf", "--quiet"],
    });
    if (install.exitCode !== 0) {
        throw new Error(`dataset_pdf_dependency_install_failed:${install.stderr || install.stdout}`);
    }
    await writeDatasetSandboxTextFilesStep({
        runtime: state.runtime,
        sandboxId,
        files: [{ path: scriptPath, content: extractionScript }],
    });
    const extraction = await runDatasetSandboxCommandStep({
        runtime: state.runtime,
        sandboxId,
        cmd: "python",
        args: [scriptPath, resourcePath, outputPath, resource.fileId, fileName],
    });
    if (extraction.exitCode !== 0) {
        throw new Error(`dataset_pdf_text_extraction_failed:${extraction.stderr || extraction.stdout}`);
    }
    const output = await readDatasetSandboxTextFileStep({
        runtime: state.runtime,
        sandboxId,
        path: outputPath,
    });
    const rows = parseJsonlDataRows(output.content);
    if (rows.length === 0) {
        throw new Error("dataset_pdf_text_extraction_empty");
    }
    await materializeRowsToDataset(state.runtime, {
        datasetId: targetDatasetId,
        sandboxId,
        title: state.title ?? fileName,
        instructions: state.instructions,
        contextId: state.contextId ?? "",
        rows,
        schema: pdfTextRowsSchema(),
        first: state.first,
    });
    return targetDatasetId;
}
|
|
204
|
+
/**
 * Materializes an inline text resource directly into a dataset, without the agent.
 *
 * @param state - Builder state; reads `runtime`, `sandboxId`, `title`,
 *   `instructions`, `contextId`, `outputSchema` and `first`.
 * @param resource - Text resource to convert into rows.
 * @param targetDatasetId - Id of the dataset to write into.
 * @returns `targetDatasetId`.
 */
async function materializeRawTextResource(state, resource, targetDatasetId) {
    const parsedRows = materializeRawTextRows(resource);
    // Title precedence: explicit state title, then the resource name, then the dataset id.
    const datasetTitle = state.title ?? resource.name ?? targetDatasetId;
    const datasetOptions = {
        datasetId: targetDatasetId,
        sandboxId: state.sandboxId,
        title: datasetTitle,
        instructions: state.instructions,
        contextId: state.contextId ?? "",
        rows: parsedRows,
        schema: state.outputSchema,
        first: state.first,
    };
    await materializeRowsToDataset(state.runtime, datasetOptions);
    return targetDatasetId;
}
|
|
218
|
+
/**
 * Copies a stored file into the dataset's resources directory inside the sandbox.
 *
 * The target name is `params.filename` when it is not nullish, otherwise the name
 * found in the file's content disposition; either is sanitized, with
 * `<fileId>.bin` as the fallback.
 *
 * @param params - `runtime`, `sandboxId`, `datasetId`, `fileId` and optional `filename`.
 * @returns The sanitized `fileName` and the `resourcePath` written in the sandbox.
 */
async function writePreparedFileResourceToSandbox(params) {
    const { runtime, sandboxId, datasetId, fileId } = params;
    const storedFile = await readInstantFileStep({ runtime, fileId });
    const headerFileName = parseContentDispositionFileName(storedFile.contentDisposition);
    const fileName = sanitizeResourceFileName(params.filename ?? headerFileName, `${fileId}.bin`);
    const resourcePath = `${getDatasetResourcesDir(datasetId)}/${fileName}`;
    // Make sure the dataset's standard directories exist before writing into them.
    await runDatasetSandboxCommandStep({
        runtime,
        sandboxId,
        cmd: "mkdir",
        args: ["-p", ...getDatasetStandardDirs(datasetId)],
    });
    await writeDatasetSandboxFilesStep({
        runtime,
        sandboxId,
        files: [{ path: resourcePath, contentBase64: storedFile.contentBase64 }],
    });
    return { fileName, resourcePath };
}
|
|
236
|
+
/**
 * Returns the trimmed sandbox id from the builder state.
 *
 * @param state - Builder state; reads `sandboxId`.
 * @param _targetDatasetId - Unused.
 * @returns The non-empty, trimmed sandbox id.
 * @throws Error `dataset_sandbox_required` when the id is missing or blank.
 */
function resolveDatasetSandboxId(state, _targetDatasetId) {
    const resolved = String(state.sandboxId ?? "").trim();
    if (resolved.length === 0) {
        throw new Error("dataset_sandbox_required");
    }
    return resolved;
}
|
|
242
|
+
/**
 * Decides whether the dataset agent should run in durable mode.
 *
 * Durable mode is used only when the caller asked for it AND the code is not
 * already executing inside a workflow run. The check is done by loading the
 * `workflow` module and looking for an active `workflowRunId`; if that module or
 * its metadata is unavailable, the caller's request is honored.
 *
 * @param requestedDurable - Durable mode requested by the caller.
 * @returns `false` when not requested or when a workflow run is active, else `true`.
 */
export async function resolveDatasetAgentDurable(requestedDurable) {
    if (requestedDurable) {
        let insideWorkflowRun = false;
        try {
            const { getWorkflowMetadata } = await import("workflow");
            insideWorkflowRun = Boolean(getWorkflowMetadata?.()?.workflowRunId);
        }
        catch {
            // Outside Workflow runtime there is no active metadata, so honor the caller.
        }
        return !insideWorkflowRun;
    }
    return false;
}
|
|
256
|
+
/**
 * Step: creates or updates the dataset's metadata record and marks it as "building".
 *
 * @param params - `runtime`, `datasetId`, `sandboxId`, `contextId` and optional
 *   `title` (defaults to the dataset id), `instructions` and `schema`.
 * @returns The `datasetId` / `sandboxId` pair that was initialized.
 */
export async function initializeDatasetStep(params) {
    "use step";
    const { runtime, datasetId, sandboxId, title, instructions, contextId, schema } = params;
    await createOrUpdateDatasetMetadata(runtime, {
        datasetId,
        sandboxId,
        title: title ?? datasetId,
        instructions,
        contextId,
        schema,
        status: "building",
    });
    return { datasetId, sandboxId };
}
|
|
272
|
+
/**
 * Step: builds the prepared context for a dataset build.
 *
 * For `kind: "file"` the resource is either an already stored file (its `fileId`
 * is used as-is) or inline text, which is uploaded first to obtain a file id.
 * Any other `kind` produces a transform context over `inputDatasetIds`.
 * Both variants start with an uninitialized sandbox state and no previews.
 *
 * @param params - Union discriminated by `kind`; see the two branches below.
 * @returns The prepared file or transform context.
 */
export async function prepareDatasetResourcesStep(params) {
    "use step";
    const { datasetId, sandboxId } = params;
    if (params.kind !== "file") {
        return {
            kind: "transform",
            datasetId,
            sandboxId,
            inputDatasetIds: params.inputDatasetIds,
            outputSchema: params.outputSchema,
            sandboxState: { initialized: false, inputPaths: [] },
            inputPreviews: undefined,
        };
    }
    const { resource } = params;
    const isStoredFile = resource.kind === "file";
    const fileId = isStoredFile
        ? resource.fileId
        : await uploadInlineTextResource(params.runtime, datasetId, resource);
    return {
        kind: "file",
        datasetId,
        sandboxId,
        fileId,
        sandboxState: { initialized: false, filePath: "" },
        filePreview: undefined,
        schema: params.schema ?? null,
        // Stored files and inline text name these two fields differently.
        filename: isStoredFile ? resource.filename : resource.name,
        mediaType: isStoredFile ? resource.mediaType : resource.mimeType,
    };
}
|
|
300
|
+
/**
 * Step: adds the agent prompt and instructions to a prepared context.
 *
 * - File contexts use the caller's instructions, or the file default instructions
 *   (schema-aware) when none were given, with a fixed prompt.
 * - Transform contexts pass the caller's instructions through unchanged and get a
 *   prompt that mentions the number of input datasets when there is not exactly one.
 *
 * @param params - `prepared` context, optional `instructions` and `outputSchema`.
 * @returns The prepared context extended with `instructions` and `prompt`.
 */
export async function initializeDatasetContextStep(params) {
    "use step";
    const { prepared, instructions } = params;
    if (prepared.kind !== "file") {
        const inputCount = prepared.inputDatasetIds.length;
        const prompt = inputCount === 1
            ? "Transform the input dataset into a new dataset matching the provided output schema"
            : `Transform ${inputCount} input datasets into a new dataset matching the provided output schema`;
        return { ...prepared, instructions, prompt };
    }
    return {
        ...prepared,
        instructions: instructions ?? buildFileDefaultInstructions(params.outputSchema),
        prompt: "generate a dataset for this file",
    };
}
|
|
317
|
+
/**
 * Step: gathers the final view of a built dataset.
 *
 * Loads the dataset record. When neither the caller nor the record carries a
 * schema, one is inferred and stored, and the record is reloaded. Up to 20
 * preview rows are then fetched. The first row is read only when `params.first`
 * is truthy; otherwise `firstRow` is `undefined`.
 *
 * @param params - `runtime`, `datasetId`, optional `schema`, and the `first` flag.
 * @returns `{ datasetId, dataset, previewRows, firstRow }`.
 * @throws Error carrying the lookup's `error` when the dataset cannot be loaded.
 */
export async function completeDatasetStep(params) {
    "use step";
    const { runtime, datasetId } = params;
    const loadDatasetOrThrow = async () => {
        const lookup = await datasetGetByIdStep({ runtime, datasetId });
        if (!lookup.ok) {
            throw new Error(lookup.error);
        }
        return lookup;
    };
    let datasetResult = await loadDatasetOrThrow();
    const schemaMissing = !params.schema && !datasetResult.data?.schema;
    if (schemaMissing) {
        await datasetInferAndUpdateSchemaStep({
            runtime,
            datasetId,
            title: `${datasetId}Row`,
            description: "One dataset row",
        });
        // Reload so the returned record carries the freshly inferred schema.
        datasetResult = await loadDatasetOrThrow();
    }
    const previewResult = await datasetPreviewRowsStep({ runtime, datasetId, limit: 20 });
    const firstRow = params.first
        ? (await datasetReadOneStep({ runtime, datasetId })).row
        : undefined;
    return {
        datasetId,
        dataset: datasetResult.data,
        previewRows: previewResult.rows,
        firstRow,
    };
}
|
|
363
|
+
/**
 * Materializes one `file` or `text` resource into `targetDatasetId`.
 *
 * A stored file with no output schema first tries the raw PDF fast path. Otherwise
 * (or when that path declines) the agent path runs: initialize the dataset
 * metadata, prepare the resource, copy it into the sandbox, initialize the
 * context, then let the file parse context run.
 *
 * @param state - Builder state; reads `runtime`, `reactor`, `sandboxId`, `title`,
 *   `instructions`, `contextId`, `outputSchema` and `durable`.
 * @param resource - The `file` / `text` resource to materialize.
 * @param targetDatasetId - Id of the dataset to write into.
 * @returns The id of the materialized dataset.
 * @throws Error `dataset_sandbox_required`, `dataset_reactor_required` or
 *   `dataset_context_kind_mismatch:file` when a precondition does not hold.
 */
export async function materializeSingleFileLikeResource(state, resource, targetDatasetId) {
    const canTryRawPdf = resource.kind === "file" && !state.outputSchema;
    if (canTryRawPdf) {
        const pdfDatasetId = await tryMaterializeRawPdfFileResource(state, resource, targetDatasetId);
        if (pdfDatasetId) {
            return pdfDatasetId;
        }
    }
    const sandboxId = resolveDatasetSandboxId(state, targetDatasetId);
    if (!state.reactor) {
        throw new Error("dataset_reactor_required");
    }
    const { runtime } = state;
    await initializeDatasetStep({
        runtime,
        datasetId: targetDatasetId,
        sandboxId,
        title: state.title ?? targetDatasetId,
        instructions: state.instructions,
        contextId: state.contextId ?? "",
        schema: state.outputSchema,
    });
    const prepared = await prepareDatasetResourcesStep({
        kind: "file",
        runtime,
        datasetId: targetDatasetId,
        sandboxId,
        resource,
        schema: state.outputSchema,
    });
    if (prepared.kind !== "file") {
        throw new Error("dataset_context_kind_mismatch:file");
    }
    const sandboxFile = await writePreparedFileResourceToSandbox({
        runtime,
        sandboxId,
        datasetId: targetDatasetId,
        fileId: prepared.fileId,
        filename: prepared.filename,
    });
    const context = await initializeDatasetContextStep({
        // Fall back to the name actually written to the sandbox when none was declared.
        prepared: { ...prepared, filename: prepared.filename ?? sandboxFile.fileName },
        instructions: state.instructions,
        outputSchema: state.outputSchema,
    });
    if (context.kind !== "file") {
        throw new Error("dataset_context_kind_mismatch:file");
    }
    // Fields passed identically to the parse context options and its initial content.
    const sharedContextFields = {
        sandboxId: context.sandboxId,
        sandboxState: context.sandboxState,
        filePreview: context.filePreview,
        schema: context.schema,
        filename: context.filename,
        mediaType: context.mediaType,
    };
    const parseContext = createFileParseContext(context.fileId, {
        datasetId: context.datasetId,
        instructions: context.instructions,
        reactor: state.reactor,
        ...sharedContextFields,
    });
    const durable = await resolveDatasetAgentDurable(state.durable);
    await parseContext.parse(runtime, {
        durable,
        prompt: context.prompt,
        initialContent: {
            datasetId: context.datasetId,
            fileId: context.fileId,
            instructions: context.instructions ?? "",
            ...sharedContextFields,
        },
    });
    return targetDatasetId;
}
|
|
439
|
+
/**
 * Resolves one input resource to the id of a dataset that holds its rows.
 *
 * - `dataset` resources already are datasets: their own `datasetId` is returned
 *   and nothing is materialized.
 * - `query` and `text` resources, and any kind not listed here, are materialized
 *   into an intermediate dataset whose id comes from `makeIntermediateDatasetId`.
 * - `context` resources are rejected: they must be resolved before this point.
 *
 * @param state Builder state; `runtime`, `sandboxId`, `contextId` and `title` are read here.
 * @param resource The resource to normalize, discriminated by `resource.kind`.
 * @param targetDatasetId Id of the dataset being built; seeds the intermediate id.
 * @param resourceIndex Position of the resource, forwarded to `makeIntermediateDatasetId`.
 * @returns The id of the dataset that represents `resource`.
 * @throws Error `dataset_context_resource_must_be_resolved_before_materialization`
 *   when `resource.kind` is `"context"`.
 */
async function normalizeResourceToDatasetId(state, resource, targetDatasetId, resourceIndex) {
    if (resource.kind === "dataset") {
        return resource.datasetId;
    }
    // The intermediate id is computed up front for every non-dataset kind.
    const stagedDatasetId = makeIntermediateDatasetId(targetDatasetId, resource.kind, resourceIndex);
    switch (resource.kind) {
        case "query": {
            const queryParams = {
                datasetId: stagedDatasetId,
                sandboxId: state.sandboxId,
                title: resource.title,
                first: false,
                contextId: state.contextId ?? "",
            };
            await materializeQueryResource(state.runtime, resource, queryParams);
            break;
        }
        case "text": {
            // Raw text is materialized without an output schema; the resource's
            // own name takes precedence over the builder title.
            const textState = {
                ...state,
                outputSchema: undefined,
                first: false,
                instructions: buildRawResourceInstructions(resource.kind),
                title: resource.name ?? state.title,
            };
            await materializeRawTextResource(textState, resource, stagedDatasetId);
            break;
        }
        case "context":
            throw new Error("dataset_context_resource_must_be_resolved_before_materialization");
        default: {
            // Every remaining kind goes through the single-file-like path.
            const fileLikeState = {
                ...state,
                outputSchema: undefined,
                first: false,
                instructions: buildRawResourceInstructions(resource.kind),
            };
            await materializeSingleFileLikeResource(fileLikeState, resource, stagedDatasetId);
            break;
        }
    }
    return stagedDatasetId;
}
|
|
475
|
+
/**
 * Materializes `targetDatasetId` as a dataset derived from the builder's
 * context resources by running the transform context.
 *
 * Steps, in order: resolve the sandbox id, initialize the dataset record,
 * initialize a `transform` dataset context, then invoke the transform.
 *
 * @param state Builder state; `reactor` is required. `runtime`, `title`,
 *   `instructions`, `contextId`, `outputSchema`, `contextResources` and
 *   `durable` are also read.
 * @param targetDatasetId Id of the dataset to produce.
 * @returns `targetDatasetId`, once the transform call has resolved.
 * @throws Error `dataset_reactor_required` when `state.reactor` is falsy.
 * @throws Error `dataset_context_kind_mismatch:transform` when the initialized
 *   context is not of kind `"transform"`.
 */
export async function materializeDerivedDataset(state, targetDatasetId) {
    if (!state.reactor) {
        throw new Error("dataset_reactor_required");
    }
    const sandboxId = resolveDatasetSandboxId(state, targetDatasetId);
    const scoped = { ...state, sandboxId };
    // Each input is labelled by its dataset id, else its key, else a 1-based placeholder.
    const inputIds = (scoped.contextResources ?? []).map((entry, position) => {
        const label = entry.datasetId ?? entry.key ?? `resource_${position + 1}`;
        return String(label);
    });
    // Without a caller-supplied schema, fall back to an open object row schema.
    const rowSchema = scoped.outputSchema ?? {
        title: "DatasetRow",
        description: "One dataset row",
        schema: { type: "object", additionalProperties: true, properties: {} },
    };
    await initializeDatasetStep({
        runtime: scoped.runtime,
        datasetId: targetDatasetId,
        sandboxId,
        title: scoped.title ?? targetDatasetId,
        instructions: scoped.instructions,
        contextId: scoped.contextId ?? "",
        schema: rowSchema,
    });
    const ctx = await initializeDatasetContextStep({
        prepared: {
            kind: "transform",
            datasetId: targetDatasetId,
            sandboxId,
            inputDatasetIds: inputIds,
            outputSchema: rowSchema,
            sandboxState: { initialized: false, inputPaths: [] },
            inputPreviews: undefined,
        },
        // The instructions are built from the caller's original outputSchema
        // (possibly undefined), not from the fallback schema.
        instructions: buildTransformInstructions(inputIds.length, scoped.instructions, scoped.outputSchema),
        outputSchema: rowSchema,
    });
    if (ctx.kind !== "transform") {
        throw new Error("dataset_context_kind_mismatch:transform");
    }
    const agentContext = createTransformDatasetContext({
        inputDatasetIds: ctx.inputDatasetIds,
        outputSchema: ctx.outputSchema,
        instructions: ctx.instructions,
        datasetId: ctx.datasetId,
        reactor: scoped.reactor,
        sandboxId: ctx.sandboxId,
        sandboxState: ctx.sandboxState,
        inputPreviews: ctx.inputPreviews,
        contextResources: scoped.contextResources ?? [],
    });
    await agentContext.transform(scoped.runtime, {
        durable: await resolveDatasetAgentDurable(scoped.durable),
        prompt: ctx.prompt,
        initialContent: {
            datasetId: ctx.datasetId,
            inputDatasetIds: ctx.inputDatasetIds,
            outputSchema: ctx.outputSchema,
            instructions: ctx.instructions,
            sandboxId: ctx.sandboxId,
            sandboxState: ctx.sandboxState,
            inputPreviews: ctx.inputPreviews,
            contextResources: scoped.contextResources ?? [],
        },
    });
    return targetDatasetId;
}
|
|
545
|
+
// Module-load side effect: hand this module's two materializer entry points
// to the registry via registerDatasetAgentMaterializers.
registerDatasetAgentMaterializers({ materializeSingleFileLikeResource, materializeDerivedDataset });
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import type { AnyDatasetRuntime, DatasetBuilderState, DatasetSchemaInput, InternalDatasetResource } from "./types.js";
|
|
2
|
+
/**
 * Materializes a `query` resource into a dataset.
 *
 * @param runtime The runtime taken from the builder state.
 * @param resource The internal resource, narrowed to `kind: "query"`.
 * @param params Materialization options:
 *   - `datasetId` (required): id of the dataset to write to.
 *   - `contextId` (required): owning context id.
 *   - `sandboxId`, `schema`, `title`, `instructions`, `first`: optional.
 * @returns A promise resolving to a string — presumably the materialized
 *   dataset id; confirm against the implementation.
 */
export declare function materializeQueryResource<Runtime extends AnyDatasetRuntime>(
    runtime: DatasetBuilderState<Runtime>["runtime"],
    resource: Extract<InternalDatasetResource, { kind: "query" }>,
    params: {
        datasetId: string;
        sandboxId?: string;
        schema?: DatasetSchemaInput;
        title?: string;
        instructions?: string;
        first?: boolean;
        contextId: string;
    },
): Promise<string>;
|