@ekairos/dataset 1.22.39-beta.development.0 → 1.22.41-beta.development.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +347 -0
- package/dist/builder/instructions.d.ts +6 -0
- package/dist/builder/instructions.d.ts.map +1 -0
- package/dist/builder/instructions.js +46 -0
- package/dist/builder/instructions.js.map +1 -0
- package/dist/builder/materialize.d.ts +16 -0
- package/dist/builder/materialize.d.ts.map +1 -0
- package/dist/builder/materialize.js +205 -0
- package/dist/builder/materialize.js.map +1 -0
- package/dist/builder/persistence.d.ts +18 -0
- package/dist/builder/persistence.d.ts.map +1 -0
- package/dist/builder/persistence.js +147 -0
- package/dist/builder/persistence.js.map +1 -0
- package/dist/builder/schemaInference.d.ts +4 -0
- package/dist/builder/schemaInference.d.ts.map +1 -0
- package/dist/builder/schemaInference.js +69 -0
- package/dist/builder/schemaInference.js.map +1 -0
- package/dist/builder/sourceRows.d.ts +8 -0
- package/dist/builder/sourceRows.d.ts.map +1 -0
- package/dist/builder/sourceRows.js +62 -0
- package/dist/builder/sourceRows.js.map +1 -0
- package/dist/builder/types.d.ts +130 -0
- package/dist/builder/types.d.ts.map +1 -0
- package/dist/builder/types.js +3 -0
- package/dist/builder/types.js.map +1 -0
- package/dist/dataset.d.ts +3 -66
- package/dist/dataset.d.ts.map +1 -1
- package/dist/dataset.js +85 -499
- package/dist/dataset.js.map +1 -1
- package/dist/file/file-dataset.agent.d.ts.map +1 -1
- package/dist/file/file-dataset.agent.js +3 -1
- package/dist/file/file-dataset.agent.js.map +1 -1
- package/dist/materializeDataset.tool.d.ts +18 -5
- package/dist/materializeDataset.tool.d.ts.map +1 -1
- package/dist/materializeDataset.tool.js +14 -4
- package/dist/materializeDataset.tool.js.map +1 -1
- package/dist/schema.d.ts +1 -10
- package/dist/schema.d.ts.map +1 -1
- package/dist/schema.js +1 -12
- package/dist/schema.js.map +1 -1
- package/dist/transform/transform-dataset.agent.d.ts.map +1 -1
- package/dist/transform/transform-dataset.agent.js +3 -1
- package/dist/transform/transform-dataset.agent.js.map +1 -1
- package/package.json +10 -4
package/dist/dataset.js
CHANGED
|
@@ -1,493 +1,23 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
-
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
-
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
-
};
|
|
5
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
3
|
exports.dataset = dataset;
|
|
7
4
|
const admin_1 = require("@instantdb/admin");
|
|
8
|
-
const
|
|
9
|
-
const
|
|
10
|
-
const
|
|
11
|
-
|
|
12
|
-
const
|
|
13
|
-
const
|
|
14
|
-
const ajv = new ajv_1.default({ allErrors: true, strict: false });
|
|
15
|
-
function defaultTextSourceName(source) {
|
|
16
|
-
if (source.name?.trim())
|
|
17
|
-
return source.name.trim();
|
|
18
|
-
const mimeType = String(source.mimeType ?? "").toLowerCase();
|
|
19
|
-
if (mimeType.includes("csv"))
|
|
20
|
-
return "source.csv";
|
|
21
|
-
if (mimeType.includes("json"))
|
|
22
|
-
return "source.json";
|
|
23
|
-
if (mimeType.includes("yaml") || mimeType.includes("yml"))
|
|
24
|
-
return "source.yaml";
|
|
25
|
-
return "source.txt";
|
|
26
|
-
}
|
|
27
|
-
function inferJsonSchemaType(value) {
|
|
28
|
-
if (value === null)
|
|
29
|
-
return { type: "null" };
|
|
30
|
-
if (Array.isArray(value))
|
|
31
|
-
return { type: "array" };
|
|
32
|
-
switch (typeof value) {
|
|
33
|
-
case "number":
|
|
34
|
-
return { type: "number" };
|
|
35
|
-
case "boolean":
|
|
36
|
-
return { type: "boolean" };
|
|
37
|
-
case "object":
|
|
38
|
-
return { type: "object", additionalProperties: true };
|
|
39
|
-
default:
|
|
40
|
-
return { type: "string" };
|
|
41
|
-
}
|
|
42
|
-
}
|
|
43
|
-
function inferDatasetSchema(rows, title = "DatasetRow", description = "One dataset row") {
|
|
44
|
-
const properties = {};
|
|
45
|
-
const required = [];
|
|
46
|
-
const keys = new Set();
|
|
47
|
-
for (const row of rows) {
|
|
48
|
-
if (!row || typeof row !== "object")
|
|
49
|
-
continue;
|
|
50
|
-
for (const key of Object.keys(row)) {
|
|
51
|
-
keys.add(key);
|
|
52
|
-
}
|
|
53
|
-
}
|
|
54
|
-
for (const key of keys) {
|
|
55
|
-
const values = rows.map((row) => (row && typeof row === "object" ? row[key] : undefined));
|
|
56
|
-
const firstDefined = values.find((value) => value !== undefined);
|
|
57
|
-
properties[key] = {
|
|
58
|
-
...inferJsonSchemaType(firstDefined),
|
|
59
|
-
description: `${key} value`,
|
|
60
|
-
};
|
|
61
|
-
if (values.every((value) => value !== undefined)) {
|
|
62
|
-
required.push(key);
|
|
63
|
-
}
|
|
64
|
-
}
|
|
65
|
-
return {
|
|
66
|
-
title,
|
|
67
|
-
description,
|
|
68
|
-
schema: {
|
|
69
|
-
type: "object",
|
|
70
|
-
additionalProperties: false,
|
|
71
|
-
properties,
|
|
72
|
-
required,
|
|
73
|
-
},
|
|
74
|
-
};
|
|
75
|
-
}
|
|
76
|
-
function validateRows(rows, schema) {
|
|
77
|
-
const validator = ajv.compile(schema.schema);
|
|
78
|
-
for (const row of rows) {
|
|
79
|
-
const valid = validator(row);
|
|
80
|
-
if (!valid) {
|
|
81
|
-
const error = validator.errors?.map((entry) => entry.message || "validation_error").join("; ");
|
|
82
|
-
throw new Error(error || "dataset_schema_validation_failed");
|
|
83
|
-
}
|
|
84
|
-
}
|
|
85
|
-
}
|
|
86
|
-
function rowsToJsonl(rows) {
|
|
87
|
-
return rows
|
|
88
|
-
.map((row) => JSON.stringify({
|
|
89
|
-
type: "row",
|
|
90
|
-
data: row,
|
|
91
|
-
}))
|
|
92
|
-
.join("\n")
|
|
93
|
-
.concat(rows.length > 0 ? "\n" : "");
|
|
94
|
-
}
|
|
95
|
-
function normalizeQueryRows(result) {
|
|
96
|
-
if (!result || typeof result !== "object")
|
|
97
|
-
return [];
|
|
98
|
-
const entries = Object.entries(result);
|
|
99
|
-
if (entries.length === 0)
|
|
100
|
-
return [];
|
|
101
|
-
if (entries.length === 1) {
|
|
102
|
-
const [key, value] = entries[0];
|
|
103
|
-
if (Array.isArray(value)) {
|
|
104
|
-
return value.map((row) => (row && typeof row === "object" ? row : { value: row }));
|
|
105
|
-
}
|
|
106
|
-
if (value && typeof value === "object") {
|
|
107
|
-
return [value];
|
|
108
|
-
}
|
|
109
|
-
return [{ [key]: value }];
|
|
110
|
-
}
|
|
111
|
-
const rows = [];
|
|
112
|
-
for (const [key, value] of entries) {
|
|
113
|
-
if (Array.isArray(value)) {
|
|
114
|
-
for (const row of value) {
|
|
115
|
-
if (row && typeof row === "object") {
|
|
116
|
-
rows.push({ __entity: key, ...row });
|
|
117
|
-
}
|
|
118
|
-
else {
|
|
119
|
-
rows.push({ __entity: key, value: row });
|
|
120
|
-
}
|
|
121
|
-
}
|
|
122
|
-
continue;
|
|
123
|
-
}
|
|
124
|
-
if (value && typeof value === "object") {
|
|
125
|
-
rows.push({ __entity: key, ...value });
|
|
126
|
-
continue;
|
|
127
|
-
}
|
|
128
|
-
rows.push({ __entity: key, value });
|
|
129
|
-
}
|
|
130
|
-
return rows;
|
|
131
|
-
}
|
|
132
|
-
function getDomainDescriptor(domain) {
|
|
133
|
-
const meta = domain?.meta ?? {};
|
|
134
|
-
const context = typeof domain?.context === "function" ? domain.context() : {};
|
|
135
|
-
const name = String(meta?.name ?? context?.name ?? "domain");
|
|
136
|
-
const packageName = String(meta?.packageName ?? "");
|
|
137
|
-
return {
|
|
138
|
-
domainName: name,
|
|
139
|
-
...(packageName ? { domainPackageName: packageName } : {}),
|
|
140
|
-
};
|
|
141
|
-
}
|
|
142
|
-
function makeIntermediateDatasetId(targetDatasetId, sourceKind, index) {
|
|
143
|
-
return `${targetDatasetId}__${sourceKind}_${index}`;
|
|
144
|
-
}
|
|
145
|
-
function buildFileDefaultInstructions(schema) {
|
|
146
|
-
if (schema) {
|
|
147
|
-
return "Create a dataset from the source file and ensure each output row matches the provided dataset schema exactly.";
|
|
148
|
-
}
|
|
149
|
-
return "Create a dataset representing the source content as structured rows.";
|
|
150
|
-
}
|
|
151
|
-
function buildRawSourceInstructions(sourceKind) {
|
|
152
|
-
if (sourceKind === "text") {
|
|
153
|
-
return "Create a dataset representing the raw text content as structured rows without applying business transformations.";
|
|
154
|
-
}
|
|
155
|
-
return "Create a dataset representing the raw file content as structured rows without applying business transformations.";
|
|
156
|
-
}
|
|
157
|
-
function buildTransformInstructions(sourceCount, userInstructions, schema) {
|
|
158
|
-
const explicit = String(userInstructions ?? "").trim();
|
|
159
|
-
if (explicit)
|
|
160
|
-
return explicit;
|
|
161
|
-
if (sourceCount > 1) {
|
|
162
|
-
if (schema) {
|
|
163
|
-
return "Combine the source datasets into a new dataset that matches the provided output schema exactly.";
|
|
164
|
-
}
|
|
165
|
-
return "Combine the source datasets into one coherent dataset.";
|
|
166
|
-
}
|
|
167
|
-
if (schema) {
|
|
168
|
-
return "Transform the source dataset into a new dataset that matches the provided output schema exactly.";
|
|
169
|
-
}
|
|
170
|
-
return "Transform the source dataset into a new useful dataset.";
|
|
171
|
-
}
|
|
172
|
-
async function getDatasetDb(env) {
|
|
173
|
-
const runtime = (await (0, runtime_2.getContextRuntime)(env));
|
|
174
|
-
return runtime.db;
|
|
175
|
-
}
|
|
176
|
-
async function createOrUpdateDatasetMetadata(env, params) {
|
|
177
|
-
const db = await getDatasetDb(env);
|
|
178
|
-
const service = new service_1.DatasetService(db);
|
|
179
|
-
const result = await service.createDataset({
|
|
180
|
-
id: params.datasetId,
|
|
181
|
-
sandboxId: params.sandboxId,
|
|
182
|
-
title: params.title ?? params.datasetId,
|
|
183
|
-
instructions: params.instructions ?? "",
|
|
184
|
-
sources: params.sources,
|
|
185
|
-
sourceKinds: params.sourceKinds,
|
|
186
|
-
analysis: params.analysis,
|
|
187
|
-
schema: params.schema,
|
|
188
|
-
status: params.status ?? "building",
|
|
189
|
-
organizationId: env.orgId,
|
|
190
|
-
});
|
|
191
|
-
if (!result.ok) {
|
|
192
|
-
throw new Error(result.error);
|
|
193
|
-
}
|
|
194
|
-
}
|
|
195
|
-
async function materializeRowsToDataset(env, params) {
|
|
196
|
-
if (params.first && params.rows.length > 1) {
|
|
197
|
-
throw new Error("dataset_first_expected_zero_or_one_row");
|
|
198
|
-
}
|
|
199
|
-
const resolvedSchema = params.schema ??
|
|
200
|
-
inferDatasetSchema(params.rows, params.title ? `${params.title}Row` : "DatasetRow", params.title ? `One row for ${params.title}` : "One dataset row");
|
|
201
|
-
validateRows(params.rows, resolvedSchema);
|
|
202
|
-
await createOrUpdateDatasetMetadata(env, {
|
|
203
|
-
datasetId: params.datasetId,
|
|
204
|
-
sandboxId: params.sandboxId,
|
|
205
|
-
title: params.title,
|
|
206
|
-
instructions: params.instructions,
|
|
207
|
-
sources: params.sources,
|
|
208
|
-
sourceKinds: params.sourceKinds,
|
|
209
|
-
analysis: params.analysis,
|
|
210
|
-
schema: resolvedSchema,
|
|
211
|
-
status: "building",
|
|
212
|
-
});
|
|
213
|
-
const db = await getDatasetDb(env);
|
|
214
|
-
const service = new service_1.DatasetService(db);
|
|
215
|
-
const uploadResult = await service.uploadDatasetOutputFile({
|
|
216
|
-
datasetId: params.datasetId,
|
|
217
|
-
fileBuffer: Buffer.from(rowsToJsonl(params.rows), "utf-8"),
|
|
218
|
-
});
|
|
219
|
-
if (!uploadResult.ok) {
|
|
220
|
-
throw new Error(uploadResult.error);
|
|
221
|
-
}
|
|
222
|
-
const statusResult = await service.updateDatasetStatus({
|
|
223
|
-
datasetId: params.datasetId,
|
|
224
|
-
status: "completed",
|
|
225
|
-
calculatedTotalRows: params.rows.length,
|
|
226
|
-
actualGeneratedRowCount: params.rows.length,
|
|
227
|
-
});
|
|
228
|
-
if (!statusResult.ok) {
|
|
229
|
-
throw new Error(statusResult.error);
|
|
230
|
-
}
|
|
231
|
-
return params.datasetId;
|
|
232
|
-
}
|
|
233
|
-
async function uploadInlineTextSource(env, datasetId, source) {
|
|
234
|
-
const db = await getDatasetDb(env);
|
|
235
|
-
const fileName = defaultTextSourceName(source);
|
|
236
|
-
const storagePath = `/dataset/source/${datasetId}/${Date.now()}-${fileName}`;
|
|
237
|
-
const uploadResult = await db.storage.uploadFile(storagePath, Buffer.from(source.text, "utf-8"), {
|
|
238
|
-
contentType: source.mimeType ?? "text/plain",
|
|
239
|
-
contentDisposition: fileName,
|
|
240
|
-
});
|
|
241
|
-
const fileId = uploadResult?.data?.id;
|
|
242
|
-
if (!fileId) {
|
|
243
|
-
throw new Error("dataset_text_source_upload_failed");
|
|
244
|
-
}
|
|
245
|
-
return fileId;
|
|
246
|
-
}
|
|
247
|
-
async function finalizeBuildResult(env, datasetId, withFirst) {
|
|
248
|
-
const db = await getDatasetDb(env);
|
|
249
|
-
const service = new service_1.DatasetService(db);
|
|
250
|
-
const datasetResult = await service.getDatasetById(datasetId);
|
|
251
|
-
if (!datasetResult.ok) {
|
|
252
|
-
throw new Error(datasetResult.error);
|
|
253
|
-
}
|
|
254
|
-
const previewResult = await service.previewRows(datasetId, 20);
|
|
255
|
-
if (!previewResult.ok) {
|
|
256
|
-
throw new Error(previewResult.error);
|
|
257
|
-
}
|
|
258
|
-
const reader = {
|
|
259
|
-
async read(cursorOrParams, limit) {
|
|
260
|
-
const params = typeof cursorOrParams === "object" && cursorOrParams !== null
|
|
261
|
-
? cursorOrParams
|
|
262
|
-
: { cursor: cursorOrParams, limit };
|
|
263
|
-
const rowsResult = await service.readRows({
|
|
264
|
-
datasetId,
|
|
265
|
-
cursor: params.cursor,
|
|
266
|
-
limit: params.limit,
|
|
267
|
-
});
|
|
268
|
-
if (!rowsResult.ok) {
|
|
269
|
-
throw new Error(rowsResult.error);
|
|
270
|
-
}
|
|
271
|
-
return rowsResult.data;
|
|
272
|
-
},
|
|
273
|
-
};
|
|
274
|
-
if (!withFirst) {
|
|
275
|
-
return {
|
|
276
|
-
datasetId,
|
|
277
|
-
dataset: datasetResult.data,
|
|
278
|
-
previewRows: previewResult.data,
|
|
279
|
-
reader,
|
|
280
|
-
};
|
|
281
|
-
}
|
|
282
|
-
const firstResult = await service.readOne(datasetId);
|
|
283
|
-
if (!firstResult.ok) {
|
|
284
|
-
throw new Error(firstResult.error);
|
|
285
|
-
}
|
|
286
|
-
return {
|
|
287
|
-
datasetId,
|
|
288
|
-
dataset: datasetResult.data,
|
|
289
|
-
previewRows: previewResult.data,
|
|
290
|
-
reader,
|
|
291
|
-
firstRow: firstResult.data,
|
|
292
|
-
};
|
|
293
|
-
}
|
|
294
|
-
async function materializeQuerySource(env, source, params) {
|
|
295
|
-
const runtime = await (0, runtime_1.resolveRuntime)(source.domain, env);
|
|
296
|
-
const result = await runtime.db.query(source.query);
|
|
297
|
-
const rows = normalizeQueryRows(result);
|
|
298
|
-
const domainDescriptor = getDomainDescriptor(source.domain);
|
|
299
|
-
return await materializeRowsToDataset(env, {
|
|
300
|
-
datasetId: params.datasetId,
|
|
301
|
-
sandboxId: params.sandboxId,
|
|
302
|
-
title: params.title ?? source.title,
|
|
303
|
-
instructions: params.instructions,
|
|
304
|
-
sources: [
|
|
305
|
-
{
|
|
306
|
-
kind: "query",
|
|
307
|
-
query: source.query,
|
|
308
|
-
title: source.title,
|
|
309
|
-
explanation: source.explanation,
|
|
310
|
-
...domainDescriptor,
|
|
311
|
-
},
|
|
312
|
-
],
|
|
313
|
-
sourceKinds: ["query"],
|
|
314
|
-
analysis: {
|
|
315
|
-
query: source.query,
|
|
316
|
-
explanation: source.explanation,
|
|
317
|
-
...domainDescriptor,
|
|
318
|
-
},
|
|
319
|
-
rows,
|
|
320
|
-
schema: params.schema,
|
|
321
|
-
inferSchema: !params.schema,
|
|
322
|
-
first: params.first,
|
|
323
|
-
});
|
|
324
|
-
}
|
|
325
|
-
async function materializeSingleFileLikeSource(state, source, targetDatasetId) {
|
|
326
|
-
if (!state.reactor) {
|
|
327
|
-
throw new Error("dataset_reactor_required");
|
|
328
|
-
}
|
|
329
|
-
if (!state.sandboxId) {
|
|
330
|
-
throw new Error("dataset_sandbox_required");
|
|
331
|
-
}
|
|
332
|
-
const fileId = source.kind === "file"
|
|
333
|
-
? source.fileId
|
|
334
|
-
: await uploadInlineTextSource(state.env, targetDatasetId, source);
|
|
335
|
-
await createOrUpdateDatasetMetadata(state.env, {
|
|
336
|
-
datasetId: targetDatasetId,
|
|
337
|
-
sandboxId: state.sandboxId,
|
|
338
|
-
title: state.title ?? targetDatasetId,
|
|
339
|
-
instructions: state.instructions,
|
|
340
|
-
sources: [
|
|
341
|
-
source.kind === "file"
|
|
342
|
-
? { kind: "file", fileId: source.fileId, description: source.description }
|
|
343
|
-
: {
|
|
344
|
-
kind: "text",
|
|
345
|
-
mimeType: source.mimeType,
|
|
346
|
-
name: source.name,
|
|
347
|
-
description: source.description,
|
|
348
|
-
},
|
|
349
|
-
],
|
|
350
|
-
sourceKinds: [source.kind],
|
|
351
|
-
schema: state.outputSchema,
|
|
352
|
-
status: "building",
|
|
353
|
-
});
|
|
354
|
-
const parseStory = (0, file_dataset_agent_1.createFileParseStory)(fileId, {
|
|
355
|
-
datasetId: targetDatasetId,
|
|
356
|
-
instructions: state.instructions ?? buildFileDefaultInstructions(state.outputSchema),
|
|
357
|
-
reactor: state.reactor,
|
|
358
|
-
sandboxId: state.sandboxId,
|
|
359
|
-
});
|
|
360
|
-
await parseStory.parse(state.env);
|
|
361
|
-
if (!state.outputSchema) {
|
|
362
|
-
const db = await getDatasetDb(state.env);
|
|
363
|
-
const service = new service_1.DatasetService(db);
|
|
364
|
-
const readResult = await service.readRows({ datasetId: targetDatasetId, cursor: 0, limit: 1000 });
|
|
365
|
-
if (!readResult.ok) {
|
|
366
|
-
throw new Error(readResult.error);
|
|
367
|
-
}
|
|
368
|
-
const inferred = inferDatasetSchema(readResult.data.rows, `${targetDatasetId}Row`, "One dataset row");
|
|
369
|
-
const updateResult = await service.updateDatasetSchema({
|
|
370
|
-
datasetId: targetDatasetId,
|
|
371
|
-
schema: inferred,
|
|
372
|
-
status: "completed",
|
|
373
|
-
});
|
|
374
|
-
if (!updateResult.ok) {
|
|
375
|
-
throw new Error(updateResult.error);
|
|
376
|
-
}
|
|
377
|
-
}
|
|
378
|
-
if (state.first) {
|
|
379
|
-
const db = await getDatasetDb(state.env);
|
|
380
|
-
const service = new service_1.DatasetService(db);
|
|
381
|
-
const firstResult = await service.readOne(targetDatasetId);
|
|
382
|
-
if (!firstResult.ok) {
|
|
383
|
-
throw new Error(firstResult.error);
|
|
384
|
-
}
|
|
385
|
-
}
|
|
386
|
-
return targetDatasetId;
|
|
387
|
-
}
|
|
388
|
-
async function normalizeSourceToDatasetId(state, source, targetDatasetId, sourceIndex) {
|
|
389
|
-
if (source.kind === "dataset") {
|
|
390
|
-
return source.datasetId;
|
|
391
|
-
}
|
|
392
|
-
const intermediateDatasetId = makeIntermediateDatasetId(targetDatasetId, source.kind, sourceIndex);
|
|
393
|
-
if (source.kind === "query") {
|
|
394
|
-
await materializeQuerySource(state.env, source, {
|
|
395
|
-
datasetId: intermediateDatasetId,
|
|
396
|
-
sandboxId: state.sandboxId,
|
|
397
|
-
title: source.title,
|
|
398
|
-
first: false,
|
|
399
|
-
});
|
|
400
|
-
return intermediateDatasetId;
|
|
401
|
-
}
|
|
402
|
-
await materializeSingleFileLikeSource({
|
|
403
|
-
...state,
|
|
404
|
-
outputSchema: undefined,
|
|
405
|
-
first: false,
|
|
406
|
-
instructions: buildRawSourceInstructions(source.kind),
|
|
407
|
-
}, source, intermediateDatasetId);
|
|
408
|
-
return intermediateDatasetId;
|
|
409
|
-
}
|
|
410
|
-
async function materializeDerivedDataset(state, targetDatasetId) {
|
|
411
|
-
if (!state.reactor) {
|
|
412
|
-
throw new Error("dataset_reactor_required");
|
|
413
|
-
}
|
|
414
|
-
if (!state.sandboxId) {
|
|
415
|
-
throw new Error("dataset_sandbox_required");
|
|
416
|
-
}
|
|
417
|
-
const normalizedSources = [];
|
|
418
|
-
for (let index = 0; index < state.sources.length; index++) {
|
|
419
|
-
normalizedSources.push(await normalizeSourceToDatasetId(state, state.sources[index], targetDatasetId, index));
|
|
420
|
-
}
|
|
421
|
-
const transformSchema = state.outputSchema ??
|
|
422
|
-
{
|
|
423
|
-
title: "DatasetRow",
|
|
424
|
-
description: "One dataset row",
|
|
425
|
-
schema: {
|
|
426
|
-
type: "object",
|
|
427
|
-
additionalProperties: true,
|
|
428
|
-
properties: {},
|
|
429
|
-
},
|
|
430
|
-
};
|
|
431
|
-
await createOrUpdateDatasetMetadata(state.env, {
|
|
432
|
-
datasetId: targetDatasetId,
|
|
433
|
-
sandboxId: state.sandboxId,
|
|
434
|
-
title: state.title ?? targetDatasetId,
|
|
435
|
-
instructions: state.instructions,
|
|
436
|
-
sources: state.sources.map((source) => source.kind === "query"
|
|
437
|
-
? {
|
|
438
|
-
kind: "query",
|
|
439
|
-
query: source.query,
|
|
440
|
-
title: source.title,
|
|
441
|
-
explanation: source.explanation,
|
|
442
|
-
...getDomainDescriptor(source.domain),
|
|
443
|
-
}
|
|
444
|
-
: source),
|
|
445
|
-
sourceKinds: state.sources.map((source) => source.kind),
|
|
446
|
-
schema: transformSchema,
|
|
447
|
-
status: "building",
|
|
448
|
-
});
|
|
449
|
-
const transformStory = (0, transform_dataset_agent_1.createTransformDatasetStory)({
|
|
450
|
-
sourceDatasetIds: normalizedSources,
|
|
451
|
-
outputSchema: transformSchema,
|
|
452
|
-
instructions: buildTransformInstructions(normalizedSources.length, state.instructions, state.outputSchema),
|
|
453
|
-
datasetId: targetDatasetId,
|
|
454
|
-
reactor: state.reactor,
|
|
455
|
-
sandboxId: state.sandboxId,
|
|
456
|
-
});
|
|
457
|
-
await transformStory.transform(state.env);
|
|
458
|
-
const db = await getDatasetDb(state.env);
|
|
459
|
-
const service = new service_1.DatasetService(db);
|
|
460
|
-
if (!state.outputSchema) {
|
|
461
|
-
const readResult = await service.readRows({ datasetId: targetDatasetId, cursor: 0, limit: 1000 });
|
|
462
|
-
if (!readResult.ok) {
|
|
463
|
-
throw new Error(readResult.error);
|
|
464
|
-
}
|
|
465
|
-
const inferred = inferDatasetSchema(readResult.data.rows, `${targetDatasetId}Row`, "One dataset row");
|
|
466
|
-
const updateResult = await service.updateDatasetSchema({
|
|
467
|
-
datasetId: targetDatasetId,
|
|
468
|
-
schema: inferred,
|
|
469
|
-
status: "completed",
|
|
470
|
-
});
|
|
471
|
-
if (!updateResult.ok) {
|
|
472
|
-
throw new Error(updateResult.error);
|
|
473
|
-
}
|
|
474
|
-
}
|
|
475
|
-
if (state.first) {
|
|
476
|
-
const firstResult = await service.readOne(targetDatasetId);
|
|
477
|
-
if (!firstResult.ok) {
|
|
478
|
-
throw new Error(firstResult.error);
|
|
479
|
-
}
|
|
480
|
-
}
|
|
481
|
-
return targetDatasetId;
|
|
482
|
-
}
|
|
483
|
-
function dataset(env) {
|
|
5
|
+
const instructions_1 = require("./builder/instructions");
|
|
6
|
+
const materialize_1 = require("./builder/materialize");
|
|
7
|
+
const persistence_1 = require("./builder/persistence");
|
|
8
|
+
function dataset(runtime, options = {}) {
|
|
9
|
+
const datasetId = normalizeDatasetId(options.datasetId);
|
|
10
|
+
const typedRuntime = runtime;
|
|
484
11
|
const state = {
|
|
485
|
-
|
|
12
|
+
runtime: typedRuntime,
|
|
13
|
+
env: typedRuntime.env,
|
|
486
14
|
sources: [],
|
|
15
|
+
output: "rows",
|
|
487
16
|
inferSchema: false,
|
|
488
17
|
first: false,
|
|
489
18
|
};
|
|
490
19
|
const api = {
|
|
20
|
+
datasetId,
|
|
491
21
|
fromFile(source) {
|
|
492
22
|
state.sources.push({ kind: "file", ...source });
|
|
493
23
|
return api;
|
|
@@ -500,6 +30,24 @@ function dataset(env) {
|
|
|
500
30
|
state.sources.push({ kind: "dataset", ...source });
|
|
501
31
|
return api;
|
|
502
32
|
},
|
|
33
|
+
from(...sources) {
|
|
34
|
+
for (const source of sources) {
|
|
35
|
+
if ("kind" in source) {
|
|
36
|
+
state.sources.push(source);
|
|
37
|
+
continue;
|
|
38
|
+
}
|
|
39
|
+
if ("fileId" in source) {
|
|
40
|
+
state.sources.push({ kind: "file", ...source });
|
|
41
|
+
continue;
|
|
42
|
+
}
|
|
43
|
+
if ("datasetId" in source) {
|
|
44
|
+
state.sources.push({ kind: "dataset", ...source });
|
|
45
|
+
continue;
|
|
46
|
+
}
|
|
47
|
+
state.sources.push({ kind: "text", ...source });
|
|
48
|
+
}
|
|
49
|
+
return api;
|
|
50
|
+
},
|
|
503
51
|
fromQuery(domain, source) {
|
|
504
52
|
state.sources.push({ kind: "query", domain, ...source });
|
|
505
53
|
return api;
|
|
@@ -522,6 +70,20 @@ function dataset(env) {
|
|
|
522
70
|
state.inferSchema = true;
|
|
523
71
|
return api;
|
|
524
72
|
},
|
|
73
|
+
auto() {
|
|
74
|
+
state.outputSchema = undefined;
|
|
75
|
+
state.inferSchema = true;
|
|
76
|
+
return api;
|
|
77
|
+
},
|
|
78
|
+
asRows() {
|
|
79
|
+
state.output = "rows";
|
|
80
|
+
return api;
|
|
81
|
+
},
|
|
82
|
+
asObject() {
|
|
83
|
+
state.output = "object";
|
|
84
|
+
state.first = true;
|
|
85
|
+
return api;
|
|
86
|
+
},
|
|
525
87
|
instructions(instructions) {
|
|
526
88
|
state.instructions = instructions;
|
|
527
89
|
return api;
|
|
@@ -538,41 +100,65 @@ function dataset(env) {
|
|
|
538
100
|
if (state.sources.length === 0) {
|
|
539
101
|
throw new Error("dataset_sources_required");
|
|
540
102
|
}
|
|
541
|
-
const targetDatasetId =
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
const
|
|
103
|
+
const targetDatasetId = options?.datasetId
|
|
104
|
+
? normalizeDatasetId(options.datasetId)
|
|
105
|
+
: datasetId;
|
|
106
|
+
const effectiveState = state.output === "object"
|
|
107
|
+
? {
|
|
108
|
+
...state,
|
|
109
|
+
first: true,
|
|
110
|
+
instructions: (0, instructions_1.buildObjectOutputInstructions)(state.instructions),
|
|
111
|
+
}
|
|
112
|
+
: state;
|
|
113
|
+
const onlySource = effectiveState.sources[0];
|
|
114
|
+
const isSingleSource = effectiveState.sources.length === 1;
|
|
115
|
+
const hasInstructions = Boolean(String(effectiveState.instructions ?? "").trim());
|
|
545
116
|
if (isSingleSource && onlySource.kind === "query" && !hasInstructions) {
|
|
546
|
-
await materializeQuerySource(
|
|
117
|
+
await (0, materialize_1.materializeQuerySource)(effectiveState.runtime, onlySource, {
|
|
547
118
|
datasetId: targetDatasetId,
|
|
548
|
-
sandboxId:
|
|
549
|
-
schema:
|
|
550
|
-
title:
|
|
551
|
-
instructions:
|
|
552
|
-
first:
|
|
119
|
+
sandboxId: effectiveState.sandboxId,
|
|
120
|
+
schema: effectiveState.outputSchema,
|
|
121
|
+
title: effectiveState.title ?? onlySource.title,
|
|
122
|
+
instructions: effectiveState.instructions,
|
|
123
|
+
first: effectiveState.first,
|
|
553
124
|
});
|
|
554
|
-
return await finalizeBuildResult(
|
|
125
|
+
return finalizeOutputResult(await (0, persistence_1.finalizeBuildResult)(effectiveState.runtime, targetDatasetId, effectiveState.first), effectiveState.output);
|
|
555
126
|
}
|
|
556
127
|
if (isSingleSource && (onlySource.kind === "file" || onlySource.kind === "text")) {
|
|
557
|
-
if (!
|
|
128
|
+
if (!effectiveState.sandboxId) {
|
|
558
129
|
throw new Error("dataset_sandbox_required");
|
|
559
130
|
}
|
|
560
|
-
if (!
|
|
131
|
+
if (!effectiveState.reactor) {
|
|
561
132
|
throw new Error("dataset_reactor_required");
|
|
562
133
|
}
|
|
563
|
-
await materializeSingleFileLikeSource(
|
|
564
|
-
return await finalizeBuildResult(
|
|
134
|
+
await (0, materialize_1.materializeSingleFileLikeSource)(effectiveState, onlySource, targetDatasetId);
|
|
135
|
+
return finalizeOutputResult(await (0, persistence_1.finalizeBuildResult)(effectiveState.runtime, targetDatasetId, effectiveState.first), effectiveState.output);
|
|
565
136
|
}
|
|
566
|
-
if (!
|
|
137
|
+
if (!effectiveState.sandboxId) {
|
|
567
138
|
throw new Error("dataset_sandbox_required");
|
|
568
139
|
}
|
|
569
|
-
if (!
|
|
140
|
+
if (!effectiveState.reactor) {
|
|
570
141
|
throw new Error("dataset_reactor_required");
|
|
571
142
|
}
|
|
572
|
-
await materializeDerivedDataset(
|
|
573
|
-
return await finalizeBuildResult(
|
|
143
|
+
await (0, materialize_1.materializeDerivedDataset)(effectiveState, targetDatasetId);
|
|
144
|
+
return finalizeOutputResult(await (0, persistence_1.finalizeBuildResult)(effectiveState.runtime, targetDatasetId, effectiveState.first), effectiveState.output);
|
|
574
145
|
},
|
|
575
146
|
};
|
|
576
147
|
return api;
|
|
577
148
|
}
|
|
149
|
+
function normalizeDatasetId(datasetId) {
|
|
150
|
+
const normalized = String(datasetId ?? (0, admin_1.id)()).trim();
|
|
151
|
+
if (!normalized) {
|
|
152
|
+
throw new Error("dataset_id_required");
|
|
153
|
+
}
|
|
154
|
+
return normalized;
|
|
155
|
+
}
|
|
156
|
+
function finalizeOutputResult(result, output) {
|
|
157
|
+
if (output !== "object")
|
|
158
|
+
return result;
|
|
159
|
+
return {
|
|
160
|
+
...result,
|
|
161
|
+
object: result.firstRow ?? null,
|
|
162
|
+
};
|
|
163
|
+
}
|
|
578
164
|
//# sourceMappingURL=dataset.js.map
|