@ekairos/dataset 1.22.79-beta.development.0 → 1.22.80-beta.development.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/builder/materialize.d.ts +77 -1
- package/dist/builder/materialize.js +212 -60
- package/dist/builder/persistence.d.ts +6 -0
- package/dist/builder/persistence.js +22 -0
- package/dist/completeDataset.steps.d.ts +87 -0
- package/dist/completeDataset.steps.js +449 -0
- package/dist/completeDataset.tool.d.ts +53 -2
- package/dist/completeDataset.tool.js +4 -262
- package/dist/dataset/steps.d.ts +1 -0
- package/dist/dataset/steps.js +12 -12
- package/dist/dataset.js +16 -4
- package/dist/datasetFiles.d.ts +5 -0
- package/dist/datasetFiles.js +21 -0
- package/dist/executeCommand.tool.js +2 -3
- package/dist/file/file-dataset.agent.d.ts +4 -1
- package/dist/file/file-dataset.agent.js +30 -18
- package/dist/file/file-dataset.steps.js +3 -3
- package/dist/file/file-dataset.types.d.ts +4 -0
- package/dist/file/prompts.js +108 -4
- package/dist/transform/filepreview.js +2 -3
- package/dist/transform/transform-dataset.agent.d.ts +6 -1
- package/dist/transform/transform-dataset.agent.js +30 -15
- package/dist/transform/transform-dataset.steps.js +3 -4
- package/dist/transform/transform-dataset.types.d.ts +6 -0
- package/package.json +4 -4
|
@@ -0,0 +1,449 @@
|
|
|
1
|
+
import Ajv from "ajv";
|
|
2
|
+
import { getDatasetOutputPath } from "./datasetFiles.js";
|
|
3
|
+
import { DatasetService } from "./service.js";
|
|
4
|
+
import { getDatasetRuntimeDb } from "./dataset/steps.js";
|
|
5
|
+
import { readDatasetSandboxFileStep, readDatasetSandboxTextFileStep, runDatasetSandboxCommandStep, } from "./sandbox/steps.js";
|
|
6
|
+
let ajvInstance = null;
|
|
7
|
+
/**
 * Returns the module-wide Ajv instance, constructing it on first use.
 *
 * The instance is created with `allErrors: true` and `strict: false` and is
 * cached in the module-level `ajvInstance` binding so later calls reuse it.
 *
 * @returns {Ajv} The shared Ajv instance.
 */
function getAjv() {
    if (ajvInstance) {
        return ajvInstance;
    }
    ajvInstance = new Ajv({ allErrors: true, strict: false });
    return ajvInstance;
}
|
|
16
|
+
/**
 * Validates the dataset output file found in the sandbox and, when every row
 * passes, uploads it and marks the dataset as completed.
 *
 * Sequence: check that the output file exists, load the dataset record, compile
 * its stored JSON schema, validate the JSONL rows, read the file as base64,
 * upload it through `DatasetService`, then update the dataset status.
 * Every failure is reported through the returned object rather than thrown
 * (errors raised by the helpers other than `ensureFileExists` and schema
 * compilation are not caught here and propagate).
 *
 * @param {object} params
 * @param {unknown} params.runtime - Runtime handle forwarded to the db/sandbox helpers.
 * @param {string} params.datasetId - Dataset whose output is being persisted.
 * @param {string} params.sandboxId - Sandbox holding the output file.
 * @param {string} [params.summary] - Optional text that is only logged.
 * @returns {Promise<object>} Result with `success`, `status`, `validRows`,
 *   `rowRecordCount`, and either `fileId`/`storagePath` (success) or
 *   `error`/`message` plus validation details (failure).
 */
export async function persistDatasetStep({ runtime, datasetId, sandboxId, summary }) {
    // NOTE(review): directive string kept as the first statement; presumably
    // consumed by the workflow build tooling — confirm before moving it.
    "use step";
    const tag = `[Dataset ${datasetId}]`;
    // Single place that shapes failure results so every path exposes the same keys.
    const failure = (status, error, overrides = {}) => ({
        success: false,
        status,
        validRows: 0,
        rowRecordCount: 0,
        validation: [],
        ...overrides,
        error,
        message: error,
    });
    const outputPath = getDatasetOutputPath(datasetId);
    if (summary) {
        console.log(`${tag} Persisting completed dataset: ${summary}`);
    }
    try {
        await ensureFileExists(runtime, sandboxId, outputPath);
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        console.error(`${tag} Missing output file:`, message);
        return failure("missing_output", message);
    }
    console.log(`${tag} Validating dataset rows against schema`);
    const db = await getDatasetRuntimeDb(runtime);
    const service = new DatasetService(db);
    const datasetResult = await service.getDatasetById(datasetId);
    if (!datasetResult.ok) {
        console.error(`${tag} ${datasetResult.error}`);
        return failure("dataset_not_found", datasetResult.error);
    }
    const datasetRecord = datasetResult.data;
    if (!datasetRecord.schema) {
        console.error(`${tag} Schema not found in database`);
        return failure("schema_missing", "Schema not found in database. Please generate schema first.");
    }
    const schemaJson = datasetRecord.schema.schema;
    let validator;
    try {
        validator = getAjv().compile(schemaJson);
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        console.error(`${tag} Failed to compile schema:`, message);
        return failure("schema_invalid", `Failed to compile schema: ${message}`);
    }
    const validationResult = await validateJsonlRows({
        runtime,
        sandboxId,
        outputPath,
        validator,
        schema: schemaJson,
        datasetId,
    });
    if (!validationResult.success) {
        // The validator reports `validRowCount`; mirror it as `validRows` so this
        // failure exposes the same counter key as every other return path.
        return {
            ...validationResult,
            validRows: validationResult.validRowCount ?? 0,
        };
    }
    const totalValidRows = validationResult.validRowCount ?? 0;
    const rowRecordCount = validationResult.rowRecordCount ?? totalValidRows;
    const validatedCounts = {
        validRows: totalValidRows,
        rowRecordCount,
        validation: validationResult.validation,
    };
    console.log(`${tag} Reading file content for upload`);
    const fileRead = await readDatasetSandboxFileStep({ runtime, sandboxId, path: outputPath });
    if (!fileRead.contentBase64) {
        console.error(`${tag} Empty file content`);
        return failure("empty_output", "Empty file content");
    }
    console.log(`${tag} Uploading file to InstantDB storage`);
    const uploadResult = await service.uploadDatasetOutputFile({
        datasetId,
        fileBuffer: Buffer.from(fileRead.contentBase64, "base64"),
    });
    if (!uploadResult.ok) {
        console.error(`${tag} File upload failed: ${uploadResult.error}`);
        return failure("upload_failed", uploadResult.error, validatedCounts);
    }
    console.log(`${tag} File uploaded successfully: ${uploadResult.data.fileId}`);
    const statusResult = await service.updateDatasetStatus({
        datasetId,
        status: "completed",
        calculatedTotalRows: totalValidRows,
        actualGeneratedRowCount: totalValidRows,
    });
    if (!statusResult.ok) {
        console.error(`${tag} Failed to update status: ${statusResult.error}`);
        return failure("status_update_failed", statusResult.error, validatedCounts);
    }
    console.log(`${tag} Dataset marked as COMPLETED (${totalValidRows} valid rows)`);
    console.log(`${tag} ========================================`);
    return {
        success: true,
        status: "completed",
        validRows: totalValidRows,
        rowRecordCount,
        fileId: uploadResult.data.fileId,
        storagePath: uploadResult.data.storagePath,
        message: "Dataset creation completed and uploaded to storage",
    };
}
|
|
160
|
+
/**
 * Checks that a regular file exists inside the sandbox by running `test -f`.
 *
 * @param {unknown} runtime - Runtime handle forwarded to the sandbox command step.
 * @param {string} sandboxId - Sandbox in which the command runs.
 * @param {string} path - File path to probe.
 * @returns {Promise<void>} Resolves when the command exits with code 0.
 * @throws {Error} `Required file not found: <path>` for any non-zero exit code.
 */
async function ensureFileExists(runtime, sandboxId, path) {
    const probe = { runtime, sandboxId, cmd: "test", args: ["-f", path] };
    const { exitCode } = await runDatasetSandboxCommandStep(probe);
    if (exitCode === 0) {
        return;
    }
    throw new Error(`Required file not found: ${path}`);
}
|
|
171
|
+
/**
 * Narrows a value to a plain (non-array) object.
 *
 * @param {unknown} value - Any value.
 * @returns {object|null} The same object when `value` is a non-null,
 *   non-array object; otherwise `null`.
 */
function asRecord(value) {
    if (!value || typeof value !== "object") {
        return null;
    }
    if (Array.isArray(value)) {
        return null;
    }
    return value;
}
|
|
176
|
+
/**
 * Appends a property name to a dotted schema path.
 *
 * The previous root special case (`basePath === "$"`) produced exactly the same
 * string as the general case, so a single template covers both.
 *
 * @param {string} basePath - Path of the parent node, e.g. `$` or `$.items[]`.
 * @param {string} key - Property name to append.
 * @returns {string} `<basePath>.<key>`.
 */
function joinSchemaPath(basePath, key) {
    return `${basePath}.${key}`;
}
|
|
179
|
+
/**
 * Walks a JSON-schema-like object and lists the dotted paths of every
 * `required` property it declares.
 *
 * Descends through `properties`, `items` (suffixing the path with `[]`) and the
 * `oneOf` / `anyOf` / `allOf` branches (same path as the parent).
 *
 * `required` is honoured even when the node has no `properties` map, e.g.
 * `allOf: [{ required: ["id"] }]`; previously such entries were dropped.
 *
 * @param {unknown} schema - Schema node to inspect; non-objects are ignored.
 * @param {string} [path="$"] - Path of `schema` within the root schema.
 * @param {string[]} [paths=[]] - Accumulator that recursive calls push into.
 * @returns {string[]} De-duplicated paths in discovery order (the accumulator
 *   itself when `schema` is not an object).
 */
function collectRequiredPaths(schema, path = "$", paths = []) {
    const record = asRecord(schema);
    if (!record) {
        return paths;
    }
    if (Array.isArray(record.required)) {
        for (const key of record.required) {
            // Non-string entries are not valid property names; skip them.
            if (typeof key === "string") {
                paths.push(joinSchemaPath(path, key));
            }
        }
    }
    const properties = asRecord(record.properties);
    if (properties) {
        for (const [key, childSchema] of Object.entries(properties)) {
            collectRequiredPaths(childSchema, joinSchemaPath(path, key), paths);
        }
    }
    if (record.items) {
        collectRequiredPaths(record.items, `${path}[]`, paths);
    }
    for (const keyword of ["oneOf", "anyOf", "allOf"]) {
        if (Array.isArray(record[keyword])) {
            for (const childSchema of record[keyword]) {
                collectRequiredPaths(childSchema, path, paths);
            }
        }
    }
    // Branches of oneOf/anyOf/allOf may repeat a path; report each one once.
    return [...new Set(paths)];
}
|
|
207
|
+
/**
 * Walks a JSON-schema-like object and records every `enum` list together with
 * the dotted path of the node that declares it.
 *
 * Traversal order: the node itself, then each entry of `properties`, then
 * `items` (path suffixed with `[]`), then the `oneOf` / `anyOf` / `allOf`
 * branches, which share the parent's path.
 *
 * @param {unknown} schema - Schema node to inspect; non-objects are ignored.
 * @param {string} [path="$"] - Path of `schema` within the root schema.
 * @param {Array<{path: string, values: unknown[]}>} [constraints=[]] -
 *   Accumulator; it is mutated and returned.
 * @returns {Array<{path: string, values: unknown[]}>} The accumulator.
 */
function collectEnumConstraints(schema, path = "$", constraints = []) {
    const node = asRecord(schema);
    if (!node) {
        return constraints;
    }
    const allowed = node.enum;
    if (Array.isArray(allowed)) {
        constraints.push({ path, values: allowed });
    }
    const children = asRecord(node.properties);
    if (children) {
        Object.entries(children).forEach(([name, child]) => {
            collectEnumConstraints(child, joinSchemaPath(path, name), constraints);
        });
    }
    if (node.items) {
        collectEnumConstraints(node.items, `${path}[]`, constraints);
    }
    const branches = ["oneOf", "anyOf", "allOf"].flatMap((keyword) => {
        const alternatives = node[keyword];
        return Array.isArray(alternatives) ? alternatives : [];
    });
    for (const branch of branches) {
        collectEnumConstraints(branch, path, constraints);
    }
    return constraints;
}
|
|
232
|
+
/**
 * Counts how often each value occurs and returns the most frequent ones.
 *
 * Entries are ordered by descending count; ties are broken with
 * `localeCompare`, so the values are expected to be strings.
 *
 * @param {Iterable<string>} values - Values to tally.
 * @param {number} [maxItems=20] - Maximum number of entries returned.
 * @returns {Array<{value: string, count: number}>} Ranked tallies.
 */
function countValues(values, maxItems = 20) {
    const tally = new Map();
    for (const item of values) {
        const seen = tally.get(item);
        tally.set(item, seen === undefined ? 1 : seen + 1);
    }
    const ranked = Array.from(tally, ([value, count]) => ({ value, count }));
    ranked.sort((left, right) => right.count - left.count || left.value.localeCompare(right.value));
    return ranked.slice(0, maxItems);
}
|
|
242
|
+
/**
 * Converts validator error objects into the plain detail records stored on a
 * validation entry.
 *
 * @param {unknown} errors - The validator's error list; anything that is not
 *   an array yields an empty result.
 * @returns {Array<{path: string, keyword: string, message: string,
 *   params: (object|undefined), schemaPath: string}>} One record per error.
 *   `path` falls back to `"$"` and `message` to `"Unknown validation error"`
 *   when the source field is empty or missing; `params` is `undefined` unless
 *   it is a plain object.
 */
function toErrorDetails(errors) {
    if (!Array.isArray(errors)) {
        return [];
    }
    return errors.map((issue) => {
        const params = asRecord(issue.params);
        return {
            path: issue.instancePath ? issue.instancePath : "$",
            keyword: issue.keyword,
            message: issue.message ? issue.message : "Unknown validation error",
            params: params === null ? undefined : params,
            schemaPath: issue.schemaPath,
        };
    });
}
|
|
253
|
+
/**
 * Aggregates per-row validation results into a compact failure report.
 *
 * Only entries with `valid` falsy contribute to the error statistics. The
 * schema is inspected for its top-level keys, required paths and enum
 * constraints so the report can be read without the schema at hand.
 *
 * @param {object} params
 * @param {unknown} params.schema - JSON schema the rows were validated against.
 * @param {Array<object>} params.validation - Per-line validation entries
 *   (`index`, `valid`, optional `errors`, `errorDetails`, `dataKeys`).
 * @param {number} params.rowRecordCount - Number of `type: "row"` records seen.
 * @param {number} params.validRowCount - Number of rows that passed validation.
 * @returns {object} Summary with counts, schema hints (`requiredPaths` capped
 *   at 120, `enumConstraints` at 80), the 20 most frequent errors / enum
 *   failures, and up to 10 sample invalid rows (12 errors each).
 */
function buildValidationFailureSummary(params) {
    const rootSchema = asRecord(params.schema);
    const rootProperties = asRecord(rootSchema?.properties);
    const failedRows = params.validation.filter((row) => !row.valid);
    // Flattens one optional array field across all failed rows.
    const gather = (field) => failedRows.flatMap((row) => row[field] ?? []);
    const messages = gather("errors");
    const observedTopLevelKeys = Array.from(new Set(gather("dataKeys")));
    observedTopLevelKeys.sort((left, right) => left.localeCompare(right));
    const details = gather("errorDetails");
    // Tallies the property named by `paramName` for every detail of `keyword`.
    const tallyProperty = (keyword, paramName) => {
        const names = [];
        for (const detail of details) {
            if (detail.keyword === keyword) {
                names.push(String(detail.params?.[paramName] ?? "unknown"));
            }
        }
        return countValues(names).map(({ value, count }) => ({ property: value, count }));
    };
    const missingRequiredProperties = tallyProperty("required", "missingProperty");
    const additionalProperties = tallyProperty("additionalProperties", "additionalProperty");
    // Enum failures are grouped by path plus the serialized allowed-value list.
    const enumFailureCounts = new Map();
    for (const detail of details) {
        if (detail.keyword !== "enum") {
            continue;
        }
        const allowed = detail.params?.allowedValues;
        const groupKey = `${detail.path}:${JSON.stringify(allowed ?? [])}`;
        const previous = enumFailureCounts.get(groupKey);
        enumFailureCounts.set(groupKey, {
            path: detail.path,
            allowedValues: Array.isArray(allowed) ? allowed : [],
            count: (previous?.count ?? 0) + 1,
        });
    }
    const expectedTopLevelKeys = rootProperties ? Object.keys(rootProperties) : [];
    const requiredTopLevelKeys = Array.isArray(rootSchema?.required)
        ? rootSchema.required.filter((name) => typeof name === "string")
        : [];
    const requiredPaths = collectRequiredPaths(params.schema).slice(0, 120);
    const enumConstraints = collectEnumConstraints(params.schema).slice(0, 80);
    const topErrors = countValues(messages, 20).map(({ value, count }) => ({
        message: value,
        count,
    }));
    const enumFailures = [...enumFailureCounts.values()]
        .sort((left, right) => right.count - left.count || left.path.localeCompare(right.path))
        .slice(0, 20);
    const sampleInvalidRows = failedRows.slice(0, 10).map((row) => ({
        index: row.index,
        dataKeys: row.dataKeys,
        errors: row.errors?.slice(0, 12),
    }));
    return {
        rowRecordCount: params.rowRecordCount,
        validRowCount: params.validRowCount,
        invalidRowCount: failedRows.length,
        expectedTopLevelKeys,
        requiredTopLevelKeys,
        requiredPaths,
        enumConstraints,
        topErrors,
        missingRequiredProperties,
        additionalProperties,
        enumFailures,
        observedTopLevelKeys,
        sampleInvalidRows,
    };
}
|
|
307
|
+
/**
 * Produces the ordered list of repair hints that accompanies a validation
 * failure.
 *
 * Four baseline hints are always present. A "regenerate from scratch" hint is
 * placed first when rows exist but none validated, and hints about unexpected
 * or missing properties are appended when the summary lists any.
 *
 * @param {object} summary - Failure summary; reads `validRowCount`,
 *   `rowRecordCount`, `additionalProperties` and `missingRequiredProperties`.
 * @returns {string[]} Instruction sentences in the order they should be read.
 */
function buildRepairInstructions(summary) {
    const everyRowFailed = summary.validRowCount === 0 && summary.rowRecordCount > 0;
    const hasUnexpectedKeys = summary.additionalProperties.length > 0;
    const hasMissingKeys = summary.missingRequiredProperties.length > 0;
    return [
        ...(everyRowFailed
            ? ["All produced row records failed validation; treat the previous output as structurally invalid and regenerate it from scratch."]
            : []),
        "Rewrite output.jsonl using the schema as the source of truth. Do not use source file headers as JSON keys unless they exactly match schema property names.",
        "Each non-empty line must be a JSON object shaped as {\"type\":\"row\",\"data\":{...}}.",
        "Populate every required top-level and nested required path from failureSummary.requiredPaths.",
        "For enum fields, emit exactly one allowed literal from failureSummary.enumConstraints or failureSummary.enumFailures.",
        ...(hasUnexpectedKeys
            ? ["Remove unexpected data keys listed in failureSummary.additionalProperties; map their values into schema keys instead."]
            : []),
        ...(hasMissingKeys
            ? ["Add the missing required properties listed in failureSummary.missingRequiredProperties to each affected row."]
            : []),
    ];
}
|
|
325
|
+
/**
 * Caps the validation entries returned to the caller at 50.
 *
 * Invalid entries are preferred: when at least one exists only invalid entries
 * are sampled; otherwise the full list is sampled.
 *
 * @param {Array<{valid: boolean}>} validation - Per-line validation entries.
 * @returns {{validation: Array<object>, validationTruncated: number}} The
 *   sample and the number of entries left out of it.
 */
function validationOutputSample(validation) {
    const limit = 50;
    const failures = validation.filter(({ valid }) => !valid);
    const pool = failures.length === 0 ? validation : failures;
    const overflow = pool.length - limit;
    return {
        validation: pool.slice(0, limit),
        validationTruncated: overflow > 0 ? overflow : 0,
    };
}
|
|
334
|
+
/**
 * Reads the JSONL output file from the sandbox and validates every row's
 * `data` payload with the compiled schema validator.
 *
 * Blank lines are skipped. Each remaining line must parse as JSON and be an
 * object with `type === "row"` and a non-null `data` field; `index` in the
 * reported entries is the zero-based line index in the file. The run succeeds
 * only when at least one row record exists and no line was rejected.
 *
 * A line whose JSON value is `null` (or any other non-object) is recorded as an
 * invalid line instead of raising a TypeError while reading `record.type`.
 *
 * @param {object} params
 * @param {unknown} params.runtime - Runtime handle forwarded to the sandbox read step.
 * @param {string} params.sandboxId - Sandbox holding the file.
 * @param {string} params.outputPath - Path of the JSONL file.
 * @param {Function} params.validator - Compiled validator; called with `data`
 *   and exposing `errors` after a failed call.
 * @param {unknown} params.schema - Schema used to build the failure summary.
 * @param {string} params.datasetId - Used for log prefixes only.
 * @returns {Promise<object>} `{ success: true, status: "completed", validation,
 *   validRowCount, rowRecordCount }` or a failure object with status
 *   `"empty_output"` / `"validation_failed"` plus diagnostics.
 */
async function validateJsonlRows({ runtime, sandboxId, outputPath, validator, schema, datasetId }) {
    const validation = [];
    let validRowCount = 0;
    let rowRecordCount = 0;
    // Records a rejected line; `extra` carries optional structured diagnostics.
    const reject = (index, errors, extra = {}) => {
        validation.push({ index, valid: false, errors, ...extra });
    };
    console.log(`[Dataset ${datasetId}] Reading and validating JSONL file from sandbox`);
    const fileRead = await readDatasetSandboxTextFileStep({ runtime, sandboxId, path: outputPath });
    if (!fileRead.content) {
        console.log(`[Dataset ${datasetId}] Empty output file`);
        return {
            success: false,
            status: "empty_output",
            validation,
            validRowCount: 0,
            rowRecordCount: 0,
            error: "output.jsonl is empty",
            message: "output.jsonl is empty",
        };
    }
    const lines = fileRead.content.split("\n");
    console.log(`[Dataset ${datasetId}] Validating ${lines.length} lines`);
    for (let index = 0; index < lines.length; index++) {
        // trim() also drops a trailing "\r" from CRLF files.
        const trimmed = lines[index].trim();
        if (trimmed.length === 0) {
            continue;
        }
        let record;
        try {
            record = JSON.parse(trimmed);
        }
        catch (error) {
            const message = error instanceof Error ? error.message : String(error);
            reject(index, [`Invalid JSON: ${message}`]);
            continue;
        }
        // JSON.parse can yield null or a primitive; only objects can be row records.
        const isRowRecord = record !== null && typeof record === "object" && record.type === "row";
        if (!isRowRecord) {
            reject(index, ["Every non-empty output line must be a JSON object with type 'row'"]);
            continue;
        }
        rowRecordCount++;
        const data = record.data;
        if (data === undefined || data === null) {
            reject(index, ["Missing 'data' field"]);
            continue;
        }
        if (!validator(data)) {
            const errorDetails = toErrorDetails(validator.errors);
            const errors = errorDetails.length > 0
                ? errorDetails.map((err) => err.message || "Unknown validation error")
                : ["Unknown validation error"];
            reject(index, errors, {
                errorDetails,
                dataKeys: typeof data === "object" && !Array.isArray(data) ? Object.keys(data) : [],
            });
            continue;
        }
        validation.push({ index, valid: true });
        validRowCount++;
    }
    console.log(`[Dataset ${datasetId}] Validation completed: ${validRowCount} valid rows`);
    const invalidRows = validation.filter((entry) => !entry.valid);
    if (rowRecordCount === 0 || validRowCount === 0 || invalidRows.length > 0) {
        const failureSummary = buildValidationFailureSummary({
            schema,
            validation,
            rowRecordCount,
            validRowCount,
        });
        const repairInstructions = buildRepairInstructions(failureSummary);
        const sampled = validationOutputSample(validation);
        let message;
        if (rowRecordCount === 0) {
            message = "output.jsonl does not contain any type='row' records";
        }
        else if (validRowCount === 0) {
            message = "No dataset rows matched the stored schema";
        }
        else {
            message = `${invalidRows.length} dataset row(s) failed schema validation`;
        }
        console.error(`[Dataset ${datasetId}] Validation failed: ${message}`);
        return {
            success: false,
            status: "validation_failed",
            validation: sampled.validation,
            validationTruncated: sampled.validationTruncated,
            failureSummary,
            repairInstructions,
            validRowCount,
            rowRecordCount,
            error: message,
            message: `${message}. Repair output.jsonl using repairInstructions and failureSummary.`,
        };
    }
    return {
        success: true,
        status: "completed",
        validation,
        validRowCount,
        rowRecordCount,
    };
}
|
|
@@ -7,12 +7,56 @@ export declare function createCompleteDatasetTool({ datasetId, sandboxId, runtim
|
|
|
7
7
|
summary: string;
|
|
8
8
|
}, {
|
|
9
9
|
success: boolean;
|
|
10
|
-
validation?:
|
|
10
|
+
validation?: {
|
|
11
11
|
index: number;
|
|
12
12
|
valid: boolean;
|
|
13
13
|
errors?: string[];
|
|
14
|
+
errorDetails?: Array<{
|
|
15
|
+
path: string;
|
|
16
|
+
keyword: string;
|
|
17
|
+
message: string;
|
|
18
|
+
params?: Record<string, unknown>;
|
|
19
|
+
schemaPath?: string;
|
|
20
|
+
}>;
|
|
14
21
|
dataKeys?: string[];
|
|
15
|
-
}
|
|
22
|
+
}[];
|
|
23
|
+
validationTruncated?: number;
|
|
24
|
+
failureSummary?: {
|
|
25
|
+
rowRecordCount: number;
|
|
26
|
+
validRowCount: number;
|
|
27
|
+
invalidRowCount: number;
|
|
28
|
+
expectedTopLevelKeys: string[];
|
|
29
|
+
requiredTopLevelKeys: string[];
|
|
30
|
+
requiredPaths: string[];
|
|
31
|
+
enumConstraints: Array<{
|
|
32
|
+
path: string;
|
|
33
|
+
values: unknown[];
|
|
34
|
+
}>;
|
|
35
|
+
topErrors: Array<{
|
|
36
|
+
message: string;
|
|
37
|
+
count: number;
|
|
38
|
+
}>;
|
|
39
|
+
missingRequiredProperties: Array<{
|
|
40
|
+
property: string;
|
|
41
|
+
count: number;
|
|
42
|
+
}>;
|
|
43
|
+
additionalProperties: Array<{
|
|
44
|
+
property: string;
|
|
45
|
+
count: number;
|
|
46
|
+
}>;
|
|
47
|
+
enumFailures: Array<{
|
|
48
|
+
path: string;
|
|
49
|
+
allowedValues: unknown[];
|
|
50
|
+
count: number;
|
|
51
|
+
}>;
|
|
52
|
+
observedTopLevelKeys: string[];
|
|
53
|
+
sampleInvalidRows: Array<{
|
|
54
|
+
index: number;
|
|
55
|
+
dataKeys?: string[];
|
|
56
|
+
errors?: string[];
|
|
57
|
+
}>;
|
|
58
|
+
};
|
|
59
|
+
repairInstructions?: string[];
|
|
16
60
|
validRowCount?: number;
|
|
17
61
|
rowRecordCount?: number;
|
|
18
62
|
error?: string;
|
|
@@ -27,6 +71,13 @@ export declare function createCompleteDatasetTool({ datasetId, sandboxId, runtim
|
|
|
27
71
|
index: number;
|
|
28
72
|
valid: boolean;
|
|
29
73
|
errors?: string[];
|
|
74
|
+
errorDetails?: Array<{
|
|
75
|
+
path: string;
|
|
76
|
+
keyword: string;
|
|
77
|
+
message: string;
|
|
78
|
+
params?: Record<string, unknown>;
|
|
79
|
+
schemaPath?: string;
|
|
80
|
+
}>;
|
|
30
81
|
dataKeys?: string[];
|
|
31
82
|
}[] | undefined;
|
|
32
83
|
error: string;
|