@ekairos/dataset 1.22.92-beta.development.0 → 1.22.94-beta.development.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,6 @@
1
1
  import { DatasetService } from "../service.js";
2
2
  import { datasetDomain } from "../schema.js";
3
+ import { verifyDatasetNotation } from "../notation.js";
3
4
  import { datasetGetByIdStep, datasetPreviewRowsStep, datasetReadOneStep, datasetReadRowsStep, } from "../dataset/steps.js";
4
5
  import { inferDatasetSchema, validateRows } from "./schemaInference.js";
5
6
  import { rowsToJsonl } from "./rows.js";
@@ -77,6 +78,21 @@ export async function materializeRowsToDataset(runtime, params) {
77
78
  if (!statusResult.ok) {
78
79
  throw new Error(statusResult.error);
79
80
  }
81
+ // verify the latest formal notation (if any was proposed) against the
82
+ // materialized rows — informative only, never blocks the build
83
+ try {
84
+ const existing = await service.getDatasetById(params.datasetId);
85
+ const notation = (existing.ok ? existing.data?.notation : null);
86
+ if (notation && Array.isArray(notation.predicates) && notation.predicates.length > 0) {
87
+ await service.updateDatasetNotation({
88
+ datasetId: params.datasetId,
89
+ notation: verifyDatasetNotation(notation, params.rows),
90
+ });
91
+ }
92
+ }
93
+ catch {
94
+ // notation verification must never affect the build result
95
+ }
80
96
  return params.datasetId;
81
97
  }
82
98
  export async function uploadInlineTextResource(runtime, datasetId, resource) {
@@ -1,5 +1,6 @@
1
1
  import Ajv from "ajv";
2
2
  import { getDatasetOutputPath } from "./datasetFiles.js";
3
+ import { verifyDatasetNotation } from "./notation.js";
3
4
  import { DatasetService } from "./service.js";
4
5
  import { getDatasetRuntimeDb } from "./dataset/steps.js";
5
6
  import { readDatasetSandboxFileStep, readDatasetSandboxTextFileStep, runDatasetSandboxCommandStep, } from "./sandbox/steps.js";
@@ -176,6 +177,19 @@ export async function persistDatasetStep({ runtime, datasetId, sandboxId, summar
176
177
  }
177
178
  console.log(`[Dataset ${datasetId}] Dataset marked as COMPLETED (${totalValidRows} valid rows)`);
178
179
  console.log(`[Dataset ${datasetId}] ========================================`);
180
+ // Formal-notation verification: arithmetic checks of the latest notation
181
+ // against the produced rows. Informative only — a failure here never
182
+ // affects the dataset completion result.
183
+ try {
184
+ await verifyNotationAgainstJsonl({
185
+ service,
186
+ datasetId,
187
+ jsonlBase64: fileRead.contentBase64,
188
+ });
189
+ }
190
+ catch (error) {
191
+ console.error(`[Dataset ${datasetId}] notation verification skipped:`, error instanceof Error ? error.message : String(error));
192
+ }
179
193
  return {
180
194
  success: true,
181
195
  status: "completed",
@@ -187,6 +201,40 @@ export async function persistDatasetStep({ runtime, datasetId, sandboxId, summar
187
201
  dataFileId: uploadResult.data.fileId,
188
202
  };
189
203
  }
204
/** Maximum number of parsed rows kept in memory for notation verification. */
const NOTATION_VERIFY_MAX_ROWS = 50000;
/**
 * Verify the stored notation of a dataset against the rows of its JSONL output
 * and persist the verification result on the dataset.
 *
 * Does nothing when the dataset has no notation or the notation has no
 * predicates. Lines that are not valid JSON, or whose `type` is not "row",
 * are ignored.
 *
 * When the file holds more than NOTATION_VERIFY_MAX_ROWS rows the verification
 * is skipped: dataset-level predicates (row count, aggregates, uniqueness)
 * evaluated over a truncated prefix would be persisted as violated/verified
 * even though they were never checked against the full dataset.
 *
 * @param {{ service: any, datasetId: string, jsonlBase64: string }} params
 *   `service` must expose `getDatasetById` and `updateDatasetNotation`;
 *   `jsonlBase64` is the base64-encoded JSONL content.
 * @returns {Promise<void>}
 */
async function verifyNotationAgainstJsonl(params) {
    const existing = await params.service.getDatasetById(params.datasetId);
    const notation = existing.ok ? existing.data?.notation : null;
    if (!notation || !Array.isArray(notation.predicates) || notation.predicates.length === 0) {
        return;
    }
    const rows = [];
    let truncated = false;
    const content = Buffer.from(params.jsonlBase64, "base64").toString("utf-8");
    for (const line of content.split("\n")) {
        const trimmed = line.trim();
        if (!trimmed) {
            continue;
        }
        let parsed;
        try {
            parsed = JSON.parse(trimmed);
        }
        catch {
            // malformed lines were already handled by schema validation
            continue;
        }
        if (!parsed || parsed.type !== "row") {
            continue;
        }
        // Only flag truncation when a row beyond the cap actually exists, so a
        // dataset with exactly NOTATION_VERIFY_MAX_ROWS rows is still verified.
        if (rows.length >= NOTATION_VERIFY_MAX_ROWS) {
            truncated = true;
            break;
        }
        rows.push(parsed.data);
    }
    if (truncated) {
        console.log(`[Dataset ${params.datasetId}] notation verification skipped: more than ${NOTATION_VERIFY_MAX_ROWS} rows`);
        return;
    }
    const verified = verifyDatasetNotation(notation, rows);
    await params.service.updateDatasetNotation({
        datasetId: params.datasetId,
        notation: verified,
    });
    const failed = (verified.checks ?? []).filter((check) => check.status === "failed");
    console.log(`[Dataset ${params.datasetId}] notation v${verified.version} ${verified.status}` +
        (failed.length ? ` (${failed.length} predicados violados)` : ""));
}
190
238
  function resolveExecutionStoragePath(outputPath, datasetId) {
191
239
  const normalized = String(outputPath ?? "").replace(/\\/g, "/");
192
240
  const marker = "/tmp/ekairos/contexts/";
@@ -3,6 +3,7 @@ import { createClearDatasetTool } from "../clearDataset.tool.js";
3
3
  import { createCompleteDatasetTool, didCompleteDatasetSucceed, getDatasetFatalFailure, } from "../completeDataset.tool.js";
4
4
  import { datasetGetByIdStep } from "../dataset/steps.js";
5
5
  import { createExecuteCommandTool } from "../executeCommand.tool.js";
6
+ import { createProposeNotationTool } from "../proposeNotation.tool.js";
6
7
  import { createGenerateSchemaTool } from "./generateSchema.tool.js";
7
8
  import { buildFileDatasetPromptStep, initializeFileParseSandboxStep, } from "./file-dataset.steps.js";
8
9
  import { createDatasetId } from "../id.js";
@@ -151,6 +152,10 @@ function createFileParseContextDefinition(params) {
151
152
  sandboxId,
152
153
  runtime,
153
154
  }),
155
+ proposeNotation: createProposeNotationTool({
156
+ datasetId,
157
+ runtime,
158
+ }),
154
159
  };
155
160
  if (!existingSchema) {
156
161
  actions.generateSchema = createGenerateSchemaTool({
@@ -238,9 +238,21 @@ function buildInstructions(context) {
238
238
  .ele("Action").txt("Review the FilePreview section in Context to understand the file structure").up()
239
239
  .ele("Note").txt("FilePreview contains: TotalRows (total data rows), Metadata (file properties with JSON output), Head (first N raw file lines), Tail (last N lines if present), Mid (middle sample for large files). Each section shows Description, Script (full Python code), Command, Stdout (raw content), Stderr. This allows you to understand the exact file format.").up()
240
240
  .up();
241
+ xml = xml
242
+ .ele("Step", { number: "2", name: "Propose Formal Notation (PLAN FIRST)" })
243
+ .ele("Action").txt("Call proposeNotation with the INITIAL formal definition of the dataset as a set, derived from the file preview: D = { r | r ∈ File ∧ <constraints> } in LaTeX, the symbols it binds (sets, variables, functions) and the predicates every row will satisfy").up()
244
+ .ele("Requirements")
245
+ .ele("Requirement").txt("The notation is your PLANNING artifact: it comes BEFORE the schema and BEFORE any parsing code. The LaTeX that explains the dataset matters more than the code that produces it").up()
246
+ .ele("Requirement").txt("Use set-builder notation, quantifiers and arithmetic in LaTeX (e.g. D = \\{(c, q, p) \\mid q \\in \\mathbb{Z}^{+},\\; p \\in \\mathbb{R}_{\\geq 0}\\})").up()
247
+ .ele("Requirement").txt("Declare every discovered set and variable as a symbol with a one-line meaning").up()
248
+ .ele("Requirement").txt("Give predicates a machine-checkable checkJson whenever the claim is arithmetic (row counts, field types, ranges, uniqueness, aggregates); leave semantic-only claims without checkJson").up()
249
+ .ele("Requirement").txt("ITERATE: every time the analysis discovers a new set, variable, constraint or correction (new columns, unexpected types, excluded sections), call proposeNotation again with the refined notation and the reason. The notation is not definitive — discovery is the point").up()
250
+ .ele("Requirement").txt("Before calling completeDataset, call proposeNotation one last time with final=true so the notation describes EXACTLY the dataset you produced; its checkable predicates will be verified arithmetically against the rows").up()
251
+ .up()
252
+ .up();
241
253
  if (hasProvidedSchema) {
242
254
  xml = xml
243
- .ele("Step", { number: "2", name: "Use Provided Schema" })
255
+ .ele("Step", { number: "3", name: "Use Provided Schema" })
244
256
  .ele("Action").txt("Use the provided schema as the output contract for every row in output.jsonl").up()
245
257
  .ele("Requirements")
246
258
  .ele("Requirement").txt("Every output row must conform exactly to the provided schema").up()
@@ -255,7 +267,7 @@ function buildInstructions(context) {
255
267
  }
256
268
  else {
257
269
  xml = xml
258
- .ele("Step", { number: "2", name: "Generate JSON Schema" })
270
+ .ele("Step", { number: "3", name: "Generate JSON Schema" })
259
271
  .ele("Action").txt("Call generateSchema to create a JSON Schema for a SINGLE DATA RECORD (one row of data)").up()
260
272
  .ele("Requirements")
261
273
  .ele("Requirement").txt("Schema describes ONE DATA RECORD structure only (type: object, not array)").up()
@@ -267,7 +279,7 @@ function buildInstructions(context) {
267
279
  .up();
268
280
  }
269
281
  xml = xml
270
- .ele("Step", { number: "3", name: "Generate Dataset JSONL" })
282
+ .ele("Step", { number: "4", name: "Generate Dataset JSONL" })
271
283
  .ele("Action").txt(`Use executeCommand to parse the file and generate output.jsonl in the dataset workstation`).up()
272
284
  .ele("Requirements")
273
285
  .ele("Requirement").txt("Parse ALL data rows/records from the file (exclude header sections and metadata)").up()
@@ -279,12 +291,13 @@ function buildInstructions(context) {
279
291
  .ele("Requirement").txt("Use descriptive scriptName in snake_case (e.g., 'parse_csv_to_jsonl')").up()
280
292
  .up()
281
293
  .up()
282
- .ele("Step", { number: "4", name: "Complete and Validate" })
283
- .ele("Action").txt("Call completeDataset to validate the dataset").up()
294
+ .ele("Step", { number: "5", name: "Complete and Validate" })
295
+ .ele("Action").txt("Call proposeNotation with final=true (refined to match the produced rows), then call completeDataset to validate the dataset").up()
284
296
  .ele("Behavior").txt("Validates that output.jsonl exists and all records conform to the schema stored in database. Returns success:false with validation details if validation fails. If validation fails, inspect validation errors, rewrite output.jsonl, and call completeDataset again. Do not stop until completeDataset returns success:true.").up()
285
297
  .up()
286
298
  .up()
287
299
  .ele("Rules")
300
+ .ele("Rule").txt("The formal notation (proposeNotation) is the planning artifact: propose it first, iterate it on every discovery, finalize it before completion. The LaTeX explains the dataset; the code merely produces it").up()
288
301
  .ele("Rule").txt("Schema defines ONE DATA RECORD structure (not array, not header)").up()
289
302
  .ele("Rule").txt("Schema property names are authoritative. Never translate or rename keys such as itemName, quantity, or unit into the input language").up()
290
303
  .ele("Rule").txt("Original/input language applies to extracted values only, not to JSON object keys").up()
package/dist/index.d.ts CHANGED
@@ -2,6 +2,7 @@ export * from "./dataset.js";
2
2
  export * from "./contextWorkspace.js";
3
3
  export * from "./domain.js";
4
4
  export * from "./materializeDataset.tool.js";
5
+ export * from "./notation.js";
5
6
  export * from "./schema.js";
6
7
  export * from "./service.js";
7
8
  export { registerFileParseContext } from "./file/file-dataset.agent.js";
package/dist/index.js CHANGED
@@ -2,6 +2,7 @@ export * from "./dataset.js";
2
2
  export * from "./contextWorkspace.js";
3
3
  export * from "./domain.js";
4
4
  export * from "./materializeDataset.tool.js";
5
+ export * from "./notation.js";
5
6
  export * from "./schema.js";
6
7
  export * from "./service.js";
7
8
  export { registerFileParseContext } from "./file/file-dataset.agent.js";
@@ -0,0 +1,163 @@
1
+ /**
2
+ * Formal notation for datasets.
3
+ *
4
+ * A dataset is the materialization of a set defined by formal notation:
5
+ * LaTeX (set-builder, relational algebra, quantified predicates) that
6
+ * EXPLAINS the data — what sets it draws from, what variables it binds,
7
+ * what constraints every row satisfies. The notation is the planning
8
+ * artifact: it starts as a proposal from the first look at the resources
9
+ * and is ITERATED as the analysis discovers new sets, variables and
10
+ * invariants. The final notation describes the produced dataset and its
11
+ * machine-checkable predicates are verified with plain arithmetic over
12
+ * the actual rows (propositional combinations supported).
13
+ *
14
+ * Verification is informative, never blocking: a dataset completes the
15
+ * same way it always did; the notation carries its own verified/violated
16
+ * state alongside.
17
+ */
18
+ export type DatasetNotationSymbolKind = "set" | "variable" | "function" | "constant" | "predicate";
19
+ export type DatasetNotationSymbol = {
20
+ /** plain identifier, e.g. "D", "w", "Orders" */
21
+ name: string;
22
+ /** LaTeX for the symbol, e.g. "\\mathcal{D}" (defaults to the name) */
23
+ latex?: string;
24
+ kind: DatasetNotationSymbolKind;
25
+ description: string;
26
+ };
27
+ export type NotationCmpOp = "=" | "!=" | "<" | "<=" | ">" | ">=";
28
+ /**
29
+ * Machine-checkable claims about the dataset, evaluated with plain
30
+ * arithmetic over the rows. Field access supports dot-paths into nested
31
+ * records ("company.taxId"). Leaf checks are dataset-level propositions;
32
+ * and/or/not/implies compose them propositionally.
33
+ */
34
+ export type NotationCheck = {
35
+ kind: "row_count";
36
+ op: NotationCmpOp;
37
+ value: number;
38
+ } | {
39
+ kind: "field_type";
40
+ field: string;
41
+ type: "number" | "integer" | "string" | "boolean";
42
+ allowNull?: boolean;
43
+ } | {
44
+ kind: "field_range";
45
+ field: string;
46
+ min?: number;
47
+ max?: number;
48
+ } | {
49
+ kind: "field_in";
50
+ field: string;
51
+ values: Array<string | number | boolean>;
52
+ } | {
53
+ kind: "field_nonnull";
54
+ field: string;
55
+ } | {
56
+ kind: "field_matches";
57
+ field: string;
58
+ pattern: string;
59
+ } | {
60
+ kind: "unique";
61
+ fields: string[];
62
+ } | {
63
+ kind: "aggregate";
64
+ fn: "sum" | "count" | "min" | "max" | "avg";
65
+ /** omit for fn = "count" */
66
+ field?: string;
67
+ op: NotationCmpOp;
68
+ value: number;
69
+ /** absolute tolerance for float comparison (default 1e-9) */
70
+ tolerance?: number;
71
+ } | {
72
+ kind: "and";
73
+ checks: NotationCheck[];
74
+ } | {
75
+ kind: "or";
76
+ checks: NotationCheck[];
77
+ } | {
78
+ kind: "not";
79
+ check: NotationCheck;
80
+ } | {
81
+ kind: "implies";
82
+ if: NotationCheck;
83
+ then: NotationCheck;
84
+ };
85
+ export type DatasetNotationPredicate = {
86
+ /** stable id within the notation, e.g. "p1", "rowCount" */
87
+ id: string;
88
+ description: string;
89
+ /** the claim in LaTeX, e.g. "\\forall r \\in D:\\; r.amount > 0" */
90
+ latex: string;
91
+ /** machine-checkable form; absent = semantic-only claim (not verified) */
92
+ check?: NotationCheck;
93
+ };
94
+ export type DatasetNotationCheckResult = {
95
+ predicateId: string;
96
+ status: "passed" | "failed" | "skipped";
97
+ detail?: string;
98
+ };
99
+ export type DatasetNotationRevision = {
100
+ version: number;
101
+ latex: string;
102
+ /** why this revision happened — the discovery that triggered it */
103
+ reason: string;
104
+ at: number;
105
+ };
106
+ export type DatasetNotationStatus = "proposed" | "refined" | "final" | "verified" | "violated";
107
+ export type DatasetNotation = {
108
+ version: number;
109
+ status: DatasetNotationStatus;
110
+ /** the main definition: the dataset as a set, in LaTeX */
111
+ latex: string;
112
+ symbols: DatasetNotationSymbol[];
113
+ predicates: DatasetNotationPredicate[];
114
+ checks?: DatasetNotationCheckResult[];
115
+ verifiedAt?: number;
116
+ history: DatasetNotationRevision[];
117
+ };
118
+ export type NotationRevisionInput = {
119
+ latex: string;
120
+ symbols?: DatasetNotationSymbol[];
121
+ predicates?: DatasetNotationPredicate[];
122
+ reason: string;
123
+ /** "final" marks the notation as describing the produced dataset */
124
+ final?: boolean;
125
+ };
126
+ /**
127
+ * Iterate the notation: every revision bumps the version and appends to
128
+ * history, so the discovery trail (sets/variables found along the way)
129
+ * stays visible.
130
+ */
131
+ export declare function reviseDatasetNotation(previous: DatasetNotation | null | undefined, input: NotationRevisionInput): DatasetNotation;
132
+ /** escape an identifier for use inside \text{} */
133
+ export declare function latexIdentifier(name: string): string;
134
+ type JsonSchemaLike = {
135
+ title?: string;
136
+ schema?: Record<string, any>;
137
+ properties?: Record<string, any>;
138
+ };
139
+ /**
140
+ * A query-backed dataset has a complete deterministic description: the
141
+ * dataset is the image of a known query over a known domain. No model is
142
+ * involved — the notation and its checkable predicates derive mechanically
143
+ * from the query, the inferred schema and the produced row count.
144
+ */
145
+ export declare function inferQueryNotation(params: {
146
+ entityNames: string[];
147
+ rowCount: number;
148
+ schema?: JsonSchemaLike | null;
149
+ explanation?: string;
150
+ }): DatasetNotation;
151
+ type CheckOutcome = {
152
+ ok: boolean;
153
+ detail: string;
154
+ };
155
+ export declare function evaluateNotationCheck(rows: any[], check: NotationCheck): CheckOutcome;
156
+ /**
157
+ * Verify a notation against produced rows. Pure arithmetic — never throws.
158
+ * Predicates without a machine-checkable form are reported as "skipped"
159
+ * (they remain semantic claims). Returns the notation with check results
160
+ * and a verified/violated status.
161
+ */
162
+ export declare function verifyDatasetNotation(notation: DatasetNotation, rows: any[]): DatasetNotation;
163
+ export {};
@@ -0,0 +1,408 @@
1
+ /**
2
+ * Formal notation for datasets.
3
+ *
4
+ * A dataset is the materialization of a set defined by formal notation:
5
+ * LaTeX (set-builder, relational algebra, quantified predicates) that
6
+ * EXPLAINS the data — what sets it draws from, what variables it binds,
7
+ * what constraints every row satisfies. The notation is the planning
8
+ * artifact: it starts as a proposal from the first look at the resources
9
+ * and is ITERATED as the analysis discovers new sets, variables and
10
+ * invariants. The final notation describes the produced dataset and its
11
+ * machine-checkable predicates are verified with plain arithmetic over
12
+ * the actual rows (propositional combinations supported).
13
+ *
14
+ * Verification is informative, never blocking: a dataset completes the
15
+ * same way it always did; the notation carries its own verified/violated
16
+ * state alongside.
17
+ */
18
/**
 * Build the next revision of a dataset notation.
 *
 * The version is the previous version plus one (starting at 1), and a revision
 * entry (version, latex, reason, timestamp) is appended to the history so the
 * trail of earlier revisions is kept. Symbols and predicates fall back to the
 * previous notation's values, then to empty lists, when the input omits them.
 *
 * @param {object|null|undefined} previous - The notation being revised, if any.
 * @param {object} input - The revision: `latex`, `reason`, and optionally
 *   `symbols`, `predicates` and `final`.
 * @returns {object} A new notation object; `previous` is not mutated.
 */
export function reviseDatasetNotation(previous, input) {
    const nextVersion = (previous?.version ?? 0) + 1;
    const earlierRevisions = previous?.history ?? [];
    const timestamp = Date.now();
    let status;
    if (input.final) {
        status = "final";
    }
    else if (previous) {
        status = "refined";
    }
    else {
        status = "proposed";
    }
    return {
        version: nextVersion,
        status,
        latex: input.latex,
        symbols: input.symbols ?? previous?.symbols ?? [],
        predicates: input.predicates ?? previous?.predicates ?? [],
        history: [
            ...earlierRevisions,
            {
                version: nextVersion,
                latex: input.latex,
                reason: input.reason,
                at: timestamp,
            },
        ],
    };
}
40
+ /* ── LaTeX helpers ──────────────────────────────────────────────── */
41
/**
 * Escape an identifier for use inside \text{} and wrap it in \text{...}.
 *
 * The characters # $ % & _ { } are backslash-escaped. Backslash, caret and
 * tilde cannot be escaped that way in text mode (`\\` is a line break, `\^`
 * and `\~` are accents), so they are replaced by their text-symbol commands.
 * Everything is done in a single pass so the braces emitted for those commands
 * are not escaped again.
 *
 * @param {*} name - The identifier; coerced with String().
 * @returns {string} LaTeX of the form `\text{...}`.
 */
export function latexIdentifier(name) {
    const escaped = String(name).replace(/[\\^~#$%&_{}]/g, (char) => {
        if (char === "\\") {
            return "\\textbackslash{}";
        }
        if (char === "^") {
            return "\\textasciicircum{}";
        }
        if (char === "~") {
            return "\\textasciitilde{}";
        }
        return `\\${char}`;
    });
    return `\\text{${escaped}}`;
}
45
/**
 * Map a field type name to the LaTeX of the set its values belong to.
 * Any type other than "number", "integer" or "boolean" maps to the set of
 * strings.
 *
 * @param {string} type - The field type name.
 * @returns {string} LaTeX for the value set.
 */
function latexFieldType(type) {
    switch (type) {
        case "number":
            return "\\mathbb{R}";
        case "integer":
            return "\\mathbb{Z}";
        case "boolean":
            return "\\{\\top,\\bot\\}";
        default:
            return "\\Sigma^{*}";
    }
}
54
+ const JSON_SCHEMA_KEYWORDS = new Set([
55
+ "type",
56
+ "title",
57
+ "description",
58
+ "required",
59
+ "items",
60
+ "additionalProperties",
61
+ ]);
62
+ function schemaProperties(schema) {
63
+ const root = (schema?.schema ?? schema ?? {});
64
+ if (root.properties && typeof root.properties === "object") {
65
+ return root.properties;
66
+ }
67
+ // flat shape from query inference: { fieldName: "type", ... }
68
+ const flat = {};
69
+ for (const [key, value] of Object.entries(root)) {
70
+ if (JSON_SCHEMA_KEYWORDS.has(key))
71
+ continue;
72
+ if (typeof value === "string")
73
+ flat[key] = value;
74
+ else if (value && typeof value === "object" && typeof value.type === "string") {
75
+ flat[key] = value;
76
+ }
77
+ }
78
+ return flat;
79
+ }
80
/**
 * Derive a final notation for a query-backed dataset without any model
 * involvement: the definition, symbols and checkable predicates are computed
 * from the entity names, the schema and the row count.
 *
 * The predicates are one cardinality check (row count equals `rowCount`) plus
 * one nullable field-type check per schema field whose type is "number",
 * "integer", "boolean" or "string". When `entityNames` is empty a single
 * source named "Domain" is used.
 *
 * @param {{ entityNames: string[], rowCount: number, schema?: object|null, explanation?: string }} params
 * @returns {object} The notation produced by reviseDatasetNotation with final=true.
 */
export function inferQueryNotation(params) {
    const sourceNames = params.entityNames.length ? params.entityNames : ["Domain"];
    const sourceSymbols = sourceNames.map((name) => ({
        name,
        latex: latexIdentifier(name),
        kind: "set",
        description: `Entidad de origen ${name}`,
    }));
    const unionLatex = sourceSymbols.map((symbol) => symbol.latex).join(" \\cup ");
    const definitionLatex = `\\mathcal{D} = \\left\\{\\, r \\;\\middle|\\; r \\in Q\\!\\left(${unionLatex}\\right) \\right\\}`;
    const checkableTypes = ["number", "integer", "boolean", "string"];
    const cardinalityPredicate = {
        id: "cardinality",
        description: `El dataset tiene exactamente ${params.rowCount} filas`,
        latex: `|\\mathcal{D}| = ${params.rowCount}`,
        check: { kind: "row_count", op: "=", value: params.rowCount },
    };
    const typePredicates = Object.entries(schemaProperties(params.schema))
        .map(([field, raw]) => [field, typeof raw === "string" ? raw : String(raw?.type ?? "")])
        .filter(([, type]) => checkableTypes.includes(type))
        .map(([field, type]) => ({
            id: `type_${field}`,
            description: `Toda fila tiene ${field} de tipo ${type} (o nulo)`,
            latex: `\\forall r \\in \\mathcal{D}:\\; r.${latexIdentifier(field)} \\in ${latexFieldType(type)} \\cup \\{\\varnothing\\}`,
            check: {
                kind: "field_type",
                field,
                type: type,
                allowNull: true,
            },
        }));
    const datasetSymbol = {
        name: "D",
        latex: "\\mathcal{D}",
        kind: "set",
        description: params.explanation?.trim() || "Dataset materializado",
    };
    const querySymbol = {
        name: "Q",
        kind: "function",
        description: "Consulta InstaQL aplicada al dominio",
    };
    return reviseDatasetNotation(null, {
        latex: definitionLatex,
        symbols: [datasetSymbol, querySymbol, ...sourceSymbols],
        predicates: [cardinalityPredicate, ...typePredicates],
        reason: "Notación determinística derivada de la consulta al dominio",
        final: true,
    });
}
142
+ /* ── arithmetic evaluation ──────────────────────────────────────── */
143
/**
 * Read a value from a row by dot-separated path (e.g. "company.taxId").
 *
 * Walks one segment at a time; if the value reached before reading a segment
 * is null or undefined, the result is undefined. The value found at the last
 * segment is returned unchanged (it may itself be null).
 *
 * @param {*} row - The record to read from.
 * @param {*} path - The dot-path; coerced with String().
 * @returns {*} The value at the path, or undefined when the path breaks off.
 */
function readPath(row, path) {
    const segments = String(path).split(".");
    let cursor = row;
    for (let index = 0; index < segments.length; index += 1) {
        if (cursor == null) {
            return undefined;
        }
        cursor = cursor[segments[index]];
    }
    return cursor;
}
152
/**
 * Compare two numbers with an absolute tolerance.
 *
 * Values within `tolerance` of each other are treated as equal by every
 * operator, so the operators stay mutually consistent: "<" is the exact
 * negation of ">=", ">" of "<=", and "!=" of "=". With the default tolerance
 * of 0 these are the plain numeric comparisons.
 *
 * @param {string} op - One of "=", "!=", "<", "<=", ">", ">=".
 * @param {number} left - Left operand.
 * @param {number} right - Right operand.
 * @param {number} [tolerance=0] - Absolute tolerance.
 * @returns {boolean} The comparison result; false for an unrecognized operator.
 */
function compare(op, left, right, tolerance = 0) {
    switch (op) {
        case "=":
            return Math.abs(left - right) <= tolerance;
        case "!=":
            return Math.abs(left - right) > tolerance;
        case "<":
            // strictly below the tolerance band around `right`
            return left < right - tolerance;
        case "<=":
            return left <= right + tolerance;
        case ">":
            // strictly above the tolerance band around `right`
            return left > right + tolerance;
        case ">=":
            return left >= right - tolerance;
        default:
            return false;
    }
}
168
/**
 * Evaluate one machine-checkable claim against the rows of a dataset.
 *
 * Leaf checks (row_count, field_type, field_range, field_in, field_nonnull,
 * field_matches, unique, aggregate) are evaluated over all rows; and / or /
 * not / implies combine the outcomes of nested checks. Fields are read with
 * dot-paths. Several per-row checks skip values they cannot judge:
 * field_range skips non-numbers, field_in skips null/undefined and
 * field_matches skips non-strings.
 *
 * @param {any[]} rows - The dataset rows.
 * @param {object} check - The check to evaluate (discriminated by `kind`).
 * @returns {{ ok: boolean, detail: string }|undefined} The outcome with a
 *   human-readable detail; undefined when `check.kind` is not recognized.
 */
export function evaluateNotationCheck(rows, check) {
    switch (check.kind) {
        case "row_count": {
            const ok = compare(check.op, rows.length, check.value);
            return { ok, detail: `|D| = ${rows.length} ${check.op} ${check.value}` };
        }
        case "field_type": {
            let failures = 0;
            let firstFailure = "";
            for (const row of rows) {
                const value = readPath(row, check.field);
                if (value === null || value === undefined) {
                    if (check.allowNull) {
                        continue;
                    }
                    failures += 1;
                    if (!firstFailure) {
                        firstFailure = "null";
                    }
                    continue;
                }
                let okValue;
                if (check.type === "number") {
                    okValue = typeof value === "number" && Number.isFinite(value);
                }
                else if (check.type === "integer") {
                    okValue = typeof value === "number" && Number.isInteger(value);
                }
                else if (check.type === "boolean") {
                    okValue = typeof value === "boolean";
                }
                else {
                    okValue = typeof value === "string";
                }
                if (!okValue) {
                    failures += 1;
                    if (!firstFailure) {
                        // keep the example short; JSON.stringify may yield undefined
                        firstFailure = JSON.stringify(value)?.slice(0, 40) ?? "?";
                    }
                }
            }
            return {
                ok: failures === 0,
                detail: failures === 0
                    ? `∀r: ${check.field} : ${check.type}`
                    : `${failures}/${rows.length} filas violan ${check.field} : ${check.type} (ej: ${firstFailure})`,
            };
        }
        case "field_range": {
            let failures = 0;
            for (const row of rows) {
                const value = readPath(row, check.field);
                if (typeof value !== "number" || Number.isNaN(value)) {
                    continue;
                }
                if (check.min !== undefined && value < check.min) {
                    failures += 1;
                }
                else if (check.max !== undefined && value > check.max) {
                    failures += 1;
                }
            }
            const bounds = [
                check.min !== undefined ? `≥ ${check.min}` : "",
                check.max !== undefined ? `≤ ${check.max}` : "",
            ]
                .filter(Boolean)
                .join(" ∧ ");
            return {
                ok: failures === 0,
                detail: failures === 0
                    ? `∀r: ${check.field} ${bounds}`
                    : `${failures}/${rows.length} filas fuera de rango en ${check.field}`,
            };
        }
        case "field_in": {
            // compare by JSON text so 1 and "1" stay distinct
            const allowed = new Set(check.values.map((value) => JSON.stringify(value)));
            let failures = 0;
            for (const row of rows) {
                const value = readPath(row, check.field);
                if (value === null || value === undefined) {
                    continue;
                }
                if (!allowed.has(JSON.stringify(value))) {
                    failures += 1;
                }
            }
            return {
                ok: failures === 0,
                detail: failures === 0
                    ? `∀r: ${check.field} ∈ {${check.values.join(", ")}}`
                    : `${failures}/${rows.length} filas con ${check.field} fuera del conjunto`,
            };
        }
        case "field_nonnull": {
            let failures = 0;
            for (const row of rows) {
                const value = readPath(row, check.field);
                if (value === null || value === undefined || value === "") {
                    failures += 1;
                }
            }
            return {
                ok: failures === 0,
                detail: failures === 0
                    ? `∀r: ${check.field} ≠ ∅`
                    : `${failures}/${rows.length} filas con ${check.field} vacío`,
            };
        }
        case "field_matches": {
            let regex;
            try {
                regex = new RegExp(check.pattern);
            }
            catch {
                return { ok: false, detail: `patrón inválido: ${check.pattern}` };
            }
            let failures = 0;
            for (const row of rows) {
                const value = readPath(row, check.field);
                if (typeof value !== "string") {
                    continue;
                }
                if (!regex.test(value)) {
                    failures += 1;
                }
            }
            return {
                ok: failures === 0,
                detail: failures === 0
                    ? `∀r: ${check.field} ~ /${check.pattern}/`
                    : `${failures}/${rows.length} filas no matchean /${check.pattern}/`,
            };
        }
        case "unique": {
            const seen = new Set();
            let duplicates = 0;
            for (const row of rows) {
                const key = JSON.stringify(check.fields.map((field) => readPath(row, field)));
                if (seen.has(key)) {
                    duplicates += 1;
                }
                else {
                    seen.add(key);
                }
            }
            return {
                ok: duplicates === 0,
                detail: duplicates === 0
                    ? `(${check.fields.join(", ")}) es clave`
                    : `${duplicates} duplicados sobre (${check.fields.join(", ")})`,
            };
        }
        case "aggregate": {
            // count without a field counts rows; no per-row values are needed
            const countsRows = check.fn === "count" && !check.field;
            const values = [];
            if (!countsRows) {
                for (const row of rows) {
                    const value = readPath(row, String(check.field));
                    if (typeof value === "number" && Number.isFinite(value)) {
                        values.push(value);
                    }
                }
            }
            let actual;
            switch (check.fn) {
                case "count":
                    actual = check.field ? values.length : rows.length;
                    break;
                case "sum":
                    actual = values.reduce((total, value) => total + value, 0);
                    break;
                case "min":
                    // fold instead of Math.min(...values): spreading one argument
                    // per row throws RangeError on large datasets
                    actual = values.length
                        ? values.reduce((lowest, value) => Math.min(lowest, value), Infinity)
                        : Number.NaN;
                    break;
                case "max":
                    actual = values.length
                        ? values.reduce((highest, value) => Math.max(highest, value), -Infinity)
                        : Number.NaN;
                    break;
                case "avg":
                    actual = values.length
                        ? values.reduce((total, value) => total + value, 0) / values.length
                        : Number.NaN;
                    break;
            }
            const tolerance = check.tolerance ?? 1e-9;
            // an aggregate over no numeric values is never satisfied
            const ok = Number.isFinite(actual) && compare(check.op, actual, check.value, tolerance);
            return {
                ok,
                detail: `${check.fn}(${check.field ?? "*"}) = ${Number.isFinite(actual) ? actual : "∅"} ${check.op} ${check.value}`,
            };
        }
        case "and": {
            const results = check.checks.map((inner) => evaluateNotationCheck(rows, inner));
            return {
                ok: results.every((result) => result.ok),
                detail: results.map((result) => result.detail).join(" ∧ "),
            };
        }
        case "or": {
            const results = check.checks.map((inner) => evaluateNotationCheck(rows, inner));
            return {
                ok: results.some((result) => result.ok),
                detail: results.map((result) => result.detail).join(" ∨ "),
            };
        }
        case "not": {
            const result = evaluateNotationCheck(rows, check.check);
            return { ok: !result.ok, detail: `¬(${result.detail})` };
        }
        case "implies": {
            const antecedent = evaluateNotationCheck(rows, check.if);
            if (!antecedent.ok) {
                // a false antecedent makes the implication hold; skip the consequent
                return { ok: true, detail: `(${antecedent.detail}) → ⊤ (antecedente falso)` };
            }
            const consequent = evaluateNotationCheck(rows, check.then);
            return {
                ok: consequent.ok,
                detail: `(${antecedent.detail}) → (${consequent.detail})`,
            };
        }
    }
}
369
/**
 * Verify a notation against produced rows and return a copy of it carrying
 * the per-predicate results.
 *
 * A predicate without a `check` is reported as "skipped". A predicate whose
 * evaluation throws is reported as "failed" with the (truncated) error text.
 * The returned status is "verified" when no predicate failed and "violated"
 * otherwise; `verifiedAt` is set to the current time.
 *
 * @param {object} notation - The notation to verify; not mutated.
 * @param {any[]} rows - The dataset rows.
 * @returns {object} The notation with `checks`, `status` and `verifiedAt` set.
 */
export function verifyDatasetNotation(notation, rows) {
    const judge = (predicate) => {
        if (!predicate.check) {
            return { predicateId: predicate.id, status: "skipped" };
        }
        try {
            const outcome = evaluateNotationCheck(rows, predicate.check);
            return {
                predicateId: predicate.id,
                status: outcome.ok ? "passed" : "failed",
                detail: outcome.detail,
            };
        }
        catch (error) {
            return {
                predicateId: predicate.id,
                status: "failed",
                detail: `error de evaluación: ${String(error).slice(0, 120)}`,
            };
        }
    };
    const checks = [...(notation.predicates ?? [])].map((predicate) => judge(predicate));
    const anyFailed = checks.some((result) => result.status === "failed");
    return {
        ...notation,
        checks,
        status: anyFailed ? "violated" : "verified",
        verifiedAt: Date.now(),
    };
}
@@ -0,0 +1,42 @@
1
/** Arguments accepted by createProposeNotationTool. */
interface ProposeNotationToolParams {
    datasetId: string;
    runtime: any;
}
/** One symbol bound by the notation, as supplied in the tool input. */
type ProposeNotationSymbolInput = {
    name: string;
    kind: "function" | "set" | "variable" | "constant" | "predicate";
    description: string;
    latex?: string | undefined;
};
/** One claim about the dataset; `checkJson` is its optional machine-checkable form. */
type ProposeNotationPredicateInput = {
    id: string;
    description: string;
    latex: string;
    checkJson?: string | undefined;
};
/** Input accepted by the proposeNotation tool. */
type ProposeNotationInput = {
    latex: string;
    symbols: ProposeNotationSymbolInput[];
    predicates: ProposeNotationPredicateInput[];
    reason: string;
    final?: boolean | undefined;
};
/** Result returned when the call fails. */
type ProposeNotationFailure = {
    success: boolean;
    error: string;
};
/** Result returned when the revision was stored. */
type ProposeNotationSuccess = {
    warning?: string | undefined;
    success: boolean;
    version: number;
    status: import("./notation.js").DatasetNotationStatus;
    error?: undefined;
};
/**
 * proposeNotation — declare or ITERATE the formal notation of the dataset.
 *
 * The notation is the planning artifact: call it FIRST with the initial
 * set definition derived from the resources, and call it AGAIN whenever
 * the analysis discovers new sets, variables, constraints or corrections.
 * Every call appends a revision (the discovery trail is preserved). Mark
 * the last call with final=true so the notation describes the produced
 * dataset; its checkable predicates get verified arithmetically after
 * completion.
 */
export declare function createProposeNotationTool({ datasetId, runtime }: ProposeNotationToolParams): import("ai").Tool<ProposeNotationInput, ProposeNotationFailure | ProposeNotationSuccess>;
export {};
@@ -0,0 +1,142 @@
1
+ import { tool } from "ai";
2
+ import { z } from "zod";
3
+ import { DatasetService } from "./service.js";
4
+ import { datasetDomain } from "./schema.js";
5
+ import { reviseDatasetNotation, } from "./notation.js";
6
/** Field definitions for one symbol bound by the notation. */
const symbolFields = {
    name: z.string().describe("Plain identifier, e.g. 'D', 'Orders', 'w'"),
    latex: z.string().optional().describe("LaTeX for the symbol, e.g. '\\\\mathcal{D}' (defaults to the name)"),
    kind: z.enum(["set", "variable", "function", "constant", "predicate"]),
    description: z.string().describe("What this symbol denotes in the data"),
};
/** Zod schema validating a single notation symbol. */
const symbolSchema = z.object(symbolFields);
15
/**
 * Help text shown to the model for `checkJson`: the supported check shapes,
 * joined into a single space-separated description string.
 */
const checkJsonDescription = [
    "OPTIONAL machine-checkable form of the claim as a JSON string, verified",
    "with plain arithmetic over the produced rows. Shapes:",
    '{"kind":"row_count","op":"=","value":124}',
    '{"kind":"field_type","field":"amount","type":"number","allowNull":true}',
    '{"kind":"field_range","field":"amount","min":0}',
    '{"kind":"field_in","field":"status","values":["paid","void"]}',
    '{"kind":"field_nonnull","field":"orderId"}',
    '{"kind":"field_matches","field":"sku","pattern":"^[A-Z0-9-]+$"}',
    '{"kind":"unique","fields":["orderId"]}',
    '{"kind":"aggregate","fn":"sum","field":"amount","op":">=","value":0}',
    'Propositional composition: {"kind":"and"|"or","checks":[...]},',
    '{"kind":"not","check":...}, {"kind":"implies","if":...,"then":...}.',
    "Fields support dot-paths into nested records (company.taxId).",
    "Omit for claims that are semantic only.",
].join(" ");
/** Zod schema validating a single predicate (claim) of the notation. */
const predicateSchema = z.object({
    id: z.string().describe("Stable id, e.g. 'p1', 'cardinality'"),
    description: z.string().describe("The claim in plain language"),
    latex: z.string().describe("The claim in LaTeX, e.g. '\\\\forall r \\\\in D: r.amount > 0'"),
    checkJson: z.string().optional().describe(checkJsonDescription),
});
41
/**
 * Build a DatasetService on top of the `db` exposed by the runtime's
 * dataset-domain scope (`runtime.use(datasetDomain)`).
 */
async function getDatasetService(runtime) {
    const { db } = await runtime.use(datasetDomain);
    return new DatasetService(db);
}
45
/**
 * Convert one tool-input predicate into its stored form. When `checkJson` is
 * present it is parsed and must be an object with a `kind` key; otherwise the
 * check is dropped and a message is appended to `checkErrors`.
 */
function toStoredPredicate(predicate, checkErrors) {
    const { id, description, latex, checkJson } = predicate;
    let check;
    if (checkJson) {
        try {
            const candidate = JSON.parse(checkJson);
            const wellFormed = candidate && typeof candidate === "object" && "kind" in candidate;
            if (!wellFormed) {
                throw new Error("check must be an object with a 'kind'");
            }
            check = candidate;
        }
        catch (error) {
            checkErrors.push(`predicate ${id}: invalid checkJson (${String(error).slice(0, 80)})`);
        }
    }
    return check ? { id, description, latex, check } : { id, description, latex };
}
/**
 * proposeNotation — declare or ITERATE the formal notation of the dataset.
 *
 * Builds the tool the model calls to record the notation: each call reads the
 * notation currently stored on the dataset, derives the next revision from it
 * with `reviseDatasetNotation`, and persists the result. Predicates whose
 * `checkJson` cannot be parsed are kept without a check and reported back in
 * a `warning`. Failures are returned as `{ success: false, error }` rather
 * than thrown.
 */
export function createProposeNotationTool({ datasetId, runtime }) {
    const description = [
        "Declare or refine the FORMAL NOTATION of the dataset: the dataset as a",
        "set defined in LaTeX (set-builder, relational algebra, quantified",
        "predicates) plus the symbols it binds and the predicates every row",
        "satisfies. This is your PLANNING artifact — propose it before writing",
        "any code, and revise it whenever the analysis discovers new sets,",
        "variables or invariants. The latest final notation is verified",
        "arithmetically against the produced rows (non-blocking).",
    ].join(" ");
    const inputSchema = z.object({
        latex: z
            .string()
            .describe("Main definition of the dataset as a set, in LaTeX. Example: 'D = \\\\{(w,r,t) \\\\mid t = \\\\sum_{o \\\\in Orders} o.amount,\\\\; o.status = paid\\\\}'"),
        symbols: z.array(symbolSchema).describe("Symbols bound by the notation"),
        predicates: z
            .array(predicateSchema)
            .describe("Claims about the dataset; include machine-checkable forms when possible"),
        reason: z
            .string()
            .describe("What discovery triggered this revision (or 'initial proposal')"),
        final: z
            .boolean()
            .optional()
            .describe("true when this notation describes the dataset you are about to complete"),
    });
    const execute = async ({ latex, symbols, predicates, reason, final }) => {
        try {
            const service = await getDatasetService(runtime);
            const lookup = await service.getDatasetById(datasetId);
            const previous = lookup.ok ? lookup.data?.notation : null;
            // collect parse problems instead of failing the whole call
            const checkErrors = [];
            const storedPredicates = [];
            for (const predicate of predicates) {
                storedPredicates.push(toStoredPredicate(predicate, checkErrors));
            }
            const notation = reviseDatasetNotation(previous, {
                latex,
                symbols,
                predicates: storedPredicates,
                reason,
                final,
            });
            const saved = await service.updateDatasetNotation({ datasetId, notation });
            if (!saved.ok) {
                return { success: false, error: saved.error };
            }
            console.log(`[Dataset ${datasetId}] notation v${notation.version} (${notation.status}): ${reason}`);
            const outcome = {
                success: true,
                version: notation.version,
                status: notation.status,
            };
            if (checkErrors.length) {
                outcome.warning = `some checks were dropped: ${checkErrors.join("; ")}`;
            }
            return outcome;
        }
        catch (error) {
            const message = error instanceof Error ? error.message : String(error);
            return { success: false, error: message };
        }
    };
    return tool({ description, inputSchema, execute });
}
@@ -1,5 +1,6 @@
1
1
  import { DatasetService } from "../service.js";
2
2
  import { createDatasetId } from "../id.js";
3
+ import { inferQueryNotation, verifyDatasetNotation } from "../notation.js";
3
4
  function normalizeRows(result) {
4
5
  if (!result || typeof result !== "object")
5
6
  return [];
@@ -61,6 +62,15 @@ export async function queryDomainStep(params) {
61
62
  const rows = normalizeRows(queryResult);
62
63
  const previewRows = rows.slice(0, 20);
63
64
  const schema = inferSchema(rows);
65
+ // query-backed datasets carry a fully deterministic formal notation:
66
+ // the set definition, its symbols and its checkable predicates derive
67
+ // mechanically from the query + rows; verification is immediate
68
+ const notation = verifyDatasetNotation(inferQueryNotation({
69
+ entityNames: Object.keys(params.query ?? {}),
70
+ rowCount: rows.length,
71
+ schema,
72
+ explanation: params.explanation,
73
+ }), rows);
64
74
  const createRes = await service.createDataset({
65
75
  id: datasetId,
66
76
  title: params.title ?? "domain.query",
@@ -68,6 +78,7 @@ export async function queryDomainStep(params) {
68
78
  instructions: params.explanation,
69
79
  analysis: { explanation: params.explanation, query: params.query },
70
80
  schema,
81
+ notation,
71
82
  createdAt: Date.now(),
72
83
  updatedAt: Date.now(),
73
84
  });
package/dist/schema.d.ts CHANGED
@@ -11,6 +11,8 @@ declare const entities: {
11
11
  instructions: import("@instantdb/core").DataAttrDef<string, false, false, false>;
12
12
  analysis: import("@instantdb/core").DataAttrDef<any, false, false, false>;
13
13
  schema: import("@instantdb/core").DataAttrDef<any, false, false, false>;
14
+ /** formal notation (LaTeX + checkable predicates) describing the set */
15
+ notation: import("@instantdb/core").DataAttrDef<any, false, false, false>;
14
16
  calculatedTotalRows: import("@instantdb/core").DataAttrDef<number, false, false, false>;
15
17
  actualGeneratedRowCount: import("@instantdb/core").DataAttrDef<number, false, false, false>;
16
18
  }, {}, void>;
package/dist/schema.js CHANGED
@@ -13,6 +13,8 @@ const entities = {
13
13
  instructions: i.string().optional(),
14
14
  analysis: i.json().optional(),
15
15
  schema: i.json().optional(),
16
+ /** formal notation (LaTeX + checkable predicates) describing the set */
17
+ notation: i.json().optional(),
16
18
  calculatedTotalRows: i.number().optional(),
17
19
  actualGeneratedRowCount: i.number().optional(),
18
20
  }),
package/dist/service.d.ts CHANGED
@@ -47,6 +47,10 @@ export declare class DatasetService {
47
47
  schema: any;
48
48
  status?: string;
49
49
  }): Promise<ServiceResult<void>>;
50
+ updateDatasetNotation(params: {
51
+ datasetId: string;
52
+ notation: Record<string, any>;
53
+ }): Promise<ServiceResult<void>>;
50
54
  updateDatasetStatus(params: {
51
55
  datasetId: string;
52
56
  status: string;
package/dist/service.js CHANGED
@@ -214,6 +214,24 @@ export class DatasetService {
214
214
  return { ok: false, error: message };
215
215
  }
216
216
  }
217
+ async updateDatasetNotation(params) {
218
+ try {
219
+ const resolved = await this.resolveDatasetEntityId(params.datasetId);
220
+ if (!resolved.ok)
221
+ return resolved;
222
+ await this.db.transact([
223
+ this.db.tx.dataset_datasets[resolved.data].update({
224
+ notation: params.notation,
225
+ updatedAt: Date.now(),
226
+ })
227
+ ]);
228
+ return { ok: true, data: undefined };
229
+ }
230
+ catch (error) {
231
+ const message = error instanceof Error ? error.message : String(error);
232
+ return { ok: false, error: message };
233
+ }
234
+ }
217
235
  async updateDatasetStatus(params) {
218
236
  try {
219
237
  const resolved = await this.resolveDatasetEntityId(params.datasetId);
@@ -102,11 +102,15 @@ function buildInstructions(context) {
102
102
  .ele("Action").txt(`Review ContextResources and any InputPreviews to understand current record structures, evidence, fields, shapes and edge cases. ${multipleInputsNote}`).up()
103
103
  .ele("Note").txt("ContextResources DescriptorJson may include inline text, metadata, previewRows, or other visible evidence. Treat that visible content as already available context. Do not use executeCommand only to reread it.").up()
104
104
  .up()
105
- .ele("Step", { number: "2", name: "Plan Mapping" })
105
+ .ele("Step", { number: "2", name: "Propose Formal Notation (PLAN FIRST)" })
106
+ .ele("Action").txt("Call proposeNotation with the formal definition of the OUTPUT dataset as a set derived from the input sets: e.g. D = \\pi_{fields}(\\sigma_{condition}(A \\bowtie B)) or set-builder with quantifiers, in LaTeX. Declare the input sets, bound variables and the predicates every output row satisfies.").up()
107
+ .ele("Note").txt("The notation is the planning artifact and comes BEFORE the transformation: it states which sets you draw from, how they combine (join, filter, project, aggregate) and which arithmetic invariants the output keeps (e.g. totals preserved across the transformation). Give predicates a machine-checkable checkJson whenever the claim is arithmetic (row counts, ranges, uniqueness, aggregates). ITERATE the notation whenever inspection of the inputs reveals new sets, variables or corrections, and call proposeNotation with final=true just before completing — it will be verified arithmetically against the produced rows.").up()
108
+ .up()
109
+ .ele("Step", { number: "3", name: "Plan Mapping" })
106
110
  .ele("Action").txt("Plan a deterministic mapping from input data fields to the output schema fields (normalize names, types, and formats).").up()
107
111
  .ele("Note").txt("If fields are missing, set defaults; if types differ, coerce consistently. When working with multiple inputs, decide how to combine or relate them. Output field names must remain exactly as declared by the output schema.").up()
108
112
  .up()
109
- .ele("Step", { number: "3", name: "Transform" })
113
+ .ele("Step", { number: "4", name: "Transform" })
110
114
  .ele("Action").txt("For single-object output, use completeObject with the final object. For row output, use replaceRows with the final rows. Use executeCommand only when command execution is necessary, not merely convenient.").up()
111
115
  .ele("Requirement").txt("Do not call completeObject until you have constructed the complete data object. completeObject requires data; a summary-only call is invalid and wastes a model iteration.").up()
112
116
  .ele("Requirement").txt("Command execution is necessary only when the final output cannot be produced directly from the provided context, resource descriptors, or previews, and requires running code to inspect, parse, aggregate, join, or compute over files/resources.").up()
@@ -120,12 +124,13 @@ function buildInstructions(context) {
120
124
  .ele("Requirement").txt("Do not print large data to stdout; only progress and summaries.").up()
121
125
  .ele("Requirement").txt("Do not install packages, download dependencies, or access the network from executeCommand. Use only the available runtime and standard library unless a dependency is already present.").up()
122
126
  .up()
123
- .ele("Step", { number: "4", name: "Validate and Complete" })
124
- .ele("Action").txt("When using completeObject or replaceRows, no separate completeDataset call is needed. When using executeCommand, call completeDataset to validate against the output schema and mark as completed.").up()
127
+ .ele("Step", { number: "5", name: "Validate and Complete" })
128
+ .ele("Action").txt("Call proposeNotation with final=true (refined to match the produced output), then: when using completeObject or replaceRows, no separate completeDataset call is needed. When using executeCommand, call completeDataset to validate against the output schema and mark as completed.").up()
125
129
  .ele("Behavior").txt("If any completion tool returns success:false, inspect validation details, repair the output, and call the appropriate completion tool again. Do not stop until a completion tool returns success:true.").up()
126
130
  .up()
127
131
  .up()
128
132
  .ele("Rules")
133
+ .ele("Rule").txt("The formal notation (proposeNotation) is the planning artifact: propose it before transforming, iterate it on every discovery, finalize it before completing. The LaTeX explains the dataset; the code merely produces it.").up()
129
134
  .ele("Rule").txt("Output must strictly match the output schema for each record in data.").up()
130
135
  .ele("Rule").txt("OutputSchema property names are authoritative. Field names are a technical contract; only field values may preserve input language.").up()
131
136
  .ele("Rule").txt("Use the cheapest correct tool. completeObject and replaceRows are low-cost completion tools. executeCommand is a high-cost computation tool and requires an explicit commandDescription.").up()
@@ -4,6 +4,7 @@ import { createCompleteDatasetTool, didCompleteDatasetSucceed, getDatasetFatalFa
4
4
  import { datasetUpdateSchemaStep } from "../dataset/steps.js";
5
5
  import { getDatasetOutputPath } from "../datasetFiles.js";
6
6
  import { createExecuteCommandTool } from "../executeCommand.tool.js";
7
+ import { createProposeNotationTool } from "../proposeNotation.tool.js";
7
8
  import { createCompleteObjectTool, createReplaceRowsTool, } from "../writeDatasetRows.tool.js";
8
9
  import { buildTransformDatasetPromptStep, } from "./transform-dataset.steps.js";
9
10
  import { createDatasetId } from "../id.js";
@@ -136,6 +137,10 @@ function createTransformDatasetContextDefinition(params) {
136
137
  sandboxId,
137
138
  runtime,
138
139
  }),
140
+ proposeNotation: createProposeNotationTool({
141
+ datasetId,
142
+ runtime,
143
+ }),
139
144
  };
140
145
  })
141
146
  .shouldContinue(({ reactionEvent }) => {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ekairos/dataset",
3
- "version": "1.22.92-beta.development.0",
3
+ "version": "1.22.94-beta.development.0",
4
4
  "description": "Pulzar Dataset Tools",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -65,9 +65,9 @@
65
65
  "test:ai-sdk:instant": "vitest run -c vitest.codex.config.mts src/tests/materializeDataset.ai-sdk.instant.test.ts"
66
66
  },
67
67
  "dependencies": {
68
- "@ekairos/domain": "^1.22.92-beta.development.0",
69
- "@ekairos/events": "^1.22.92-beta.development.0",
70
- "@ekairos/sandbox": "^1.22.92-beta.development.0",
68
+ "@ekairos/domain": "^1.22.94-beta.development.0",
69
+ "@ekairos/events": "^1.22.94-beta.development.0",
70
+ "@ekairos/sandbox": "^1.22.94-beta.development.0",
71
71
  "@instantdb/admin": "0.22.158",
72
72
  "@instantdb/core": "0.22.142",
73
73
  "ai": "^5.0.44",