@ekairos/dataset 1.22.37 → 1.22.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,6 @@
1
1
  import { DatasetService } from "../service.js";
2
2
  import { datasetDomain } from "../schema.js";
3
+ import { annotateNotationEvidence, inferQueryNotation, } from "../notation.js";
3
4
  import { datasetGetByIdStep, datasetPreviewRowsStep, datasetReadOneStep, datasetReadRowsStep, } from "../dataset/steps.js";
4
5
  import { inferDatasetSchema, validateRows } from "./schemaInference.js";
5
6
  import { rowsToJsonl } from "./rows.js";
@@ -77,6 +78,40 @@ export async function materializeRowsToDataset(runtime, params) {
77
78
  if (!statusResult.ok) {
78
79
  throw new Error(statusResult.error);
79
80
  }
81
+ // Formal notation, informative only (never blocks the build): a notation
82
+ // proposed during the build (agent iterations) gets advisory evidence
83
+ // against the materialized rows; query-backed builds with no proposed
84
+ // notation get the deterministic one derived from query + schema + rows.
85
+ try {
86
+ const existing = await service.getDatasetById(params.datasetId);
87
+ const previous = (existing.ok ? existing.data?.notation : null);
88
+ const analysis = (params.analysis ?? {});
89
+ const queryNotation = analysis.query && typeof analysis.query === "object"
90
+ ? inferQueryNotation({
91
+ entityNames: Object.keys(analysis.query),
92
+ rowCount: params.rows.length,
93
+ schema: resolvedSchema,
94
+ explanation: typeof analysis.explanation === "string" ? analysis.explanation : undefined,
95
+ })
96
+ : null;
97
+ // Query-backed builds are deterministic, so a freshly inferred notation
98
+ // always wins (a prior run's notation would be stale). Only agent-built
99
+ // datasets (no query) keep the notation the agent proposed during the
100
+ // build, which by now is the latest `previous`.
101
+ const candidate = queryNotation ??
102
+ (previous && Array.isArray(previous.predicates) && previous.predicates.length > 0
103
+ ? previous
104
+ : null);
105
+ if (candidate) {
106
+ await service.updateDatasetNotation({
107
+ datasetId: params.datasetId,
108
+ notation: annotateNotationEvidence(candidate, params.rows),
109
+ });
110
+ }
111
+ }
112
+ catch {
113
+ // notation must never affect the build result
114
+ }
80
115
  return params.datasetId;
81
116
  }
82
117
  export async function uploadInlineTextResource(runtime, datasetId, resource) {
@@ -112,10 +147,12 @@ export async function finalizeBuildResult(runtime, datasetId, withFirst) {
112
147
  });
113
148
  },
114
149
  };
150
+ const notation = (datasetResult.data?.notation ?? null);
115
151
  if (!withFirst) {
116
152
  return {
117
153
  datasetId,
118
154
  dataset: datasetResult.data,
155
+ notation,
119
156
  previewRows: previewResult.rows,
120
157
  reader,
121
158
  };
@@ -124,6 +161,7 @@ export async function finalizeBuildResult(runtime, datasetId, withFirst) {
124
161
  return {
125
162
  datasetId,
126
163
  dataset: datasetResult.data,
164
+ notation,
127
165
  previewRows: previewResult.rows,
128
166
  reader,
129
167
  firstRow: firstResult.row,
@@ -146,6 +184,7 @@ export function createDatasetBuildResult(runtime, params) {
146
184
  return {
147
185
  datasetId: params.datasetId,
148
186
  dataset: params.dataset,
187
+ notation: (params.dataset?.notation ?? null),
149
188
  previewRows: params.previewRows,
150
189
  reader,
151
190
  ...(params.firstRow !== undefined ? { firstRow: params.firstRow } : {}),
@@ -3,6 +3,7 @@ import type { DomainInstantSchema, DomainSchemaResult } from "@ekairos/domain";
3
3
  import type { EkairosRuntime, RuntimeForDomain } from "@ekairos/domain/runtime";
4
4
  import type { ContextIdentifier, ContextReactor, StoredContextResource } from "@ekairos/events";
5
5
  import { datasetDomain } from "../schema.js";
6
+ import type { DatasetNotation } from "../notation.js";
6
7
  export type DatasetQueryResourceInput<D extends DomainSchemaResult = DomainSchemaResult> = {
7
8
  query: InstaQLParams<DomainInstantSchema<D>>;
8
9
  title?: string;
@@ -72,6 +73,9 @@ export type DatasetReader = {
72
73
  export type DatasetBuildResult = {
73
74
  datasetId: string;
74
75
  dataset: any;
76
+ /** the formal definition (intensional face), co-equal with the rows */
77
+ notation: DatasetNotation | null;
78
+ /** preview of the materialization (extensional face) */
75
79
  previewRows: any[];
76
80
  reader: DatasetReader;
77
81
  object?: any | null;
@@ -1,5 +1,6 @@
1
1
  import Ajv from "ajv";
2
2
  import { getDatasetOutputPath } from "./datasetFiles.js";
3
+ import { annotateNotationEvidence } from "./notation.js";
3
4
  import { DatasetService } from "./service.js";
4
5
  import { getDatasetRuntimeDb } from "./dataset/steps.js";
5
6
  import { readDatasetSandboxFileStep, readDatasetSandboxTextFileStep, runDatasetSandboxCommandStep, } from "./sandbox/steps.js";
@@ -176,6 +177,19 @@ export async function persistDatasetStep({ runtime, datasetId, sandboxId, summar
176
177
  }
177
178
  console.log(`[Dataset ${datasetId}] Dataset marked as COMPLETED (${totalValidRows} valid rows)`);
178
179
  console.log(`[Dataset ${datasetId}] ========================================`);
180
+ // Formal-notation evidence: advisory arithmetic annotation of the latest
181
+ // notation against the produced rows. Informative only — it never
182
+ // affects the dataset completion result or the dataset's validity.
183
+ try {
184
+ await annotateNotationFromJsonl({
185
+ service,
186
+ datasetId,
187
+ jsonlBase64: fileRead.contentBase64,
188
+ });
189
+ }
190
+ catch (error) {
191
+ console.error(`[Dataset ${datasetId}] notation annotation skipped:`, error instanceof Error ? error.message : String(error));
192
+ }
179
193
  return {
180
194
  success: true,
181
195
  status: "completed",
@@ -187,6 +201,42 @@ export async function persistDatasetStep({ runtime, datasetId, sandboxId, summar
187
201
  dataFileId: uploadResult.data.fileId,
188
202
  };
189
203
  }
204
+ const NOTATION_EVIDENCE_MAX_ROWS = 50000;
205
+ async function annotateNotationFromJsonl(params) {
206
+ const existing = await params.service.getDatasetById(params.datasetId);
207
+ const notation = (existing.ok ? existing.data?.notation : null);
208
+ if (!notation || !Array.isArray(notation.predicates) || notation.predicates.length === 0) {
209
+ return;
210
+ }
211
+ const rows = [];
212
+ const content = Buffer.from(params.jsonlBase64, "base64").toString("utf-8");
213
+ for (const line of content.split("\n")) {
214
+ const trimmed = line.trim();
215
+ if (!trimmed)
216
+ continue;
217
+ try {
218
+ const parsed = JSON.parse(trimmed);
219
+ if (parsed && parsed.type === "row") {
220
+ rows.push(parsed.data);
221
+ }
222
+ }
223
+ catch {
224
+ // malformed lines were already handled by schema validation
225
+ }
226
+ if (rows.length >= NOTATION_EVIDENCE_MAX_ROWS)
227
+ break;
228
+ }
229
+ const annotated = annotateNotationEvidence(notation, rows);
230
+ await params.service.updateDatasetNotation({
231
+ datasetId: params.datasetId,
232
+ notation: annotated,
233
+ });
234
+ const contradicted = (annotated.checks ?? []).filter((check) => check.status === "contradicted");
235
+ console.log(`[Dataset ${params.datasetId}] notation v${annotated.version} (${annotated.status})` +
236
+ (contradicted.length
237
+ ? ` — ${contradicted.length} predicado(s) con evidencia contraria (advisory)`
238
+ : ""));
239
+ }
190
240
  function resolveExecutionStoragePath(outputPath, datasetId) {
191
241
  const normalized = String(outputPath ?? "").replace(/\\/g, "/");
192
242
  const marker = "/tmp/ekairos/contexts/";
@@ -0,0 +1,49 @@
1
+ interface DefineNotationToolParams {
2
+ datasetId: string;
3
+ runtime: any;
4
+ }
5
+ /**
6
+ * defineNotation — author or REFINE the formal DEFINITION of the dataset.
7
+ *
8
+ * A dataset has two co-equal faces: its formal definition (the notation —
9
+ * the proposition that defines the set, in LaTeX) and its materialization
10
+ * (the rows + the code that produces them). They sit at the SAME level: the
11
+ * definition is not a side note about the data, it IS the dataset stated
12
+ * intensionally. The same notation is the PLAN (you state it first and the
13
+ * materialization realizes it) and, finalized, the RESULT (it describes what
14
+ * you produced).
15
+ *
16
+ * Call it FIRST with the initial definition derived from the resources, and
17
+ * AGAIN whenever the analysis discovers new sets, variables, constraints or
18
+ * corrections — every call keeps the prior version in history. Mark the last
19
+ * call with final=true so the definition describes the produced dataset.
20
+ * Predicates may be formal/semantic (trusted); the few that are arithmetic
21
+ * MAY carry optional advisory evidence.
22
+ */
23
+ export declare function createDefineNotationTool({ datasetId, runtime }: DefineNotationToolParams): import("ai").Tool<{
24
+ latex: string;
25
+ symbols: {
26
+ name: string;
27
+ kind: "function" | "set" | "variable" | "constant" | "predicate";
28
+ description: string;
29
+ latex?: string | undefined;
30
+ }[];
31
+ predicates: {
32
+ id: string;
33
+ description: string;
34
+ latex: string;
35
+ checkJson?: string | undefined;
36
+ }[];
37
+ reason: string;
38
+ final?: boolean | undefined;
39
+ }, {
40
+ success: boolean;
41
+ error: string;
42
+ } | {
43
+ warning?: string | undefined;
44
+ success: boolean;
45
+ version: number;
46
+ status: import("./notation.js").DatasetNotationStatus;
47
+ error?: undefined;
48
+ }>;
49
+ export {};
@@ -0,0 +1,154 @@
1
+ import { tool } from "ai";
2
+ import { z } from "zod";
3
+ import { DatasetService } from "./service.js";
4
+ import { datasetDomain } from "./schema.js";
5
+ import { reviseDatasetNotation, } from "./notation.js";
6
// Help text shown to the model for each symbol field.
const SYMBOL_NAME_HELP = "Plain identifier, e.g. 'D', 'Orders', 'w'";
const SYMBOL_LATEX_HELP = "LaTeX for the symbol, e.g. '\\\\mathcal{D}' (defaults to the name)";
const SYMBOL_DESCRIPTION_HELP = "What this symbol denotes in the data";
/** One symbol bound by the definition: a set, variable, function, constant or predicate. */
const symbolSchema = z.object({
    name: z.string().describe(SYMBOL_NAME_HELP),
    latex: z.string().optional().describe(SYMBOL_LATEX_HELP),
    kind: z.enum(["set", "variable", "function", "constant", "predicate"]),
    description: z.string().describe(SYMBOL_DESCRIPTION_HELP),
});
// Description of the optional checkJson field, listing the accepted check
// shapes; the fragments are joined with single spaces into one help string.
const CHECK_JSON_HELP_PARTS = [
    "OPTIONAL arithmetic form of the claim as a JSON string, used only for",
    "advisory evidence over the produced rows (not a verdict). Shapes:",
    '{"kind":"row_count","op":"=","value":124}',
    '{"kind":"field_type","field":"amount","type":"number","allowNull":true}',
    '{"kind":"field_range","field":"amount","min":0}',
    '{"kind":"field_in","field":"status","values":["paid","void"]}',
    '{"kind":"field_nonnull","field":"orderId"}',
    '{"kind":"field_matches","field":"sku","pattern":"^[A-Z0-9-]+$"}',
    '{"kind":"unique","fields":["orderId"]}',
    '{"kind":"aggregate","fn":"sum","field":"amount","op":">=","value":0}',
    'Propositional composition: {"kind":"and"|"or","checks":[...]},',
    '{"kind":"not","check":...}, {"kind":"implies","if":...,"then":...}.',
    "Fields support dot-paths into nested records (company.taxId).",
    "Omit for formal/semantic claims (the normal case) — they are trusted.",
];
const CHECK_JSON_HELP = CHECK_JSON_HELP_PARTS.join(" ");
/** One claim the set satisfies, in plain language and LaTeX, with an optional checkJson. */
const predicateSchema = z.object({
    id: z.string().describe("Stable id, e.g. 'p1', 'cardinality'"),
    description: z.string().describe("The claim in plain language"),
    latex: z.string().describe("The claim in LaTeX, e.g. '\\\\forall r \\\\in D: r.amount > 0'"),
    checkJson: z.string().optional().describe(CHECK_JSON_HELP),
});
41
/**
 * Build a DatasetService over the `db` exposed by the runtime once it is
 * scoped to the dataset domain.
 */
async function getDatasetService(runtime) {
    const { db } = await runtime.use(datasetDomain);
    return new DatasetService(db);
}
45
+ /**
46
+ * defineNotation — author or REFINE the formal DEFINITION of the dataset.
47
+ *
48
+ * A dataset has two co-equal faces: its formal definition (the notation —
49
+ * the proposition that defines the set, in LaTeX) and its materialization
50
+ * (the rows + the code that produces them). They sit at the SAME level: the
51
+ * definition is not a side note about the data, it IS the dataset stated
52
+ * intensionally. The same notation is the PLAN (you state it first and the
53
+ * materialization realizes it) and, finalized, the RESULT (it describes what
54
+ * you produced).
55
+ *
56
+ * Call it FIRST with the initial definition derived from the resources, and
57
+ * AGAIN whenever the analysis discovers new sets, variables, constraints or
58
+ * corrections — every call keeps the prior version in history. Mark the last
59
+ * call with final=true so the definition describes the produced dataset.
60
+ * Predicates may be formal/semantic (trusted); the few that are arithmetic
61
+ * MAY carry optional advisory evidence.
62
+ */
63
+ export function createDefineNotationTool({ datasetId, runtime }) {
64
+ return tool({
65
+ description: [
66
+ "Author or refine the formal DEFINITION of the dataset: the dataset as a",
67
+ "set in LaTeX (set-builder, relational algebra, quantified or even",
68
+ "semantic predicates) plus the symbols it binds. This definition and the",
69
+ "materialization (rows + code) are TWO CO-EQUAL FACES of the dataset —",
70
+ "the definition is the dataset stated intensionally, not a comment on it.",
71
+ "It is your PLAN (state it before writing any code; the materialization",
72
+ "realizes it) and, once final, the RESULT (it describes what you",
73
+ "produced). The definition is a logical proposition, possibly derived —",
74
+ "it need not be mechanically provable; we trust the formality. State it",
75
+ "first, refine it on every discovery, and set final=true on the last",
76
+ "call. For the few predicates that are arithmetic you MAY attach a",
77
+ "checkJson for optional advisory evidence (non-blocking, never a verdict).",
78
+ ].join(" "),
79
+ inputSchema: z.object({
80
+ latex: z
81
+ .string()
82
+ .describe("Main definition of the dataset as a set, in LaTeX. Example: 'D = \\\\{(w,r,t) \\\\mid t = \\\\sum_{o \\\\in Orders} o.amount,\\\\; o.status = paid\\\\}'"),
83
+ symbols: z.array(symbolSchema).describe("Symbols bound by the definition"),
84
+ predicates: z
85
+ .array(predicateSchema)
86
+ .describe("Claims the set satisfies; attach a checkJson only when arithmetic"),
87
+ reason: z
88
+ .string()
89
+ .describe("What this revision states or what discovery triggered it (or 'initial definition')"),
90
+ final: z
91
+ .boolean()
92
+ .optional()
93
+ .describe("true when this definition describes the dataset you are about to complete (the RESULT)"),
94
+ }),
95
+ execute: async ({ latex, symbols, predicates, reason, final }) => {
96
+ try {
97
+ const service = await getDatasetService(runtime);
98
+ const existing = await service.getDatasetById(datasetId);
99
+ const previous = (existing.ok ? existing.data?.notation : null);
100
+ const parsedPredicates = [];
101
+ const checkErrors = [];
102
+ for (const predicate of predicates) {
103
+ let check;
104
+ if (predicate.checkJson) {
105
+ try {
106
+ check = JSON.parse(predicate.checkJson);
107
+ if (!check || typeof check !== "object" || !("kind" in check)) {
108
+ throw new Error("check must be an object with a 'kind'");
109
+ }
110
+ }
111
+ catch (error) {
112
+ checkErrors.push(`predicate ${predicate.id}: invalid checkJson (${String(error).slice(0, 80)})`);
113
+ check = undefined;
114
+ }
115
+ }
116
+ parsedPredicates.push({
117
+ id: predicate.id,
118
+ description: predicate.description,
119
+ latex: predicate.latex,
120
+ ...(check ? { check } : {}),
121
+ });
122
+ }
123
+ const notation = reviseDatasetNotation(previous, {
124
+ latex,
125
+ symbols: symbols,
126
+ predicates: parsedPredicates,
127
+ reason,
128
+ final,
129
+ });
130
+ const update = await service.updateDatasetNotation({ datasetId, notation });
131
+ if (!update.ok) {
132
+ return { success: false, error: update.error };
133
+ }
134
+ console.log(`[Dataset ${datasetId}] definition v${notation.version} (${notation.status}): ${reason}`);
135
+ return {
136
+ success: true,
137
+ version: notation.version,
138
+ status: notation.status,
139
+ ...(checkErrors.length
140
+ ? {
141
+ warning: `some checks were dropped: ${checkErrors.join("; ")}`,
142
+ }
143
+ : {}),
144
+ };
145
+ }
146
+ catch (error) {
147
+ return {
148
+ success: false,
149
+ error: error instanceof Error ? error.message : String(error),
150
+ };
151
+ }
152
+ },
153
+ });
154
+ }
@@ -3,6 +3,7 @@ import { createClearDatasetTool } from "../clearDataset.tool.js";
3
3
  import { createCompleteDatasetTool, didCompleteDatasetSucceed, getDatasetFatalFailure, } from "../completeDataset.tool.js";
4
4
  import { datasetGetByIdStep } from "../dataset/steps.js";
5
5
  import { createExecuteCommandTool } from "../executeCommand.tool.js";
6
+ import { createDefineNotationTool } from "../defineNotation.tool.js";
6
7
  import { createGenerateSchemaTool } from "./generateSchema.tool.js";
7
8
  import { buildFileDatasetPromptStep, initializeFileParseSandboxStep, } from "./file-dataset.steps.js";
8
9
  import { createDatasetId } from "../id.js";
@@ -151,6 +152,10 @@ function createFileParseContextDefinition(params) {
151
152
  sandboxId,
152
153
  runtime,
153
154
  }),
155
+ defineNotation: createDefineNotationTool({
156
+ datasetId,
157
+ runtime,
158
+ }),
154
159
  };
155
160
  if (!existingSchema) {
156
161
  actions.generateSchema = createGenerateSchemaTool({
@@ -238,9 +238,21 @@ function buildInstructions(context) {
238
238
  .ele("Action").txt("Review the FilePreview section in Context to understand the file structure").up()
239
239
  .ele("Note").txt("FilePreview contains: TotalRows (total data rows), Metadata (file properties with JSON output), Head (first N raw file lines), Tail (last N lines if present), Mid (middle sample for large files). Each section shows Description, Script (full Python code), Command, Stdout (raw content), Stderr. This allows you to understand the exact file format.").up()
240
240
  .up();
241
+ xml = xml
242
+ .ele("Step", { number: "2", name: "Define the Dataset (PLAN FIRST)" })
243
+ .ele("Action").txt("Call defineNotation with the INITIAL formal definition of the dataset as a set, derived from the file preview: D = { r | r ∈ File ∧ <constraints> } in LaTeX, the symbols it binds (sets, variables, functions) and the predicates the set satisfies").up()
244
+ .ele("Requirements")
245
+ .ele("Requirement").txt("The definition and the materialization (schema + parsing code + rows) are TWO CO-EQUAL FACES of the dataset. The definition is the dataset stated intensionally — author it FIRST; it is your PLAN and the code is built to realize it").up()
246
+ .ele("Requirement").txt("Use set-builder notation, quantifiers and arithmetic in LaTeX (e.g. D = \\{(c, q, p) \\mid q \\in \\mathbb{Z}^{+},\\; p \\in \\mathbb{R}_{\\geq 0}\\})").up()
247
+ .ele("Requirement").txt("Declare every discovered set and variable as a symbol with a one-line meaning").up()
248
+ .ele("Requirement").txt("Predicates are formal claims we trust; they may be semantic (e.g. 'x es una frase relevante'). Only for the few that are purely arithmetic (row counts, field types, ranges, uniqueness, aggregates) you MAY add a checkJson for optional advisory evidence — leave every other claim without checkJson").up()
249
+ .ele("Requirement").txt("REFINE: every time the analysis discovers a new set, variable, constraint or correction (new columns, unexpected types, excluded sections), call defineNotation again with the updated definition and the reason. The definition is not fixed up front — discovery is the point").up()
250
+ .ele("Requirement").txt("Before calling completeDataset, call defineNotation one last time with final=true so the definition becomes the RESULT — it describes EXACTLY the dataset you produced; any arithmetic predicates get optional advisory evidence afterwards (never a pass/fail verdict — the dataset's validity is trusted)").up()
251
+ .up()
252
+ .up();
241
253
  if (hasProvidedSchema) {
242
254
  xml = xml
243
- .ele("Step", { number: "2", name: "Use Provided Schema" })
255
+ .ele("Step", { number: "3", name: "Use Provided Schema" })
244
256
  .ele("Action").txt("Use the provided schema as the output contract for every row in output.jsonl").up()
245
257
  .ele("Requirements")
246
258
  .ele("Requirement").txt("Every output row must conform exactly to the provided schema").up()
@@ -255,7 +267,7 @@ function buildInstructions(context) {
255
267
  }
256
268
  else {
257
269
  xml = xml
258
- .ele("Step", { number: "2", name: "Generate JSON Schema" })
270
+ .ele("Step", { number: "3", name: "Generate JSON Schema" })
259
271
  .ele("Action").txt("Call generateSchema to create a JSON Schema for a SINGLE DATA RECORD (one row of data)").up()
260
272
  .ele("Requirements")
261
273
  .ele("Requirement").txt("Schema describes ONE DATA RECORD structure only (type: object, not array)").up()
@@ -267,7 +279,7 @@ function buildInstructions(context) {
267
279
  .up();
268
280
  }
269
281
  xml = xml
270
- .ele("Step", { number: "3", name: "Generate Dataset JSONL" })
282
+ .ele("Step", { number: "4", name: "Generate Dataset JSONL" })
271
283
  .ele("Action").txt(`Use executeCommand to parse the file and generate output.jsonl in the dataset workstation`).up()
272
284
  .ele("Requirements")
273
285
  .ele("Requirement").txt("Parse ALL data rows/records from the file (exclude header sections and metadata)").up()
@@ -279,12 +291,13 @@ function buildInstructions(context) {
279
291
  .ele("Requirement").txt("Use descriptive scriptName in snake_case (e.g., 'parse_csv_to_jsonl')").up()
280
292
  .up()
281
293
  .up()
282
- .ele("Step", { number: "4", name: "Complete and Validate" })
283
- .ele("Action").txt("Call completeDataset to validate the dataset").up()
294
+ .ele("Step", { number: "5", name: "Complete and Validate" })
295
+ .ele("Action").txt("Call defineNotation with final=true (the definition as RESULT, matching the produced rows), then call completeDataset to validate the dataset").up()
284
296
  .ele("Behavior").txt("Validates that output.jsonl exists and all records conform to the schema stored in database. Returns success:false with validation details if validation fails. If validation fails, inspect validation errors, rewrite output.jsonl, and call completeDataset again. Do not stop until completeDataset returns success:true.").up()
285
297
  .up()
286
298
  .up()
287
299
  .ele("Rules")
300
+ .ele("Rule").txt("The formal definition (defineNotation) and the materialization (schema + code + rows) are co-equal faces of the dataset: author the definition first as the PLAN, refine it on every discovery, finalize it as the RESULT before completion").up()
288
301
  .ele("Rule").txt("Schema defines ONE DATA RECORD structure (not array, not header)").up()
289
302
  .ele("Rule").txt("Schema property names are authoritative. Never translate or rename keys such as itemName, quantity, or unit into the input language").up()
290
303
  .ele("Rule").txt("Original/input language applies to extracted values only, not to JSON object keys").up()
package/dist/index.d.ts CHANGED
@@ -2,6 +2,7 @@ export * from "./dataset.js";
2
2
  export * from "./contextWorkspace.js";
3
3
  export * from "./domain.js";
4
4
  export * from "./materializeDataset.tool.js";
5
+ export * from "./notation.js";
5
6
  export * from "./schema.js";
6
7
  export * from "./service.js";
7
8
  export { registerFileParseContext } from "./file/file-dataset.agent.js";
package/dist/index.js CHANGED
@@ -2,6 +2,7 @@ export * from "./dataset.js";
2
2
  export * from "./contextWorkspace.js";
3
3
  export * from "./domain.js";
4
4
  export * from "./materializeDataset.tool.js";
5
+ export * from "./notation.js";
5
6
  export * from "./schema.js";
6
7
  export * from "./service.js";
7
8
  export { registerFileParseContext } from "./file/file-dataset.agent.js";
@@ -18,8 +18,8 @@ declare const materializeDatasetToolInputSchema: z.ZodObject<{
18
18
  }, z.core.$strip>>>;
19
19
  texts: z.ZodOptional<z.ZodArray<z.ZodObject<{
20
20
  name: z.ZodOptional<z.ZodString>;
21
- text: z.ZodString;
22
21
  description: z.ZodOptional<z.ZodString>;
22
+ text: z.ZodString;
23
23
  mimeType: z.ZodOptional<z.ZodString>;
24
24
  }, z.core.$strip>>>;
25
25
  datasets: z.ZodOptional<z.ZodArray<z.ZodObject<{
@@ -0,0 +1,205 @@
1
+ /**
2
+ * Formal notation for datasets — the dataset stated intensionally.
3
+ *
4
+ * A dataset has TWO CO-EQUAL FACES at the same level:
5
+ * - its formal DEFINITION (this notation: the proposition that defines the
6
+ * set, in LaTeX), and
7
+ * - its MATERIALIZATION (the rows + the code that produces them).
8
+ * The notation is not a comment about the data; it IS the dataset, written
9
+ * as a logical statement. The materialization is the same set written
10
+ * extensionally. Neither is subordinate to the other.
11
+ *
12
+ * The SAME notation plays two roles across the lifecycle: it is the PLAN
13
+ * (status "plan": stated first, the materialization is built to realize it)
14
+ * and, once finalized, the RESULT (status "result": it describes exactly
15
+ * what was produced). It is iterated in between — every revision keeps the
16
+ * prior version in `history`, so the discovery trail stays visible.
17
+ *
18
+ * The definition is a logical proposition, possibly DERIVED (a syllogism),
19
+ * so it is NOT, in general, mechanically verifiable: a predicate may be
20
+ * semantic ("x es una frase divertida") and the set is still well-formed.
21
+ * We TRUST the formality and the produced dataset — there is no verdict.
22
+ *
23
+ * SOME predicates happen to be arithmetic (a row count, a field type, a
24
+ * preserved total). For those, and only those, we attach OPTIONAL evidence
25
+ * computed over the rows. It is advisory: a contradiction is a hint, never
26
+ * a claim that the dataset is invalid. Predicates with no arithmetic form
27
+ * are "asserted" — trusted. Nothing here blocks or changes a build; the
28
+ * notation rides on dataset_datasets.notation.
29
+ */
30
+ export type DatasetNotationSymbolKind = "set" | "variable" | "function" | "constant" | "predicate";
31
+ export type DatasetNotationSymbol = {
32
+ /** plain identifier, e.g. "D", "w", "Orders" */
33
+ name: string;
34
+ /** LaTeX for the symbol, e.g. "\\mathcal{D}" (defaults to the name) */
35
+ latex?: string;
36
+ kind: DatasetNotationSymbolKind;
37
+ description: string;
38
+ };
39
+ export type NotationCmpOp = "=" | "!=" | "<" | "<=" | ">" | ">=";
40
+ /**
41
+ * OPTIONAL arithmetic evidence for the subset of predicates that happen to
42
+ * be mechanical (counts, types, ranges, totals). Evaluated over the rows;
43
+ * field access supports dot-paths into nested records ("company.taxId").
44
+ * Leaf checks are dataset-level propositions; and/or/not/implies compose
45
+ * them propositionally. A predicate WITHOUT a check is a formal/semantic
46
+ * claim we trust — that is the normal case, not an exception.
47
+ */
48
+ export type NotationCheck = {
49
+ kind: "row_count";
50
+ op: NotationCmpOp;
51
+ value: number;
52
+ } | {
53
+ kind: "field_type";
54
+ field: string;
55
+ type: "number" | "integer" | "string" | "boolean";
56
+ allowNull?: boolean;
57
+ } | {
58
+ kind: "field_range";
59
+ field: string;
60
+ min?: number;
61
+ max?: number;
62
+ } | {
63
+ kind: "field_in";
64
+ field: string;
65
+ values: Array<string | number | boolean>;
66
+ } | {
67
+ kind: "field_nonnull";
68
+ field: string;
69
+ } | {
70
+ kind: "field_matches";
71
+ field: string;
72
+ pattern: string;
73
+ } | {
74
+ kind: "unique";
75
+ fields: string[];
76
+ } | {
77
+ kind: "aggregate";
78
+ fn: "sum" | "count" | "min" | "max" | "avg";
79
+ /** omit for fn = "count" */
80
+ field?: string;
81
+ op: NotationCmpOp;
82
+ value: number;
83
+ /** absolute tolerance for float comparison (default 1e-9) */
84
+ tolerance?: number;
85
+ } | {
86
+ kind: "and";
87
+ checks: NotationCheck[];
88
+ } | {
89
+ kind: "or";
90
+ checks: NotationCheck[];
91
+ } | {
92
+ kind: "not";
93
+ check: NotationCheck;
94
+ } | {
95
+ kind: "implies";
96
+ if: NotationCheck;
97
+ then: NotationCheck;
98
+ };
99
+ export type DatasetNotationPredicate = {
100
+ /** stable id within the notation, e.g. "p1", "rowCount" */
101
+ id: string;
102
+ description: string;
103
+ /** the claim in LaTeX, e.g. "\\forall r \\in D:\\; r.amount > 0" */
104
+ latex: string;
105
+ /**
106
+ * OPTIONAL arithmetic form. Absent (the common case) = a formal/semantic
107
+ * claim we trust without mechanical checking.
108
+ */
109
+ check?: NotationCheck;
110
+ };
111
+ /**
112
+ * Advisory evidence for one predicate. Never a verdict on the dataset:
113
+ * - "asserted" formal/semantic claim, trusted, no mechanical check
114
+ * - "supported" arithmetic evidence agrees with the stated claim
115
+ * - "contradicted" arithmetic evidence disagrees — a hint, not a failure
116
+ */
117
+ export type DatasetNotationCheckResult = {
118
+ predicateId: string;
119
+ status: "asserted" | "supported" | "contradicted";
120
+ detail?: string;
121
+ };
122
+ export type DatasetNotationRevision = {
123
+ version: number;
124
+ latex: string;
125
+ /** why this revision happened — the discovery that triggered it */
126
+ reason: string;
127
+ at: number;
128
+ };
129
+ /**
130
+ * The role the notation currently plays — the two ends of its life:
131
+ * - "plan": stated before/while building; the materialization realizes it
132
+ * - "result": finalized; it describes the dataset that was produced
133
+ * There is intentionally NO "verified"/"violated" verdict — validity is
134
+ * trusted, not proven. Iteration is tracked by `version`/`history`; advisory
135
+ * arithmetic evidence lives in `checks`, separate from this role.
136
+ */
137
+ export type DatasetNotationStatus = "plan" | "result";
138
+ export type DatasetNotation = {
139
+ version: number;
140
+ status: DatasetNotationStatus;
141
+ /** the main definition: the dataset as a set, in LaTeX */
142
+ latex: string;
143
+ symbols: DatasetNotationSymbol[];
144
+ predicates: DatasetNotationPredicate[];
145
+ /** advisory per-predicate evidence (asserted/supported/contradicted) */
146
+ checks?: DatasetNotationCheckResult[];
147
+ /** when the advisory evidence was last computed */
148
+ evidenceAt?: number;
149
+ history: DatasetNotationRevision[];
150
+ };
151
+ export type NotationRevisionInput = {
152
+ latex: string;
153
+ symbols?: DatasetNotationSymbol[];
154
+ predicates?: DatasetNotationPredicate[];
155
+ reason: string;
156
+ /** "final" marks the notation as describing the produced dataset */
157
+ final?: boolean;
158
+ };
159
+ /**
160
+ * Iterate the notation: every revision bumps the version and appends to
161
+ * history, so the discovery trail (sets/variables found along the way)
162
+ * stays visible.
163
+ */
164
+ export declare function reviseDatasetNotation(previous: DatasetNotation | null | undefined, input: NotationRevisionInput): DatasetNotation;
165
+ /** escape an identifier for use inside \text{} */
166
+ export declare function latexIdentifier(name: string): string;
167
+ type JsonSchemaLike = {
168
+ title?: string;
169
+ schema?: Record<string, any>;
170
+ properties?: Record<string, any>;
171
+ };
172
+ /**
173
+ * A query-backed dataset has a complete deterministic description: the
174
+ * dataset is the image of a known query over a known domain. No model is
175
+ * involved, so here the formal definition and its predicates derive
176
+ * mechanically from the query, the inferred schema and the row count — and
177
+ * those predicates DO carry arithmetic evidence (the special case where the
178
+ * formal claims happen to be fully mechanical).
179
+ */
180
+ export declare function inferQueryNotation(params: {
181
+ entityNames: string[];
182
+ rowCount: number;
183
+ schema?: JsonSchemaLike | null;
184
+ explanation?: string;
185
+ }): DatasetNotation;
186
+ type CheckOutcome = {
187
+ ok: boolean;
188
+ detail: string;
189
+ };
190
+ export declare function evaluateNotationCheck(rows: any[], check: NotationCheck): CheckOutcome;
191
+ /**
192
+ * Annotate a notation with ADVISORY arithmetic evidence over the produced
193
+ * rows. Never throws, never blocks, and never changes the notation's
194
+ * lifecycle status — the dataset's validity is trusted, not proven here.
195
+ *
196
+ * Each predicate is reported as:
197
+ * - "asserted" no arithmetic form (formal/semantic claim, trusted)
198
+ * - "supported" arithmetic evidence agrees
199
+ * - "contradicted" arithmetic evidence disagrees (a hint to look, not a
200
+ * verdict that the dataset is wrong)
201
+ * A check that can't be evaluated stays "asserted" — we don't downgrade a
202
+ * trusted claim because of a malformed mechanical form.
203
+ */
204
+ export declare function annotateNotationEvidence(notation: DatasetNotation, rows: any[]): DatasetNotation;
205
+ export {};
@@ -0,0 +1,424 @@
1
+ /**
2
+ * Formal notation for datasets — the dataset stated intensionally.
3
+ *
4
+ * A dataset has TWO CO-EQUAL FACES at the same level:
5
+ * - its formal DEFINITION (this notation: the proposition that defines the
6
+ * set, in LaTeX), and
7
+ * - its MATERIALIZATION (the rows + the code that produces them).
8
+ * The notation is not a comment about the data; it IS the dataset, written
9
+ * as a logical statement. The materialization is the same set written
10
+ * extensionally. Neither is subordinate to the other.
11
+ *
12
+ * The SAME notation plays two roles across the lifecycle: it is the PLAN
13
+ * (status "plan": stated first, the materialization is built to realize it)
14
+ * and, once finalized, the RESULT (status "result": it describes exactly
15
+ * what was produced). It is iterated in between — every revision keeps the
16
+ * prior version in `history`, so the discovery trail stays visible.
17
+ *
18
+ * The definition is a logical proposition, possibly DERIVED (a syllogism),
19
+ * so it is NOT, in general, mechanically verifiable: a predicate may be
20
+ * semantic ("x es una frase divertida") and the set is still well-formed.
21
+ * We TRUST the formality and the produced dataset — there is no verdict.
22
+ *
23
+ * SOME predicates happen to be arithmetic (a row count, a field type, a
24
+ * preserved total). For those, and only those, we attach OPTIONAL evidence
25
+ * computed over the rows. It is advisory: a contradiction is a hint, never
26
+ * a claim that the dataset is invalid. Predicates with no arithmetic form
27
+ * are "asserted" — trusted. Nothing here blocks or changes a build; the
28
+ * notation rides on dataset_datasets.notation.
29
+ */
30
/**
 * Produce the next revision of a dataset notation.
 *
 * The new version is the previous version plus one (1 when there is no
 * previous notation). A `{ version, latex, reason, at }` entry is appended to
 * a copy of the previous history, so earlier revisions stay visible.
 * Symbols and predicates fall back to the previous notation's values when the
 * input omits them, and to empty lists when neither side provides them.
 *
 * @param previous notation being revised, or null/undefined for the first one
 * @param input    the revised LaTeX, the reason for the revision, optional
 *                 symbols/predicates, and `final` to mark the result
 * @returns a new notation object; `status` is "result" when `input.final` is
 *          truthy and "plan" otherwise
 */
export function reviseDatasetNotation(previous, input) {
    const nextVersion = (previous?.version ?? 0) + 1;
    const priorHistory = previous?.history ?? [];
    const entry = {
        version: nextVersion,
        latex: input.latex,
        reason: input.reason,
        at: Date.now(),
    };
    const symbols = input.symbols ?? previous?.symbols ?? [];
    const predicates = input.predicates ?? previous?.predicates ?? [];
    return {
        version: nextVersion,
        status: input.final ? "result" : "plan",
        latex: input.latex,
        symbols,
        predicates,
        history: [...priorHistory, entry],
    };
}
52
+ /* ── LaTeX helpers ──────────────────────────────────────────────── */
53
/**
 * Wrap an identifier in `\text{...}`, escaping every character that is
 * special in LaTeX text mode so the name renders literally.
 *
 * `# $ % & _ { }` are escaped with a leading backslash. A backslash, caret
 * or tilde cannot be escaped that way (`\\` is a line break, `\^` and `\~`
 * are accent commands), so they are emitted as `\textbackslash{}`,
 * `\textasciicircum{}` and `\textasciitilde{}`. Everything is done in a single
 * pass so inserted backslashes are never escaped again.
 *
 * @param name identifier to render; coerced with `String()`
 * @returns the LaTeX source `\text{<escaped name>}`
 */
export function latexIdentifier(name) {
    const escaped = String(name).replace(/[\\^~#$%&_{}]/g, (char) => {
        if (char === "\\")
            return "\\textbackslash{}";
        if (char === "^")
            return "\\textasciicircum{}";
        if (char === "~")
            return "\\textasciitilde{}";
        return `\\${char}`;
    });
    return `\\text{${escaped}}`;
}
57
/**
 * Map a schema type name to the LaTeX of its value domain: reals for
 * "number", integers for "integer", {top, bottom} for "boolean", and the set
 * of strings for anything else.
 */
function latexFieldType(type) {
    switch (type) {
        case "number":
            return "\\mathbb{R}";
        case "integer":
            return "\\mathbb{Z}";
        case "boolean":
            return "\\{\\top,\\bot\\}";
        default:
            return "\\Sigma^{*}";
    }
}
66
/** Top-level keys treated as JSON Schema vocabulary rather than field names. */
const JSON_SCHEMA_KEYWORDS = new Set([
    "type",
    "title",
    "description",
    "required",
    "items",
    "additionalProperties",
]);
/**
 * Extract the field -> type description map from a schema-like value.
 *
 * The schema may be wrapped (`{ schema: {...} }`) or given directly. When it
 * has an object-valued `properties`, that object is returned as is. Otherwise
 * the value is read as a flat `{ fieldName: "type" | { type: "..." } }` map:
 * schema keywords are skipped, and only entries whose value is a string or an
 * object with a string `type` are kept.
 *
 * @param schema schema-like value; null/undefined yields an empty map
 */
function schemaProperties(schema) {
    const root = (schema?.schema ?? schema ?? {});
    const declared = root.properties;
    if (declared && typeof declared === "object") {
        return declared;
    }
    // flat shape from query inference: { fieldName: "type", ... }
    const fields = {};
    Object.entries(root).forEach(([name, spec]) => {
        if (JSON_SCHEMA_KEYWORDS.has(name)) {
            return;
        }
        const isTypeName = typeof spec === "string";
        const isTypedObject = Boolean(spec) && typeof spec === "object" && typeof spec.type === "string";
        if (isTypeName || isTypedObject) {
            fields[name] = spec;
        }
    });
    return fields;
}
92
/**
 * Build the notation of a query-backed dataset mechanically from the query's
 * entity names, the inferred schema and the row count.
 *
 * The set is written as the image of the query `Q` over the union of the
 * source entities (a placeholder "Domain" source is used when no entity names
 * are given). The predicates are:
 *  - `cardinality`: the dataset has exactly `rowCount` rows;
 *  - `type_<field>`: one per schema field whose type is number, integer,
 *    boolean or string, stating the field has that type or is null.
 * Every predicate carries a `check`, so arithmetic evidence can be computed
 * for it. The notation is created as a first revision marked final.
 *
 * @param params.entityNames names of the queried entities
 * @param params.rowCount    number of rows the query produced
 * @param params.schema      inferred schema (wrapped, JSON-schema-like or flat)
 * @param params.explanation optional text used as the description of D
 */
export function inferQueryNotation(params) {
    const sources = params.entityNames.length ? params.entityNames : ["Domain"];
    const sourceSymbols = sources.map((name) => ({
        name,
        latex: latexIdentifier(name),
        kind: "set",
        description: `Entidad de origen ${name}`,
    }));
    // reuse the already escaped LaTeX of each source instead of escaping twice
    const union = sourceSymbols.map((symbol) => symbol.latex).join(" \\cup ");
    const latex = `\\mathcal{D} = \\left\\{\\, r \\;\\middle|\\; r \\in Q\\!\\left(${union}\\right) \\right\\}`;
    const properties = schemaProperties(params.schema);
    const predicates = [
        {
            id: "cardinality",
            description: `El dataset tiene exactamente ${params.rowCount} filas`,
            latex: `|\\mathcal{D}| = ${params.rowCount}`,
            check: { kind: "row_count", op: "=", value: params.rowCount },
        },
    ];
    for (const [field, raw] of Object.entries(properties)) {
        // a property is either a bare type name or an object with a `type`
        const type = typeof raw === "string" ? raw : String(raw?.type ?? "");
        if (!["number", "integer", "boolean", "string"].includes(type))
            continue;
        predicates.push({
            id: `type_${field}`,
            description: `Toda fila tiene ${field} de tipo ${type} (o nulo)`,
            latex: `\\forall r \\in \\mathcal{D}:\\; r.${latexIdentifier(field)} \\in ${latexFieldType(type)} \\cup \\{\\varnothing\\}`,
            check: {
                kind: "field_type",
                field,
                type: type,
                allowNull: true,
            },
        });
    }
    return reviseDatasetNotation(null, {
        latex,
        symbols: [
            {
                name: "D",
                latex: "\\mathcal{D}",
                kind: "set",
                description: params.explanation?.trim() || "Dataset materializado",
            },
            {
                name: "Q",
                // Q appears in the formula, so it gets a LaTeX form like every other symbol
                latex: "Q",
                kind: "function",
                description: "Consulta InstaQL aplicada al dominio",
            },
            ...sourceSymbols,
        ],
        predicates,
        reason: "Notación determinística derivada de la consulta al dominio",
        final: true,
    });
}
156
+ /* ── arithmetic evaluation ──────────────────────────────────────── */
157
/**
 * Resolve a dot-separated path ("a.b.c") against a row.
 * Returns undefined as soon as an intermediate value is null or undefined.
 */
function readPath(row, path) {
    let current = row;
    for (const segment of String(path).split(".")) {
        if (current === null || current === undefined)
            return undefined;
        current = current[segment];
    }
    return current;
}
/**
 * Apply a comparison operator to two numbers.
 *
 * `tolerance` widens "=", "!=", "<=" and ">="; the strict "<" and ">" ignore
 * it.
 *
 * @throws {Error} when `op` is not a supported operator. Returning undefined
 *         here would be read as "contradicted" by callers and flipped into
 *         "supported" by a surrounding `not`, so a malformed operator is
 *         reported as an error instead.
 */
function compare(op, left, right, tolerance = 0) {
    switch (op) {
        case "=":
            return Math.abs(left - right) <= tolerance;
        case "!=":
            return Math.abs(left - right) > tolerance;
        case "<":
            return left < right;
        case "<=":
            return left <= right + tolerance;
        case ">":
            return left > right;
        case ">=":
            return left >= right - tolerance;
        default:
            throw new Error(`operador no soportado: ${String(op)}`);
    }
}
/**
 * Evaluate one arithmetic check over the rows.
 *
 * Supported kinds: row_count, field_type, field_range, field_in,
 * field_nonnull, field_matches, unique, aggregate, and the combinators
 * and / or / not / implies, which evaluate their inner checks over the same
 * rows.
 *
 * @param rows  the produced rows
 * @param check the check to evaluate
 * @returns `{ ok, detail }`, where `detail` is a short human-readable summary
 * @throws {Error} when the check itself is malformed: unknown kind, unknown
 *         comparison operator, unknown aggregate function, or an invalid
 *         regular expression. A malformed check has no truth value, so it is
 *         not reported as `ok: false`, which a surrounding `not` would turn
 *         into `ok: true`.
 */
export function evaluateNotationCheck(rows, check) {
    switch (check.kind) {
        case "row_count": {
            const ok = compare(check.op, rows.length, check.value);
            return { ok, detail: `|D| = ${rows.length} ${check.op} ${check.value}` };
        }
        case "field_type": {
            let failures = 0;
            let firstFailure = "";
            for (const row of rows) {
                const value = readPath(row, check.field);
                if (value === null || value === undefined) {
                    if (check.allowNull)
                        continue;
                    failures += 1;
                    if (!firstFailure)
                        firstFailure = "null";
                    continue;
                }
                const okValue = check.type === "number"
                    ? typeof value === "number" && Number.isFinite(value)
                    : check.type === "integer"
                        ? typeof value === "number" && Number.isInteger(value)
                        : check.type === "boolean"
                            ? typeof value === "boolean"
                            : typeof value === "string";
                if (!okValue) {
                    failures += 1;
                    // keep a short sample of the first offending value for the detail
                    if (!firstFailure)
                        firstFailure = JSON.stringify(value)?.slice(0, 40) ?? "?";
                }
            }
            return {
                ok: failures === 0,
                detail: failures === 0
                    ? `∀r: ${check.field} : ${check.type}`
                    : `${failures}/${rows.length} filas violan ${check.field} : ${check.type} (ej: ${firstFailure})`,
            };
        }
        case "field_range": {
            let failures = 0;
            for (const row of rows) {
                const value = readPath(row, check.field);
                // non-numeric values are outside the scope of a range claim
                if (typeof value !== "number" || Number.isNaN(value))
                    continue;
                if (check.min !== undefined && value < check.min)
                    failures += 1;
                else if (check.max !== undefined && value > check.max)
                    failures += 1;
            }
            const bounds = [
                check.min !== undefined ? `≥ ${check.min}` : "",
                check.max !== undefined ? `≤ ${check.max}` : "",
            ]
                .filter(Boolean)
                .join(" ∧ ");
            return {
                ok: failures === 0,
                detail: failures === 0
                    ? `∀r: ${check.field} ${bounds}`
                    : `${failures}/${rows.length} filas fuera de rango en ${check.field}`,
            };
        }
        case "field_in": {
            // compare by JSON form so non-primitive allowed values can match
            const allowed = new Set(check.values.map((value) => JSON.stringify(value)));
            let failures = 0;
            for (const row of rows) {
                const value = readPath(row, check.field);
                if (value === null || value === undefined)
                    continue;
                if (!allowed.has(JSON.stringify(value)))
                    failures += 1;
            }
            return {
                ok: failures === 0,
                detail: failures === 0
                    ? `∀r: ${check.field} ∈ {${check.values.join(", ")}}`
                    : `${failures}/${rows.length} filas con ${check.field} fuera del conjunto`,
            };
        }
        case "field_nonnull": {
            let failures = 0;
            for (const row of rows) {
                const value = readPath(row, check.field);
                if (value === null || value === undefined || value === "")
                    failures += 1;
            }
            return {
                ok: failures === 0,
                detail: failures === 0
                    ? `∀r: ${check.field} ≠ ∅`
                    : `${failures}/${rows.length} filas con ${check.field} vacío`,
            };
        }
        case "field_matches": {
            let regex;
            try {
                regex = new RegExp(check.pattern);
            }
            catch {
                // an invalid pattern is a malformed check, not a contradicted claim
                throw new Error(`patrón inválido: ${check.pattern}`);
            }
            let failures = 0;
            for (const row of rows) {
                const value = readPath(row, check.field);
                if (typeof value !== "string")
                    continue;
                if (!regex.test(value))
                    failures += 1;
            }
            return {
                ok: failures === 0,
                detail: failures === 0
                    ? `∀r: ${check.field} ~ /${check.pattern}/`
                    : `${failures}/${rows.length} filas no matchean /${check.pattern}/`,
            };
        }
        case "unique": {
            const seen = new Set();
            let duplicates = 0;
            for (const row of rows) {
                const key = JSON.stringify(check.fields.map((field) => readPath(row, field)));
                if (seen.has(key))
                    duplicates += 1;
                else
                    seen.add(key);
            }
            return {
                ok: duplicates === 0,
                detail: duplicates === 0
                    ? `(${check.fields.join(", ")}) es clave`
                    : `${duplicates} duplicados sobre (${check.fields.join(", ")})`,
            };
        }
        case "aggregate": {
            // only finite numbers take part in the aggregate
            const values = [];
            for (const row of rows) {
                if (check.fn === "count" && !check.field)
                    continue;
                const value = readPath(row, String(check.field));
                if (typeof value === "number" && Number.isFinite(value))
                    values.push(value);
            }
            let actual;
            switch (check.fn) {
                case "count":
                    actual = check.field ? values.length : rows.length;
                    break;
                case "sum":
                    actual = values.reduce((total, value) => total + value, 0);
                    break;
                case "min":
                    // folded pairwise: spreading a large array into Math.min can
                    // exceed the engine's argument limit and throw RangeError
                    actual = values.length
                        ? values.reduce((lowest, value) => Math.min(lowest, value))
                        : Number.NaN;
                    break;
                case "max":
                    actual = values.length
                        ? values.reduce((highest, value) => Math.max(highest, value))
                        : Number.NaN;
                    break;
                case "avg":
                    actual = values.length
                        ? values.reduce((total, value) => total + value, 0) / values.length
                        : Number.NaN;
                    break;
                default:
                    throw new Error(`función de agregación no soportada: ${String(check.fn)}`);
            }
            const tolerance = check.tolerance ?? 1e-9;
            // an aggregate over no values (NaN) never supports the claim
            const ok = Number.isFinite(actual) && compare(check.op, actual, check.value, tolerance);
            return {
                ok,
                detail: `${check.fn}(${check.field ?? "*"}) = ${Number.isFinite(actual) ? actual : "∅"} ${check.op} ${check.value}`,
            };
        }
        case "and": {
            const results = check.checks.map((inner) => evaluateNotationCheck(rows, inner));
            return {
                ok: results.every((result) => result.ok),
                detail: results.map((result) => result.detail).join(" ∧ "),
            };
        }
        case "or": {
            const results = check.checks.map((inner) => evaluateNotationCheck(rows, inner));
            return {
                ok: results.some((result) => result.ok),
                detail: results.map((result) => result.detail).join(" ∨ "),
            };
        }
        case "not": {
            const result = evaluateNotationCheck(rows, check.check);
            return { ok: !result.ok, detail: `¬(${result.detail})` };
        }
        case "implies": {
            const antecedent = evaluateNotationCheck(rows, check.if);
            // a false antecedent makes the implication hold without evaluating `then`
            if (!antecedent.ok) {
                return { ok: true, detail: `(${antecedent.detail}) → ⊤ (antecedente falso)` };
            }
            const consequent = evaluateNotationCheck(rows, check.then);
            return {
                ok: consequent.ok,
                detail: `(${antecedent.detail}) → (${consequent.detail})`,
            };
        }
        default:
            throw new Error(`tipo de check no soportado: ${String(check.kind)}`);
    }
}
383
/**
 * Annotate a notation with ADVISORY arithmetic evidence over the produced
 * rows. Never throws, never blocks, and never changes the notation's
 * lifecycle status: the dataset's validity is trusted, not proven here.
 *
 * Each predicate is reported as:
 *  - "asserted"     no arithmetic form (formal/semantic claim, trusted)
 *  - "supported"    arithmetic evidence agrees
 *  - "contradicted" arithmetic evidence disagrees (a hint to look, not a
 *                   verdict that the dataset is wrong)
 * A check that cannot be evaluated stays "asserted". That covers both an
 * evaluation that throws and one that yields no boolean verdict, so a
 * malformed mechanical form never downgrades a trusted claim.
 *
 * @param notation the notation whose predicates are evaluated
 * @param rows     the produced rows
 * @returns a copy of the notation with `checks` (one entry per predicate) and
 *          `evidenceAt` (evaluation timestamp) set
 */
export function annotateNotationEvidence(notation, rows) {
    const checks = [];
    for (const predicate of notation.predicates ?? []) {
        if (!predicate.check) {
            checks.push({ predicateId: predicate.id, status: "asserted" });
            continue;
        }
        try {
            const outcome = evaluateNotationCheck(rows, predicate.check);
            // a missing or non-boolean `ok` means the check had no verdict, which
            // must not be read as a contradiction
            if (!outcome || typeof outcome.ok !== "boolean") {
                checks.push({
                    predicateId: predicate.id,
                    status: "asserted",
                    detail: "no evaluable: el check no produjo un veredicto booleano",
                });
                continue;
            }
            checks.push({
                predicateId: predicate.id,
                status: outcome.ok ? "supported" : "contradicted",
                detail: outcome.detail,
            });
        }
        catch (error) {
            checks.push({
                predicateId: predicate.id,
                status: "asserted",
                detail: `no evaluable: ${String(error).slice(0, 120)}`,
            });
        }
    }
    return {
        ...notation,
        checks,
        evidenceAt: Date.now(),
    };
}
@@ -1,5 +1,6 @@
1
1
  import { DatasetService } from "../service.js";
2
2
  import { createDatasetId } from "../id.js";
3
+ import { annotateNotationEvidence, inferQueryNotation } from "../notation.js";
3
4
  function normalizeRows(result) {
4
5
  if (!result || typeof result !== "object")
5
6
  return [];
@@ -61,6 +62,15 @@ export async function queryDomainStep(params) {
61
62
  const rows = normalizeRows(queryResult);
62
63
  const previewRows = rows.slice(0, 20);
63
64
  const schema = inferSchema(rows);
65
+ // query-backed datasets carry a fully deterministic formal notation:
66
+ // the set definition, its symbols and its predicates derive mechanically
67
+ // from the query + rows, so their arithmetic evidence is immediate
68
+ const notation = annotateNotationEvidence(inferQueryNotation({
69
+ entityNames: Object.keys(params.query ?? {}),
70
+ rowCount: rows.length,
71
+ schema,
72
+ explanation: params.explanation,
73
+ }), rows);
64
74
  const createRes = await service.createDataset({
65
75
  id: datasetId,
66
76
  title: params.title ?? "domain.query",
@@ -68,6 +78,7 @@ export async function queryDomainStep(params) {
68
78
  instructions: params.explanation,
69
79
  analysis: { explanation: params.explanation, query: params.query },
70
80
  schema,
81
+ notation,
71
82
  createdAt: Date.now(),
72
83
  updatedAt: Date.now(),
73
84
  });
package/dist/schema.d.ts CHANGED
@@ -11,6 +11,8 @@ declare const entities: {
11
11
  instructions: import("@instantdb/core").DataAttrDef<string, false, false, false>;
12
12
  analysis: import("@instantdb/core").DataAttrDef<any, false, false, false>;
13
13
  schema: import("@instantdb/core").DataAttrDef<any, false, false, false>;
14
+ /** formal notation (LaTeX + checkable predicates) describing the set */
15
+ notation: import("@instantdb/core").DataAttrDef<any, false, false, false>;
14
16
  calculatedTotalRows: import("@instantdb/core").DataAttrDef<number, false, false, false>;
15
17
  actualGeneratedRowCount: import("@instantdb/core").DataAttrDef<number, false, false, false>;
16
18
  }, {}, void>;
package/dist/schema.js CHANGED
@@ -13,6 +13,8 @@ const entities = {
13
13
  instructions: i.string().optional(),
14
14
  analysis: i.json().optional(),
15
15
  schema: i.json().optional(),
16
+ /** formal notation (LaTeX + checkable predicates) describing the set */
17
+ notation: i.json().optional(),
16
18
  calculatedTotalRows: i.number().optional(),
17
19
  actualGeneratedRowCount: i.number().optional(),
18
20
  }),
package/dist/service.d.ts CHANGED
@@ -47,6 +47,10 @@ export declare class DatasetService {
47
47
  schema: any;
48
48
  status?: string;
49
49
  }): Promise<ServiceResult<void>>;
50
+ updateDatasetNotation(params: {
51
+ datasetId: string;
52
+ notation: Record<string, any>;
53
+ }): Promise<ServiceResult<void>>;
50
54
  updateDatasetStatus(params: {
51
55
  datasetId: string;
52
56
  status: string;
package/dist/service.js CHANGED
@@ -214,6 +214,24 @@ export class DatasetService {
214
214
  return { ok: false, error: message };
215
215
  }
216
216
  }
217
+ async updateDatasetNotation(params) {
218
+ try {
219
+ const resolved = await this.resolveDatasetEntityId(params.datasetId);
220
+ if (!resolved.ok)
221
+ return resolved;
222
+ await this.db.transact([
223
+ this.db.tx.dataset_datasets[resolved.data].update({
224
+ notation: params.notation,
225
+ updatedAt: Date.now(),
226
+ })
227
+ ]);
228
+ return { ok: true, data: undefined };
229
+ }
230
+ catch (error) {
231
+ const message = error instanceof Error ? error.message : String(error);
232
+ return { ok: false, error: message };
233
+ }
234
+ }
217
235
  async updateDatasetStatus(params) {
218
236
  try {
219
237
  const resolved = await this.resolveDatasetEntityId(params.datasetId);
@@ -102,11 +102,15 @@ function buildInstructions(context) {
102
102
  .ele("Action").txt(`Review ContextResources and any InputPreviews to understand current record structures, evidence, fields, shapes and edge cases. ${multipleInputsNote}`).up()
103
103
  .ele("Note").txt("ContextResources DescriptorJson may include inline text, metadata, previewRows, or other visible evidence. Treat that visible content as already available context. Do not use executeCommand only to reread it.").up()
104
104
  .up()
105
- .ele("Step", { number: "2", name: "Plan Mapping" })
105
+ .ele("Step", { number: "2", name: "Define the Output Dataset (PLAN FIRST)" })
106
+ .ele("Action").txt("Call defineNotation with the formal definition of the OUTPUT dataset as a set derived from the input sets: e.g. D = \\pi_{fields}(\\sigma_{condition}(A \\bowtie B)) or set-builder with quantifiers, in LaTeX. Declare the input sets, bound variables and the predicates the output set satisfies.").up()
107
+ .ele("Note").txt("The definition and the materialization (the transform code + output rows) are TWO CO-EQUAL FACES of the dataset; author the definition FIRST as the PLAN: it states which sets you draw from, how they combine (join, filter, project, aggregate) and which invariants the output keeps (e.g. totals preserved). The definition is a formal proposition we trust — predicates may be semantic. Only for purely arithmetic invariants you MAY add a checkJson for optional advisory evidence. REFINE the definition whenever inspection of the inputs reveals new sets, variables or corrections, and call defineNotation with final=true just before completing — as the RESULT it describes the produced output; any arithmetic predicates then get advisory evidence (never a verdict).").up()
108
+ .up()
109
+ .ele("Step", { number: "3", name: "Plan Mapping" })
106
110
  .ele("Action").txt("Plan a deterministic mapping from input data fields to the output schema fields (normalize names, types, and formats).").up()
107
111
  .ele("Note").txt("If fields are missing, set defaults; if types differ, coerce consistently. When working with multiple inputs, decide how to combine or relate them. Output field names must remain exactly as declared by the output schema.").up()
108
112
  .up()
109
- .ele("Step", { number: "3", name: "Transform" })
113
+ .ele("Step", { number: "4", name: "Transform" })
110
114
  .ele("Action").txt("For single-object output, use completeObject with the final object. For row output, use replaceRows with the final rows. Use executeCommand only when command execution is necessary, not merely convenient.").up()
111
115
  .ele("Requirement").txt("Do not call completeObject until you have constructed the complete data object. completeObject requires data; a summary-only call is invalid and wastes a model iteration.").up()
112
116
  .ele("Requirement").txt("Command execution is necessary only when the final output cannot be produced directly from the provided context, resource descriptors, or previews, and requires running code to inspect, parse, aggregate, join, or compute over files/resources.").up()
@@ -120,12 +124,13 @@ function buildInstructions(context) {
120
124
  .ele("Requirement").txt("Do not print large data to stdout; only progress and summaries.").up()
121
125
  .ele("Requirement").txt("Do not install packages, download dependencies, or access the network from executeCommand. Use only the available runtime and standard library unless a dependency is already present.").up()
122
126
  .up()
123
- .ele("Step", { number: "4", name: "Validate and Complete" })
124
- .ele("Action").txt("When using completeObject or replaceRows, no separate completeDataset call is needed. When using executeCommand, call completeDataset to validate against the output schema and mark as completed.").up()
127
+ .ele("Step", { number: "5", name: "Validate and Complete" })
128
+ .ele("Action").txt("Call defineNotation with final=true (the definition as RESULT, matching the produced output), then: when using completeObject or replaceRows, no separate completeDataset call is needed. When using executeCommand, call completeDataset to validate against the output schema and mark as completed.").up()
125
129
  .ele("Behavior").txt("If any completion tool returns success:false, inspect validation details, repair the output, and call the appropriate completion tool again. Do not stop until a completion tool returns success:true.").up()
126
130
  .up()
127
131
  .up()
128
132
  .ele("Rules")
133
+ .ele("Rule").txt("The formal definition (defineNotation) and the materialization (transform code + output rows) are co-equal faces of the dataset: author the definition first as the PLAN, refine it on every discovery, finalize it as the RESULT before completing.").up()
129
134
  .ele("Rule").txt("Output must strictly match the output schema for each record in data.").up()
130
135
  .ele("Rule").txt("OutputSchema property names are authoritative. Field names are a technical contract; only field values may preserve input language.").up()
131
136
  .ele("Rule").txt("Use the cheapest correct tool. completeObject and replaceRows are low-cost completion tools. executeCommand is a high-cost computation tool and requires an explicit commandDescription.").up()
@@ -4,6 +4,7 @@ import { createCompleteDatasetTool, didCompleteDatasetSucceed, getDatasetFatalFa
4
4
  import { datasetUpdateSchemaStep } from "../dataset/steps.js";
5
5
  import { getDatasetOutputPath } from "../datasetFiles.js";
6
6
  import { createExecuteCommandTool } from "../executeCommand.tool.js";
7
+ import { createDefineNotationTool } from "../defineNotation.tool.js";
7
8
  import { createCompleteObjectTool, createReplaceRowsTool, } from "../writeDatasetRows.tool.js";
8
9
  import { buildTransformDatasetPromptStep, } from "./transform-dataset.steps.js";
9
10
  import { createDatasetId } from "../id.js";
@@ -136,6 +137,10 @@ function createTransformDatasetContextDefinition(params) {
136
137
  sandboxId,
137
138
  runtime,
138
139
  }),
140
+ defineNotation: createDefineNotationTool({
141
+ datasetId,
142
+ runtime,
143
+ }),
139
144
  };
140
145
  })
141
146
  .shouldContinue(({ reactionEvent }) => {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ekairos/dataset",
3
- "version": "1.22.37",
3
+ "version": "1.22.38",
4
4
  "description": "Pulzar Dataset Tools",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -65,9 +65,9 @@
65
65
  "test:ai-sdk:instant": "vitest run -c vitest.codex.config.mts src/tests/materializeDataset.ai-sdk.instant.test.ts"
66
66
  },
67
67
  "dependencies": {
68
- "@ekairos/domain": "^1.22.37",
69
- "@ekairos/events": "^1.22.37",
70
- "@ekairos/sandbox": "^1.22.37",
68
+ "@ekairos/domain": "^1.22.38",
69
+ "@ekairos/events": "^1.22.38",
70
+ "@ekairos/sandbox": "^1.22.38",
71
71
  "@instantdb/admin": "0.22.158",
72
72
  "@instantdb/core": "0.22.142",
73
73
  "ai": "^5.0.44",