npm - @ekairos/dataset - Versions diffs - 1.22.37 → 1.22.38 - Mend

@ekairos/dataset 1.22.37 → 1.22.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

package/dist/builder/persistence.js +39 -0
package/dist/builder/types.d.ts +4 -0
package/dist/completeDataset.steps.js +50 -0
package/dist/defineNotation.tool.d.ts +49 -0
package/dist/defineNotation.tool.js +154 -0
package/dist/file/file-dataset.agent.js +5 -0
package/dist/file/prompts.js +18 -5
package/dist/index.d.ts +1 -0
package/dist/index.js +1 -0
package/dist/materializeDataset.tool.d.ts +1 -1
package/dist/notation.d.ts +205 -0
package/dist/notation.js +424 -0
package/dist/query/queryDomain.step.js +11 -0
package/dist/schema.d.ts +2 -0
package/dist/schema.js +2 -0
package/dist/service.d.ts +4 -0
package/dist/service.js +18 -0
package/dist/transform/prompts.js +9 -4
package/dist/transform/transform-dataset.agent.js +5 -0
package/package.json +4 -4

package/dist/builder/persistence.js CHANGED Viewed

@@ -1,5 +1,6 @@
 import { DatasetService } from "../service.js";
 import { datasetDomain } from "../schema.js";
+import { annotateNotationEvidence, inferQueryNotation, } from "../notation.js";
 import { datasetGetByIdStep, datasetPreviewRowsStep, datasetReadOneStep, datasetReadRowsStep, } from "../dataset/steps.js";
 import { inferDatasetSchema, validateRows } from "./schemaInference.js";
 import { rowsToJsonl } from "./rows.js";
@@ -77,6 +78,40 @@ export async function materializeRowsToDataset(runtime, params) {
     if (!statusResult.ok) {
         throw new Error(statusResult.error);
     }
+    // Formal notation, informative only (never blocks the build): a notation
+    // proposed during the build (agent iterations) gets advisory evidence
+    // against the materialized rows; query-backed builds with no proposed
+    // notation get the deterministic one derived from query + schema + rows.
+    try {
+        const existing = await service.getDatasetById(params.datasetId);
+        const previous = (existing.ok ? existing.data?.notation : null);
+        const analysis = (params.analysis ?? {});
+        const queryNotation = analysis.query && typeof analysis.query === "object"
+            ? inferQueryNotation({
+                entityNames: Object.keys(analysis.query),
+                rowCount: params.rows.length,
+                schema: resolvedSchema,
+                explanation: typeof analysis.explanation === "string" ? analysis.explanation : undefined,
+            })
+            : null;
+        // Query-backed builds are deterministic, so a freshly inferred notation
+        // always wins (a prior run's notation would be stale). Only agent-built
+        // datasets (no query) keep the notation the agent proposed during the
+        // build, which by now is the latest `previous`.
+        const candidate = queryNotation ??
+            (previous && Array.isArray(previous.predicates) && previous.predicates.length > 0
+                ? previous
+                : null);
+        if (candidate) {
+            await service.updateDatasetNotation({
+                datasetId: params.datasetId,
+                notation: annotateNotationEvidence(candidate, params.rows),
+            });
+        }
+    }
+    catch {
+        // notation must never affect the build result
+    }
     return params.datasetId;
 }
 export async function uploadInlineTextResource(runtime, datasetId, resource) {
@@ -112,10 +147,12 @@ export async function finalizeBuildResult(runtime, datasetId, withFirst) {
             });
         },
     };
+    const notation = (datasetResult.data?.notation ?? null);
     if (!withFirst) {
         return {
             datasetId,
             dataset: datasetResult.data,
+            notation,
             previewRows: previewResult.rows,
             reader,
         };
@@ -124,6 +161,7 @@ export async function finalizeBuildResult(runtime, datasetId, withFirst) {
     return {
         datasetId,
         dataset: datasetResult.data,
+        notation,
         previewRows: previewResult.rows,
         reader,
         firstRow: firstResult.row,
@@ -146,6 +184,7 @@ export function createDatasetBuildResult(runtime, params) {
     return {
         datasetId: params.datasetId,
         dataset: params.dataset,
+        notation: (params.dataset?.notation ?? null),
         previewRows: params.previewRows,
         reader,
         ...(params.firstRow !== undefined ? { firstRow: params.firstRow } : {}),

package/dist/builder/types.d.ts CHANGED Viewed

@@ -3,6 +3,7 @@ import type { DomainInstantSchema, DomainSchemaResult } from "@ekairos/domain";
 import type { EkairosRuntime, RuntimeForDomain } from "@ekairos/domain/runtime";
 import type { ContextIdentifier, ContextReactor, StoredContextResource } from "@ekairos/events";
 import { datasetDomain } from "../schema.js";
+import type { DatasetNotation } from "../notation.js";
 export type DatasetQueryResourceInput<D extends DomainSchemaResult = DomainSchemaResult> = {
     query: InstaQLParams<DomainInstantSchema<D>>;
     title?: string;
@@ -72,6 +73,9 @@ export type DatasetReader = {
 export type DatasetBuildResult = {
     datasetId: string;
     dataset: any;
+    /** the formal definition (intensional face), co-equal with the rows */
+    notation: DatasetNotation | null;
+    /** preview of the materialization (extensional face) */
     previewRows: any[];
     reader: DatasetReader;
     object?: any | null;

package/dist/completeDataset.steps.js CHANGED Viewed

@@ -1,5 +1,6 @@
 import Ajv from "ajv";
 import { getDatasetOutputPath } from "./datasetFiles.js";
+import { annotateNotationEvidence } from "./notation.js";
 import { DatasetService } from "./service.js";
 import { getDatasetRuntimeDb } from "./dataset/steps.js";
 import { readDatasetSandboxFileStep, readDatasetSandboxTextFileStep, runDatasetSandboxCommandStep, } from "./sandbox/steps.js";
@@ -176,6 +177,19 @@ export async function persistDatasetStep({ runtime, datasetId, sandboxId, summar
     }
     console.log(`[Dataset ${datasetId}] Dataset marked as COMPLETED (${totalValidRows} valid rows)`);
     console.log(`[Dataset ${datasetId}] ========================================`);
+    // Formal-notation evidence: advisory arithmetic annotation of the latest
+    // notation against the produced rows. Informative only — it never
+    // affects the dataset completion result or the dataset's validity.
+    try {
+        await annotateNotationFromJsonl({
+            service,
+            datasetId,
+            jsonlBase64: fileRead.contentBase64,
+        });
+    }
+    catch (error) {
+        console.error(`[Dataset ${datasetId}] notation annotation skipped:`, error instanceof Error ? error.message : String(error));
+    }
     return {
         success: true,
         status: "completed",
@@ -187,6 +201,42 @@ export async function persistDatasetStep({ runtime, datasetId, sandboxId, summar
         dataFileId: uploadResult.data.fileId,
     };
 }
+const NOTATION_EVIDENCE_MAX_ROWS = 50000;
+async function annotateNotationFromJsonl(params) {
+    const existing = await params.service.getDatasetById(params.datasetId);
+    const notation = (existing.ok ? existing.data?.notation : null);
+    if (!notation || !Array.isArray(notation.predicates) || notation.predicates.length === 0) {
+        return;
+    }
+    const rows = [];
+    const content = Buffer.from(params.jsonlBase64, "base64").toString("utf-8");
+    for (const line of content.split("\n")) {
+        const trimmed = line.trim();
+        if (!trimmed)
+            continue;
+        try {
+            const parsed = JSON.parse(trimmed);
+            if (parsed && parsed.type === "row") {
+                rows.push(parsed.data);
+            }
+        }
+        catch {
+            // malformed lines were already handled by schema validation
+        }
+        if (rows.length >= NOTATION_EVIDENCE_MAX_ROWS)
+            break;
+    }
+    const annotated = annotateNotationEvidence(notation, rows);
+    await params.service.updateDatasetNotation({
+        datasetId: params.datasetId,
+        notation: annotated,
+    });
+    const contradicted = (annotated.checks ?? []).filter((check) => check.status === "contradicted");
+    console.log(`[Dataset ${params.datasetId}] notation v${annotated.version} (${annotated.status})` +
+        (contradicted.length
+            ? ` — ${contradicted.length} predicado(s) con evidencia contraria (advisory)`
+            : ""));
+}
 function resolveExecutionStoragePath(outputPath, datasetId) {
     const normalized = String(outputPath ?? "").replace(/\\/g, "/");
     const marker = "/tmp/ekairos/contexts/";

package/dist/defineNotation.tool.d.ts ADDED Viewed

@@ -0,0 +1,49 @@
+interface DefineNotationToolParams { // argument object destructured by createDefineNotationTool
+    datasetId: string;
+    runtime: any;
+}
+/**
+ * defineNotation — author or REFINE the formal DEFINITION of the dataset.
+ *
+ * A dataset has two co-equal faces: its formal definition (the notation —
+ * the proposition that defines the set, in LaTeX) and its materialization
+ * (the rows + the code that produces them). They sit at the SAME level: the
+ * definition is not a side note about the data, it IS the dataset stated
+ * intensionally. The same notation is the PLAN (you state it first and the
+ * materialization realizes it) and, finalized, the RESULT (it describes what
+ * you produced).
+ *
+ * Call it FIRST with the initial definition derived from the resources, and
+ * AGAIN whenever the analysis discovers new sets, variables, constraints or
+ * corrections — every call keeps the prior version in history. Mark the last
+ * call with final=true so the definition describes the produced dataset.
+ * Predicates may be formal/semantic (trusted); the few that are arithmetic
+ * MAY carry optional advisory evidence.
+ */
+export declare function createDefineNotationTool({ datasetId, runtime }: DefineNotationToolParams): import("ai").Tool<{
+    latex: string;
+    symbols: {
+        name: string;
+        kind: "function" | "set" | "variable" | "constant" | "predicate";
+        description: string;
+        latex?: string | undefined;
+    }[];
+    predicates: {
+        id: string;
+        description: string;
+        latex: string;
+        checkJson?: string | undefined;
+    }[];
+    reason: string;
+    final?: boolean | undefined;
+}, {
+    success: boolean;
+    error: string;
+} | {
+    warning?: string | undefined;
+    success: boolean;
+    version: number;
+    status: import("./notation.js").DatasetNotationStatus;
+    error?: undefined;
+}>;
+export {};

package/dist/defineNotation.tool.js ADDED Viewed

@@ -0,0 +1,154 @@
+import { tool } from "ai";
+import { z } from "zod";
+import { DatasetService } from "./service.js";
+import { datasetDomain } from "./schema.js";
+import { reviseDatasetNotation, } from "./notation.js";
+const symbolSchema = z.object({ // zod shape of one entry of the tool's `symbols` input array
+    name: z.string().describe("Plain identifier, e.g. 'D', 'Orders', 'w'"),
+    latex: z
+        .string()
+        .optional()
+        .describe("LaTeX for the symbol, e.g. '\\\\mathcal{D}' (defaults to the name)"),
+    kind: z.enum(["set", "variable", "function", "constant", "predicate"]),
+    description: z.string().describe("What this symbol denotes in the data"),
+});
+const predicateSchema = z.object({ // zod shape of one entry of the tool's `predicates` input array
+    id: z.string().describe("Stable id, e.g. 'p1', 'cardinality'"),
+    description: z.string().describe("The claim in plain language"),
+    latex: z
+        .string()
+        .describe("The claim in LaTeX, e.g. '\\\\forall r \\\\in D: r.amount > 0'"),
+    checkJson: z // the check travels as a JSON *string*; it is parsed inside the tool's execute()
+        .string()
+        .optional()
+        .describe([
+        "OPTIONAL arithmetic form of the claim as a JSON string, used only for",
+        "advisory evidence over the produced rows (not a verdict). Shapes:",
+        '{"kind":"row_count","op":"=","value":124}',
+        '{"kind":"field_type","field":"amount","type":"number","allowNull":true}',
+        '{"kind":"field_range","field":"amount","min":0}',
+        '{"kind":"field_in","field":"status","values":["paid","void"]}',
+        '{"kind":"field_nonnull","field":"orderId"}',
+        '{"kind":"field_matches","field":"sku","pattern":"^[A-Z0-9-]+$"}',
+        '{"kind":"unique","fields":["orderId"]}',
+        '{"kind":"aggregate","fn":"sum","field":"amount","op":">=","value":0}',
+        'Propositional composition: {"kind":"and"|"or","checks":[...]},',
+        '{"kind":"not","check":...}, {"kind":"implies","if":...,"then":...}.',
+        "Fields support dot-paths into nested records (company.taxId).",
+        "Omit for formal/semantic claims (the normal case) — they are trusted.",
+    ].join(" ")),
+});
+async function getDatasetService(runtime) { // resolves a DatasetService built on the db of the runtime scoped to datasetDomain
+    const scoped = await runtime.use(datasetDomain);
+    return new DatasetService(scoped.db);
+}
+/**
+ * defineNotation — author or REFINE the formal DEFINITION of the dataset.
+ *
+ * A dataset has two co-equal faces: its formal definition (the notation —
+ * the proposition that defines the set, in LaTeX) and its materialization
+ * (the rows + the code that produces them). They sit at the SAME level: the
+ * definition is not a side note about the data, it IS the dataset stated
+ * intensionally. The same notation is the PLAN (you state it first and the
+ * materialization realizes it) and, finalized, the RESULT (it describes what
+ * you produced).
+ *
+ * Call it FIRST with the initial definition derived from the resources, and
+ * AGAIN whenever the analysis discovers new sets, variables, constraints or
+ * corrections — every call keeps the prior version in history. Mark the last
+ * call with final=true so the definition describes the produced dataset.
+ * Predicates may be formal/semantic (trusted); the few that are arithmetic
+ * MAY carry optional advisory evidence.
+ */
+export function createDefineNotationTool({ datasetId, runtime }) { // returns an `ai` tool whose execute() is bound to this datasetId and runtime
+    return tool({
+        description: [
+            "Author or refine the formal DEFINITION of the dataset: the dataset as a",
+            "set in LaTeX (set-builder, relational algebra, quantified or even",
+            "semantic predicates) plus the symbols it binds. This definition and the",
+            "materialization (rows + code) are TWO CO-EQUAL FACES of the dataset —",
+            "the definition is the dataset stated intensionally, not a comment on it.",
+            "It is your PLAN (state it before writing any code; the materialization",
+            "realizes it) and, once final, the RESULT (it describes what you",
+            "produced). The definition is a logical proposition, possibly derived —",
+            "it need not be mechanically provable; we trust the formality. State it",
+            "first, refine it on every discovery, and set final=true on the last",
+            "call. For the few predicates that are arithmetic you MAY attach a",
+            "checkJson for optional advisory evidence (non-blocking, never a verdict).",
+        ].join(" "),
+        inputSchema: z.object({
+            latex: z
+                .string()
+                .describe("Main definition of the dataset as a set, in LaTeX. Example: 'D = \\\\{(w,r,t) \\\\mid t = \\\\sum_{o \\\\in Orders} o.amount,\\\\; o.status = paid\\\\}'"),
+            symbols: z.array(symbolSchema).describe("Symbols bound by the definition"),
+            predicates: z
+                .array(predicateSchema)
+                .describe("Claims the set satisfies; attach a checkJson only when arithmetic"),
+            reason: z
+                .string()
+                .describe("What this revision states or what discovery triggered it (or 'initial definition')"),
+            final: z
+                .boolean()
+                .optional()
+                .describe("true when this definition describes the dataset you are about to complete (the RESULT)"),
+        }),
+        execute: async ({ latex, symbols, predicates, reason, final }) => { // resolves to { success: true, version, status, warning? } or { success: false, error }
+            try {
+                const service = await getDatasetService(runtime);
+                const existing = await service.getDatasetById(datasetId);
+                const previous = (existing.ok ? existing.data?.notation : null); // NOTE(review): a failed lookup is treated the same as "no prior notation" — confirm that is intended
+                const parsedPredicates = [];
+                const checkErrors = [];
+                for (const predicate of predicates) {
+                    let check;
+                    if (predicate.checkJson) { // absent or empty checkJson means the predicate carries no check
+                        try {
+                            check = JSON.parse(predicate.checkJson);
+                            if (!check || typeof check !== "object" || !("kind" in check)) {
+                                throw new Error("check must be an object with a 'kind'");
+                            }
+                        }
+                        catch (error) { // a bad checkJson drops only the check; the predicate itself is still recorded
+                            checkErrors.push(`predicate ${predicate.id}: invalid checkJson (${String(error).slice(0, 80)})`);
+                            check = undefined;
+                        }
+                    }
+                    parsedPredicates.push({
+                        id: predicate.id,
+                        description: predicate.description,
+                        latex: predicate.latex,
+                        ...(check ? { check } : {}), // the `check` key is only present when parsing succeeded
+                    });
+                }
+                const notation = reviseDatasetNotation(previous, {
+                    latex,
+                    symbols: symbols,
+                    predicates: parsedPredicates,
+                    reason,
+                    final,
+                });
+                const update = await service.updateDatasetNotation({ datasetId, notation });
+                if (!update.ok) { // the service reports failure via { ok: false, error }
+                    return { success: false, error: update.error };
+                }
+                console.log(`[Dataset ${datasetId}] definition v${notation.version} (${notation.status}): ${reason}`);
+                return {
+                    success: true,
+                    version: notation.version,
+                    status: notation.status,
+                    ...(checkErrors.length
+                        ? {
+                            warning: `some checks were dropped: ${checkErrors.join("; ")}`,
+                        }
+                        : {}),
+                };
+            }
+            catch (error) { // anything thrown above is turned into a { success: false, error } result
+                return {
+                    success: false,
+                    error: error instanceof Error ? error.message : String(error),
+                };
+            }
+        },
+    });
+}

package/dist/file/file-dataset.agent.js CHANGED Viewed

@@ -3,6 +3,7 @@ import { createClearDatasetTool } from "../clearDataset.tool.js";
 import { createCompleteDatasetTool, didCompleteDatasetSucceed, getDatasetFatalFailure, } from "../completeDataset.tool.js";
 import { datasetGetByIdStep } from "../dataset/steps.js";
 import { createExecuteCommandTool } from "../executeCommand.tool.js";
+import { createDefineNotationTool } from "../defineNotation.tool.js";
 import { createGenerateSchemaTool } from "./generateSchema.tool.js";
 import { buildFileDatasetPromptStep, initializeFileParseSandboxStep, } from "./file-dataset.steps.js";
 import { createDatasetId } from "../id.js";
@@ -151,6 +152,10 @@ function createFileParseContextDefinition(params) {
                 sandboxId,
                 runtime,
             }),
+            defineNotation: createDefineNotationTool({
+                datasetId,
+                runtime,
+            }),
         };
         if (!existingSchema) {
             actions.generateSchema = createGenerateSchemaTool({

package/dist/file/prompts.js CHANGED Viewed

@@ -238,9 +238,21 @@ function buildInstructions(context) {
         .ele("Action").txt("Review the FilePreview section in Context to understand the file structure").up()
         .ele("Note").txt("FilePreview contains: TotalRows (total data rows), Metadata (file properties with JSON output), Head (first N raw file lines), Tail (last N lines if present), Mid (middle sample for large files). Each section shows Description, Script (full Python code), Command, Stdout (raw content), Stderr. This allows you to understand the exact file format.").up()
         .up();
+    xml = xml
+        .ele("Step", { number: "2", name: "Define the Dataset (PLAN FIRST)" })
+        .ele("Action").txt("Call defineNotation with the INITIAL formal definition of the dataset as a set, derived from the file preview: D = { r | r ∈ File ∧ <constraints> } in LaTeX, the symbols it binds (sets, variables, functions) and the predicates the set satisfies").up()
+        .ele("Requirements")
+        .ele("Requirement").txt("The definition and the materialization (schema + parsing code + rows) are TWO CO-EQUAL FACES of the dataset. The definition is the dataset stated intensionally — author it FIRST; it is your PLAN and the code is built to realize it").up()
+        .ele("Requirement").txt("Use set-builder notation, quantifiers and arithmetic in LaTeX (e.g. D = \\{(c, q, p) \\mid q \\in \\mathbb{Z}^{+},\\; p \\in \\mathbb{R}_{\\geq 0}\\})").up()
+        .ele("Requirement").txt("Declare every discovered set and variable as a symbol with a one-line meaning").up()
+        .ele("Requirement").txt("Predicates are formal claims we trust; they may be semantic (e.g. 'x es una frase relevante'). Only for the few that are purely arithmetic (row counts, field types, ranges, uniqueness, aggregates) you MAY add a checkJson for optional advisory evidence — leave every other claim without checkJson").up()
+        .ele("Requirement").txt("REFINE: every time the analysis discovers a new set, variable, constraint or correction (new columns, unexpected types, excluded sections), call defineNotation again with the updated definition and the reason. The definition is not fixed up front — discovery is the point").up()
+        .ele("Requirement").txt("Before calling completeDataset, call defineNotation one last time with final=true so the definition becomes the RESULT — it describes EXACTLY the dataset you produced; any arithmetic predicates get optional advisory evidence afterwards (never a pass/fail verdict — the dataset's validity is trusted)").up()
+        .up()
+        .up();
     if (hasProvidedSchema) {
         xml = xml
-            .ele("Step", { number: "2", name: "Use Provided Schema" })
+            .ele("Step", { number: "3", name: "Use Provided Schema" })
             .ele("Action").txt("Use the provided schema as the output contract for every row in output.jsonl").up()
             .ele("Requirements")
             .ele("Requirement").txt("Every output row must conform exactly to the provided schema").up()
@@ -255,7 +267,7 @@ function buildInstructions(context) {
     }
     else {
         xml = xml
-            .ele("Step", { number: "2", name: "Generate JSON Schema" })
+            .ele("Step", { number: "3", name: "Generate JSON Schema" })
             .ele("Action").txt("Call generateSchema to create a JSON Schema for a SINGLE DATA RECORD (one row of data)").up()
             .ele("Requirements")
             .ele("Requirement").txt("Schema describes ONE DATA RECORD structure only (type: object, not array)").up()
@@ -267,7 +279,7 @@ function buildInstructions(context) {
             .up();
     }
     xml = xml
-        .ele("Step", { number: "3", name: "Generate Dataset JSONL" })
+        .ele("Step", { number: "4", name: "Generate Dataset JSONL" })
         .ele("Action").txt(`Use executeCommand to parse the file and generate output.jsonl in the dataset workstation`).up()
         .ele("Requirements")
         .ele("Requirement").txt("Parse ALL data rows/records from the file (exclude header sections and metadata)").up()
@@ -279,12 +291,13 @@ function buildInstructions(context) {
         .ele("Requirement").txt("Use descriptive scriptName in snake_case (e.g., 'parse_csv_to_jsonl')").up()
         .up()
         .up()
-        .ele("Step", { number: "4", name: "Complete and Validate" })
-        .ele("Action").txt("Call completeDataset to validate the dataset").up()
+        .ele("Step", { number: "5", name: "Complete and Validate" })
+        .ele("Action").txt("Call defineNotation with final=true (the definition as RESULT, matching the produced rows), then call completeDataset to validate the dataset").up()
         .ele("Behavior").txt("Validates that output.jsonl exists and all records conform to the schema stored in database. Returns success:false with validation details if validation fails. If validation fails, inspect validation errors, rewrite output.jsonl, and call completeDataset again. Do not stop until completeDataset returns success:true.").up()
         .up()
         .up()
         .ele("Rules")
+        .ele("Rule").txt("The formal definition (defineNotation) and the materialization (schema + code + rows) are co-equal faces of the dataset: author the definition first as the PLAN, refine it on every discovery, finalize it as the RESULT before completion").up()
         .ele("Rule").txt("Schema defines ONE DATA RECORD structure (not array, not header)").up()
         .ele("Rule").txt("Schema property names are authoritative. Never translate or rename keys such as itemName, quantity, or unit into the input language").up()
         .ele("Rule").txt("Original/input language applies to extracted values only, not to JSON object keys").up()

package/dist/index.d.ts CHANGED Viewed

@@ -2,6 +2,7 @@ export * from "./dataset.js";
 export * from "./contextWorkspace.js";
 export * from "./domain.js";
 export * from "./materializeDataset.tool.js";
+export * from "./notation.js";
 export * from "./schema.js";
 export * from "./service.js";
 export { registerFileParseContext } from "./file/file-dataset.agent.js";

package/dist/index.js CHANGED Viewed

@@ -2,6 +2,7 @@ export * from "./dataset.js";
 export * from "./contextWorkspace.js";
 export * from "./domain.js";
 export * from "./materializeDataset.tool.js";
+export * from "./notation.js";
 export * from "./schema.js";
 export * from "./service.js";
 export { registerFileParseContext } from "./file/file-dataset.agent.js";

package/dist/materializeDataset.tool.d.ts CHANGED Viewed

@@ -18,8 +18,8 @@ declare const materializeDatasetToolInputSchema: z.ZodObject<{
     }, z.core.$strip>>>;
     texts: z.ZodOptional<z.ZodArray<z.ZodObject<{
         name: z.ZodOptional<z.ZodString>;
-        text: z.ZodString;
         description: z.ZodOptional<z.ZodString>;
+        text: z.ZodString;
         mimeType: z.ZodOptional<z.ZodString>;
     }, z.core.$strip>>>;
     datasets: z.ZodOptional<z.ZodArray<z.ZodObject<{

package/dist/notation.d.ts ADDED Viewed

@@ -0,0 +1,205 @@
+/**
+ * Formal notation for datasets — the dataset stated intensionally.
+ *
+ * A dataset has TWO CO-EQUAL FACES at the same level:
+ * - its formal DEFINITION (this notation: the proposition that defines the
+ *   set, in LaTeX), and
+ * - its MATERIALIZATION (the rows + the code that produces them).
+ * The notation is not a comment about the data; it IS the dataset, written
+ * as a logical statement. The materialization is the same set written
+ * extensionally. Neither is subordinate to the other.
+ *
+ * The SAME notation plays two roles across the lifecycle: it is the PLAN
+ * (status "plan": stated first, the materialization is built to realize it)
+ * and, once finalized, the RESULT (status "result": it describes exactly
+ * what was produced). It is iterated in between — every revision keeps the
+ * prior version in `history`, so the discovery trail stays visible.
+ *
+ * The definition is a logical proposition, possibly DERIVED (a syllogism),
+ * so it is NOT, in general, mechanically verifiable: a predicate may be
+ * semantic ("x es una frase divertida") and the set is still well-formed.
+ * We TRUST the formality and the produced dataset — there is no verdict.
+ *
+ * SOME predicates happen to be arithmetic (a row count, a field type, a
+ * preserved total). For those, and only those, we attach OPTIONAL evidence
+ * computed over the rows. It is advisory: a contradiction is a hint, never
+ * a claim that the dataset is invalid. Predicates with no arithmetic form
+ * are "asserted" — trusted. Nothing here blocks or changes a build; the
+ * notation rides on dataset_datasets.notation.
+ */
+export type DatasetNotationSymbolKind = "set" | "variable" | "function" | "constant" | "predicate";
+export type DatasetNotationSymbol = {
+    /** plain identifier, e.g. "D", "w", "Orders" */
+    name: string;
+    /** LaTeX for the symbol, e.g. "\\mathcal{D}" (defaults to the name) */
+    latex?: string;
+    kind: DatasetNotationSymbolKind;
+    description: string;
+};
+export type NotationCmpOp = "=" | "!=" | "<" | "<=" | ">" | ">=";
+/**
+ * OPTIONAL arithmetic evidence for the subset of predicates that happen to
+ * be mechanical (counts, types, ranges, totals). Evaluated over the rows;
+ * field access supports dot-paths into nested records ("company.taxId").
+ * Leaf checks are dataset-level propositions; and/or/not/implies compose
+ * them propositionally. A predicate WITHOUT a check is a formal/semantic
+ * claim we trust — that is the normal case, not an exception.
+ */
+export type NotationCheck = {
+    kind: "row_count";
+    op: NotationCmpOp;
+    value: number;
+} | {
+    kind: "field_type";
+    field: string;
+    type: "number" | "integer" | "string" | "boolean";
+    allowNull?: boolean;
+} | {
+    kind: "field_range";
+    field: string;
+    min?: number;
+    max?: number;
+} | {
+    kind: "field_in";
+    field: string;
+    values: Array<string | number | boolean>;
+} | {
+    kind: "field_nonnull";
+    field: string;
+} | {
+    kind: "field_matches";
+    field: string;
+    pattern: string;
+} | {
+    kind: "unique";
+    fields: string[];
+} | {
+    kind: "aggregate";
+    fn: "sum" | "count" | "min" | "max" | "avg";
+    /** omit for fn = "count" */
+    field?: string;
+    op: NotationCmpOp;
+    value: number;
+    /** absolute tolerance for float comparison (default 1e-9) */
+    tolerance?: number;
+} | {
+    kind: "and";
+    checks: NotationCheck[];
+} | {
+    kind: "or";
+    checks: NotationCheck[];
+} | {
+    kind: "not";
+    check: NotationCheck;
+} | {
+    kind: "implies";
+    if: NotationCheck;
+    then: NotationCheck;
+};
+export type DatasetNotationPredicate = {
+    /** stable id within the notation, e.g. "p1", "rowCount" */
+    id: string;
+    description: string;
+    /** the claim in LaTeX, e.g. "\\forall r \\in D:\\; r.amount > 0" */
+    latex: string;
+    /**
+     * OPTIONAL arithmetic form. Absent (the common case) = a formal/semantic
+     * claim we trust without mechanical checking.
+     */
+    check?: NotationCheck;
+};
+/**
+ * Advisory evidence for one predicate. Never a verdict on the dataset:
+ * - "asserted"     formal/semantic claim, trusted, no mechanical check
+ * - "supported"    arithmetic evidence agrees with the stated claim
+ * - "contradicted" arithmetic evidence disagrees — a hint, not a failure
+ */
+export type DatasetNotationCheckResult = {
+    predicateId: string;
+    status: "asserted" | "supported" | "contradicted";
+    detail?: string;
+};
+export type DatasetNotationRevision = {
+    version: number;
+    latex: string;
+    /** why this revision happened — the discovery that triggered it */
+    reason: string;
+    at: number;
+};
+/**
+ * The role the notation currently plays — the two ends of its life:
+ * - "plan":   stated before/while building; the materialization realizes it
+ * - "result": finalized; it describes the dataset that was produced
+ * There is intentionally NO "verified"/"violated" verdict — validity is
+ * trusted, not proven. Iteration is tracked by `version`/`history`; advisory
+ * arithmetic evidence lives in `checks`, separate from this role.
+ */
+export type DatasetNotationStatus = "plan" | "result";
+export type DatasetNotation = {
+    version: number;
+    status: DatasetNotationStatus;
+    /** the main definition: the dataset as a set, in LaTeX */
+    latex: string;
+    symbols: DatasetNotationSymbol[];
+    predicates: DatasetNotationPredicate[];
+    /** advisory per-predicate evidence (asserted/supported/contradicted) */
+    checks?: DatasetNotationCheckResult[];
+    /** when the advisory evidence was last computed */
+    evidenceAt?: number;
+    history: DatasetNotationRevision[];
+};
+export type NotationRevisionInput = {
+    latex: string;
+    symbols?: DatasetNotationSymbol[];
+    predicates?: DatasetNotationPredicate[];
+    reason: string;
+    /** "final" marks the notation as describing the produced dataset */
+    final?: boolean;
+};
+/**
+ * Iterate the notation: every revision bumps the version and appends to
+ * history, so the discovery trail (sets/variables found along the way)
+ * stays visible.
+ */
+export declare function reviseDatasetNotation(previous: DatasetNotation | null | undefined, input: NotationRevisionInput): DatasetNotation;
+/** escape an identifier for use inside \text{} */
+export declare function latexIdentifier(name: string): string;
+type JsonSchemaLike = {
+    title?: string;
+    schema?: Record<string, any>;
+    properties?: Record<string, any>;
+};
+/**
+ * A query-backed dataset has a complete deterministic description: the
+ * dataset is the image of a known query over a known domain. No model is
+ * involved, so here the formal definition and its predicates derive
+ * mechanically from the query, the inferred schema and the row count — and
+ * those predicates DO carry arithmetic evidence (the special case where the
+ * formal claims happen to be fully mechanical).
+ */
+export declare function inferQueryNotation(params: {
+    entityNames: string[];
+    rowCount: number;
+    schema?: JsonSchemaLike | null;
+    explanation?: string;
+}): DatasetNotation;
+type CheckOutcome = { // return shape of evaluateNotationCheck
+    ok: boolean; // presumably true when the check holds over the rows — confirm in notation.js
+    detail: string;
+};
+export declare function evaluateNotationCheck(rows: any[], check: NotationCheck): CheckOutcome; // takes the rows and one NotationCheck, returns a CheckOutcome
+/**
+ * Annotate a notation with ADVISORY arithmetic evidence over the produced
+ * rows. Never throws, never blocks, and never changes the notation's
+ * lifecycle status — the dataset's validity is trusted, not proven here.
+ *
+ * Each predicate is reported as:
+ * - "asserted"     no arithmetic form (formal/semantic claim, trusted)
+ * - "supported"    arithmetic evidence agrees
+ * - "contradicted" arithmetic evidence disagrees (a hint to look, not a
+ *                  verdict that the dataset is wrong)
+ * A check that can't be evaluated stays "asserted" — we don't downgrade a
+ * trusted claim because of a malformed mechanical form.
+ */
+export declare function annotateNotationEvidence(notation: DatasetNotation, rows: any[]): DatasetNotation;
+export {};

package/dist/notation.js ADDED Viewed

@@ -0,0 +1,424 @@
+/**
+ * Formal notation for datasets — the dataset stated intensionally.
+ *
+ * A dataset has TWO CO-EQUAL FACES at the same level:
+ * - its formal DEFINITION (this notation: the proposition that defines the
+ *   set, in LaTeX), and
+ * - its MATERIALIZATION (the rows + the code that produces them).
+ * The notation is not a comment about the data; it IS the dataset, written
+ * as a logical statement. The materialization is the same set written
+ * extensionally. Neither is subordinate to the other.
+ *
+ * The SAME notation plays two roles across the lifecycle: it is the PLAN
+ * (status "plan": stated first, the materialization is built to realize it)
+ * and, once finalized, the RESULT (status "result": it describes exactly
+ * what was produced). It is iterated in between — every revision keeps the
+ * prior version in `history`, so the discovery trail stays visible.
+ *
+ * The definition is a logical proposition, possibly DERIVED (a syllogism),
+ * so it is NOT, in general, mechanically verifiable: a predicate may be
+ * semantic ("x es una frase divertida") and the set is still well-formed.
+ * We TRUST the formality and the produced dataset — there is no verdict.
+ *
+ * SOME predicates happen to be arithmetic (a row count, a field type, a
+ * preserved total). For those, and only those, we attach OPTIONAL evidence
+ * computed over the rows. It is advisory: a contradiction is a hint, never
+ * a claim that the dataset is invalid. Predicates with no arithmetic form
+ * are "asserted" — trusted. Nothing here blocks or changes a build; the
+ * notation rides on dataset_datasets.notation.
+ */
+/**
+ * Iterate the notation: every revision bumps the version and appends to
+ * history, so the discovery trail (sets/variables found along the way)
+ * stays visible.
+ */
+export function reviseDatasetNotation(previous, input) {
+    const version = (previous?.version ?? 0) + 1;
+    const revision = {
+        version,
+        latex: input.latex,
+        reason: input.reason,
+        at: Date.now(),
+    };
+    return {
+        version,
+        status: input.final ? "result" : "plan",
+        latex: input.latex,
+        symbols: input.symbols ?? previous?.symbols ?? [],
+        predicates: input.predicates ?? previous?.predicates ?? [],
+        history: [...(previous?.history ?? []), revision],
+    };
+}
+/* ── LaTeX helpers ──────────────────────────────────────────────── */
+/** escape an identifier for use inside \text{} */
+export function latexIdentifier(name) {
+    return `\\text{${String(name).replace(/([#$%&_{}])/g, "\\$1")}}`;
+}
+function latexFieldType(type) {
+    if (type === "number")
+        return "\\mathbb{R}";
+    if (type === "integer")
+        return "\\mathbb{Z}";
+    if (type === "boolean")
+        return "\\{\\top,\\bot\\}";
+    return "\\Sigma^{*}";
+}
+const JSON_SCHEMA_KEYWORDS = new Set([
+    "type",
+    "title",
+    "description",
+    "required",
+    "items",
+    "additionalProperties",
+]);
+function schemaProperties(schema) {
+    const root = (schema?.schema ?? schema ?? {});
+    if (root.properties && typeof root.properties === "object") {
+        return root.properties;
+    }
+    // flat shape from query inference: { fieldName: "type", ... }
+    const flat = {};
+    for (const [key, value] of Object.entries(root)) {
+        if (JSON_SCHEMA_KEYWORDS.has(key))
+            continue;
+        if (typeof value === "string")
+            flat[key] = value;
+        else if (value && typeof value === "object" && typeof value.type === "string") {
+            flat[key] = value;
+        }
+    }
+    return flat;
+}
+/**
+ * A query-backed dataset has a complete deterministic description: the
+ * dataset is the image of a known query over a known domain. No model is
+ * involved, so here the formal definition and its predicates derive
+ * mechanically from the query, the inferred schema and the row count — and
+ * those predicates DO carry arithmetic evidence (the special case where the
+ * formal claims happen to be fully mechanical).
+ */
+export function inferQueryNotation(params) {
+    const sources = params.entityNames.length ? params.entityNames : ["Domain"];
+    const sourceSymbols = sources.map((name) => ({
+        name,
+        latex: latexIdentifier(name),
+        kind: "set",
+        description: `Entidad de origen ${name}`,
+    }));
+    const union = sources.map((name) => latexIdentifier(name)).join(" \\cup ");
+    const latex = `\\mathcal{D} = \\left\\{\\, r \\;\\middle|\\; r \\in Q\\!\\left(${union}\\right) \\right\\}`;
+    const properties = schemaProperties(params.schema);
+    const predicates = [
+        {
+            id: "cardinality",
+            description: `El dataset tiene exactamente ${params.rowCount} filas`,
+            latex: `|\\mathcal{D}| = ${params.rowCount}`,
+            check: { kind: "row_count", op: "=", value: params.rowCount },
+        },
+    ];
+    for (const [field, raw] of Object.entries(properties)) {
+        const type = typeof raw === "string" ? raw : String(raw?.type ?? "");
+        if (!["number", "integer", "boolean", "string"].includes(type))
+            continue;
+        predicates.push({
+            id: `type_${field}`,
+            description: `Toda fila tiene ${field} de tipo ${type} (o nulo)`,
+            latex: `\\forall r \\in \\mathcal{D}:\\; r.${latexIdentifier(field)} \\in ${latexFieldType(type)} \\cup \\{\\varnothing\\}`,
+            check: {
+                kind: "field_type",
+                field,
+                type: type,
+                allowNull: true,
+            },
+        });
+    }
+    return reviseDatasetNotation(null, {
+        latex,
+        symbols: [
+            {
+                name: "D",
+                latex: "\\mathcal{D}",
+                kind: "set",
+                description: params.explanation?.trim() || "Dataset materializado",
+            },
+            {
+                name: "Q",
+                kind: "function",
+                description: "Consulta InstaQL aplicada al dominio",
+            },
+            ...sourceSymbols,
+        ],
+        predicates,
+        reason: "Notación determinística derivada de la consulta al dominio",
+        final: true,
+    });
+}
+/* ── arithmetic evaluation ──────────────────────────────────────── */
+function readPath(row, path) {
+    let current = row;
+    for (const segment of String(path).split(".")) {
+        if (current === null || current === undefined)
+            return undefined;
+        current = current[segment];
+    }
+    return current;
+}
+function compare(op, left, right, tolerance = 0) {
+    switch (op) {
+        case "=":
+            return Math.abs(left - right) <= tolerance;
+        case "!=":
+            return Math.abs(left - right) > tolerance;
+        case "<":
+            return left < right;
+        case "<=":
+            return left <= right + tolerance;
+        case ">":
+            return left > right;
+        case ">=":
+            return left >= right - tolerance;
+    }
+}
+export function evaluateNotationCheck(rows, check) {
+    switch (check.kind) {
+        case "row_count": {
+            const ok = compare(check.op, rows.length, check.value);
+            return { ok, detail: `|D| = ${rows.length} ${check.op} ${check.value}` };
+        }
+        case "field_type": {
+            let failures = 0;
+            let firstFailure = "";
+            for (const row of rows) {
+                const value = readPath(row, check.field);
+                if (value === null || value === undefined) {
+                    if (check.allowNull)
+                        continue;
+                    failures += 1;
+                    if (!firstFailure)
+                        firstFailure = "null";
+                    continue;
+                }
+                const okValue = check.type === "number"
+                    ? typeof value === "number" && Number.isFinite(value)
+                    : check.type === "integer"
+                        ? typeof value === "number" && Number.isInteger(value)
+                        : check.type === "boolean"
+                            ? typeof value === "boolean"
+                            : typeof value === "string";
+                if (!okValue) {
+                    failures += 1;
+                    if (!firstFailure)
+                        firstFailure = JSON.stringify(value)?.slice(0, 40) ?? "?";
+                }
+            }
+            return {
+                ok: failures === 0,
+                detail: failures === 0
+                    ? `∀r: ${check.field} : ${check.type}`
+                    : `${failures}/${rows.length} filas violan ${check.field} : ${check.type} (ej: ${firstFailure})`,
+            };
+        }
+        case "field_range": {
+            let failures = 0;
+            for (const row of rows) {
+                const value = readPath(row, check.field);
+                if (typeof value !== "number" || Number.isNaN(value))
+                    continue;
+                if (check.min !== undefined && value < check.min)
+                    failures += 1;
+                else if (check.max !== undefined && value > check.max)
+                    failures += 1;
+            }
+            const bounds = [
+                check.min !== undefined ? `≥ ${check.min}` : "",
+                check.max !== undefined ? `≤ ${check.max}` : "",
+            ]
+                .filter(Boolean)
+                .join(" ∧ ");
+            return {
+                ok: failures === 0,
+                detail: failures === 0
+                    ? `∀r: ${check.field} ${bounds}`
+                    : `${failures}/${rows.length} filas fuera de rango en ${check.field}`,
+            };
+        }
+        case "field_in": {
+            const allowed = new Set(check.values.map((value) => JSON.stringify(value)));
+            let failures = 0;
+            for (const row of rows) {
+                const value = readPath(row, check.field);
+                if (value === null || value === undefined)
+                    continue;
+                if (!allowed.has(JSON.stringify(value)))
+                    failures += 1;
+            }
+            return {
+                ok: failures === 0,
+                detail: failures === 0
+                    ? `∀r: ${check.field} ∈ {${check.values.join(", ")}}`
+                    : `${failures}/${rows.length} filas con ${check.field} fuera del conjunto`,
+            };
+        }
+        case "field_nonnull": {
+            let failures = 0;
+            for (const row of rows) {
+                const value = readPath(row, check.field);
+                if (value === null || value === undefined || value === "")
+                    failures += 1;
+            }
+            return {
+                ok: failures === 0,
+                detail: failures === 0
+                    ? `∀r: ${check.field} ≠ ∅`
+                    : `${failures}/${rows.length} filas con ${check.field} vacío`,
+            };
+        }
+        case "field_matches": {
+            let regex;
+            try {
+                regex = new RegExp(check.pattern);
+            }
+            catch {
+                return { ok: false, detail: `patrón inválido: ${check.pattern}` };
+            }
+            let failures = 0;
+            for (const row of rows) {
+                const value = readPath(row, check.field);
+                if (typeof value !== "string")
+                    continue;
+                if (!regex.test(value))
+                    failures += 1;
+            }
+            return {
+                ok: failures === 0,
+                detail: failures === 0
+                    ? `∀r: ${check.field} ~ /${check.pattern}/`
+                    : `${failures}/${rows.length} filas no matchean /${check.pattern}/`,
+            };
+        }
+        case "unique": {
+            const seen = new Set();
+            let duplicates = 0;
+            for (const row of rows) {
+                const key = JSON.stringify(check.fields.map((field) => readPath(row, field)));
+                if (seen.has(key))
+                    duplicates += 1;
+                else
+                    seen.add(key);
+            }
+            return {
+                ok: duplicates === 0,
+                detail: duplicates === 0
+                    ? `(${check.fields.join(", ")}) es clave`
+                    : `${duplicates} duplicados sobre (${check.fields.join(", ")})`,
+            };
+        }
+        case "aggregate": {
+            const values = [];
+            for (const row of rows) {
+                if (check.fn === "count" && !check.field)
+                    continue;
+                const value = readPath(row, String(check.field));
+                if (typeof value === "number" && Number.isFinite(value))
+                    values.push(value);
+            }
+            let actual;
+            switch (check.fn) {
+                case "count":
+                    actual = check.field ? values.length : rows.length;
+                    break;
+                case "sum":
+                    actual = values.reduce((total, value) => total + value, 0);
+                    break;
+                case "min":
+                    actual = values.length ? Math.min(...values) : Number.NaN;
+                    break;
+                case "max":
+                    actual = values.length ? Math.max(...values) : Number.NaN;
+                    break;
+                case "avg":
+                    actual = values.length
+                        ? values.reduce((total, value) => total + value, 0) / values.length
+                        : Number.NaN;
+                    break;
+            }
+            const tolerance = check.tolerance ?? 1e-9;
+            const ok = Number.isFinite(actual) && compare(check.op, actual, check.value, tolerance);
+            return {
+                ok,
+                detail: `${check.fn}(${check.field ?? "*"}) = ${Number.isFinite(actual) ? actual : "∅"} ${check.op} ${check.value}`,
+            };
+        }
+        case "and": {
+            const results = check.checks.map((inner) => evaluateNotationCheck(rows, inner));
+            return {
+                ok: results.every((result) => result.ok),
+                detail: results.map((result) => result.detail).join(" ∧ "),
+            };
+        }
+        case "or": {
+            const results = check.checks.map((inner) => evaluateNotationCheck(rows, inner));
+            return {
+                ok: results.some((result) => result.ok),
+                detail: results.map((result) => result.detail).join(" ∨ "),
+            };
+        }
+        case "not": {
+            const result = evaluateNotationCheck(rows, check.check);
+            return { ok: !result.ok, detail: `¬(${result.detail})` };
+        }
+        case "implies": {
+            const antecedent = evaluateNotationCheck(rows, check.if);
+            if (!antecedent.ok) {
+                return { ok: true, detail: `(${antecedent.detail}) → ⊤ (antecedente falso)` };
+            }
+            const consequent = evaluateNotationCheck(rows, check.then);
+            return {
+                ok: consequent.ok,
+                detail: `(${antecedent.detail}) → (${consequent.detail})`,
+            };
+        }
+    }
+}
+/**
+ * Annotate a notation with ADVISORY arithmetic evidence over the produced
+ * rows. Never throws, never blocks, and never changes the notation's
+ * lifecycle status — the dataset's validity is trusted, not proven here.
+ *
+ * Each predicate is reported as:
+ * - "asserted"     no arithmetic form (formal/semantic claim, trusted)
+ * - "supported"    arithmetic evidence agrees
+ * - "contradicted" arithmetic evidence disagrees (a hint to look, not a
+ *                  verdict that the dataset is wrong)
+ * A check that can't be evaluated stays "asserted" — we don't downgrade a
+ * trusted claim because of a malformed mechanical form.
+ */
+export function annotateNotationEvidence(notation, rows) {
+    const checks = [];
+    for (const predicate of notation.predicates ?? []) {
+        if (!predicate.check) {
+            checks.push({ predicateId: predicate.id, status: "asserted" });
+            continue;
+        }
+        try {
+            const outcome = evaluateNotationCheck(rows, predicate.check);
+            checks.push({
+                predicateId: predicate.id,
+                status: outcome.ok ? "supported" : "contradicted",
+                detail: outcome.detail,
+            });
+        }
+        catch (error) {
+            checks.push({
+                predicateId: predicate.id,
+                status: "asserted",
+                detail: `no evaluable: ${String(error).slice(0, 120)}`,
+            });
+        }
+    }
+    return {
+        ...notation,
+        checks,
+        evidenceAt: Date.now(),
+    };
+}

package/dist/query/queryDomain.step.js CHANGED Viewed

@@ -1,5 +1,6 @@
 import { DatasetService } from "../service.js";
 import { createDatasetId } from "../id.js";
+import { annotateNotationEvidence, inferQueryNotation } from "../notation.js";
 function normalizeRows(result) {
     if (!result || typeof result !== "object")
         return [];
@@ -61,6 +62,15 @@ export async function queryDomainStep(params) {
     const rows = normalizeRows(queryResult);
     const previewRows = rows.slice(0, 20);
     const schema = inferSchema(rows);
+    // query-backed datasets carry a fully deterministic formal notation:
+    // the set definition, its symbols and its predicates derive mechanically
+    // from the query + rows, so their arithmetic evidence is immediate
+    const notation = annotateNotationEvidence(inferQueryNotation({
+        entityNames: Object.keys(params.query ?? {}),
+        rowCount: rows.length,
+        schema,
+        explanation: params.explanation,
+    }), rows);
     const createRes = await service.createDataset({
         id: datasetId,
         title: params.title ?? "domain.query",
@@ -68,6 +78,7 @@ export async function queryDomainStep(params) {
         instructions: params.explanation,
         analysis: { explanation: params.explanation, query: params.query },
         schema,
+        notation,
         createdAt: Date.now(),
         updatedAt: Date.now(),
     });

package/dist/schema.d.ts CHANGED Viewed

@@ -11,6 +11,8 @@ declare const entities: {
         instructions: import("@instantdb/core").DataAttrDef<string, false, false, false>;
         analysis: import("@instantdb/core").DataAttrDef<any, false, false, false>;
         schema: import("@instantdb/core").DataAttrDef<any, false, false, false>;
+        /** formal notation (LaTeX + checkable predicates) describing the set */
+        notation: import("@instantdb/core").DataAttrDef<any, false, false, false>;
         calculatedTotalRows: import("@instantdb/core").DataAttrDef<number, false, false, false>;
         actualGeneratedRowCount: import("@instantdb/core").DataAttrDef<number, false, false, false>;
     }, {}, void>;

package/dist/schema.js CHANGED Viewed

@@ -13,6 +13,8 @@ const entities = {
         instructions: i.string().optional(),
         analysis: i.json().optional(),
         schema: i.json().optional(),
+        /** formal notation (LaTeX + checkable predicates) describing the set */
+        notation: i.json().optional(),
         calculatedTotalRows: i.number().optional(),
         actualGeneratedRowCount: i.number().optional(),
     }),

package/dist/service.d.ts CHANGED Viewed

@@ -47,6 +47,10 @@ export declare class DatasetService {
         schema: any;
         status?: string;
     }): Promise<ServiceResult<void>>;
+    updateDatasetNotation(params: {
+        datasetId: string;
+        notation: Record<string, any>;
+    }): Promise<ServiceResult<void>>;
     updateDatasetStatus(params: {
         datasetId: string;
         status: string;

package/dist/service.js CHANGED Viewed

@@ -214,6 +214,24 @@ export class DatasetService {
             return { ok: false, error: message };
         }
     }
+    async updateDatasetNotation(params) {
+        try {
+            const resolved = await this.resolveDatasetEntityId(params.datasetId);
+            if (!resolved.ok)
+                return resolved;
+            await this.db.transact([
+                this.db.tx.dataset_datasets[resolved.data].update({
+                    notation: params.notation,
+                    updatedAt: Date.now(),
+                })
+            ]);
+            return { ok: true, data: undefined };
+        }
+        catch (error) {
+            const message = error instanceof Error ? error.message : String(error);
+            return { ok: false, error: message };
+        }
+    }
     async updateDatasetStatus(params) {
         try {
             const resolved = await this.resolveDatasetEntityId(params.datasetId);

package/dist/transform/prompts.js CHANGED Viewed

@@ -102,11 +102,15 @@ function buildInstructions(context) {
         .ele("Action").txt(`Review ContextResources and any InputPreviews to understand current record structures, evidence, fields, shapes and edge cases. ${multipleInputsNote}`).up()
         .ele("Note").txt("ContextResources DescriptorJson may include inline text, metadata, previewRows, or other visible evidence. Treat that visible content as already available context. Do not use executeCommand only to reread it.").up()
         .up()
-        .ele("Step", { number: "2", name: "Plan Mapping" })
+        .ele("Step", { number: "2", name: "Define the Output Dataset (PLAN FIRST)" })
+        .ele("Action").txt("Call defineNotation with the formal definition of the OUTPUT dataset as a set derived from the input sets: e.g. D = \\pi_{fields}(\\sigma_{condition}(A \\bowtie B)) or set-builder with quantifiers, in LaTeX. Declare the input sets, bound variables and the predicates the output set satisfies.").up()
+        .ele("Note").txt("The definition and the materialization (the transform code + output rows) are TWO CO-EQUAL FACES of the dataset; author the definition FIRST as the PLAN: it states which sets you draw from, how they combine (join, filter, project, aggregate) and which invariants the output keeps (e.g. totals preserved). The definition is a formal proposition we trust — predicates may be semantic. Only for purely arithmetic invariants you MAY add a checkJson for optional advisory evidence. REFINE the definition whenever inspection of the inputs reveals new sets, variables or corrections, and call defineNotation with final=true just before completing — as the RESULT it describes the produced output; any arithmetic predicates then get advisory evidence (never a verdict).").up()
+        .up()
+        .ele("Step", { number: "3", name: "Plan Mapping" })
         .ele("Action").txt("Plan a deterministic mapping from input data fields to the output schema fields (normalize names, types, and formats).").up()
         .ele("Note").txt("If fields are missing, set defaults; if types differ, coerce consistently. When working with multiple inputs, decide how to combine or relate them. Output field names must remain exactly as declared by the output schema.").up()
         .up()
-        .ele("Step", { number: "3", name: "Transform" })
+        .ele("Step", { number: "4", name: "Transform" })
         .ele("Action").txt("For single-object output, use completeObject with the final object. For row output, use replaceRows with the final rows. Use executeCommand only when command execution is necessary, not merely convenient.").up()
         .ele("Requirement").txt("Do not call completeObject until you have constructed the complete data object. completeObject requires data; a summary-only call is invalid and wastes a model iteration.").up()
         .ele("Requirement").txt("Command execution is necessary only when the final output cannot be produced directly from the provided context, resource descriptors, or previews, and requires running code to inspect, parse, aggregate, join, or compute over files/resources.").up()
@@ -120,12 +124,13 @@ function buildInstructions(context) {
         .ele("Requirement").txt("Do not print large data to stdout; only progress and summaries.").up()
         .ele("Requirement").txt("Do not install packages, download dependencies, or access the network from executeCommand. Use only the available runtime and standard library unless a dependency is already present.").up()
         .up()
-        .ele("Step", { number: "4", name: "Validate and Complete" })
-        .ele("Action").txt("When using completeObject or replaceRows, no separate completeDataset call is needed. When using executeCommand, call completeDataset to validate against the output schema and mark as completed.").up()
+        .ele("Step", { number: "5", name: "Validate and Complete" })
+        .ele("Action").txt("Call defineNotation with final=true (the definition as RESULT, matching the produced output), then: when using completeObject or replaceRows, no separate completeDataset call is needed. When using executeCommand, call completeDataset to validate against the output schema and mark as completed.").up()
         .ele("Behavior").txt("If any completion tool returns success:false, inspect validation details, repair the output, and call the appropriate completion tool again. Do not stop until a completion tool returns success:true.").up()
         .up()
         .up()
         .ele("Rules")
+        .ele("Rule").txt("The formal definition (defineNotation) and the materialization (transform code + output rows) are co-equal faces of the dataset: author the definition first as the PLAN, refine it on every discovery, finalize it as the RESULT before completing.").up()
         .ele("Rule").txt("Output must strictly match the output schema for each record in data.").up()
         .ele("Rule").txt("OutputSchema property names are authoritative. Field names are a technical contract; only field values may preserve input language.").up()
         .ele("Rule").txt("Use the cheapest correct tool. completeObject and replaceRows are low-cost completion tools. executeCommand is a high-cost computation tool and requires an explicit commandDescription.").up()

package/dist/transform/transform-dataset.agent.js CHANGED Viewed

@@ -4,6 +4,7 @@ import { createCompleteDatasetTool, didCompleteDatasetSucceed, getDatasetFatalFa
 import { datasetUpdateSchemaStep } from "../dataset/steps.js";
 import { getDatasetOutputPath } from "../datasetFiles.js";
 import { createExecuteCommandTool } from "../executeCommand.tool.js";
+import { createDefineNotationTool } from "../defineNotation.tool.js";
 import { createCompleteObjectTool, createReplaceRowsTool, } from "../writeDatasetRows.tool.js";
 import { buildTransformDatasetPromptStep, } from "./transform-dataset.steps.js";
 import { createDatasetId } from "../id.js";
@@ -136,6 +137,10 @@ function createTransformDatasetContextDefinition(params) {
                 sandboxId,
                 runtime,
             }),
+            defineNotation: createDefineNotationTool({
+                datasetId,
+                runtime,
+            }),
         };
     })
         .shouldContinue(({ reactionEvent }) => {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@ekairos/dataset",
-  "version": "1.22.37",
+  "version": "1.22.38",
   "description": "Pulzar Dataset Tools",
   "type": "module",
   "main": "dist/index.js",
@@ -65,9 +65,9 @@
     "test:ai-sdk:instant": "vitest run -c vitest.codex.config.mts src/tests/materializeDataset.ai-sdk.instant.test.ts"
   },
   "dependencies": {
-    "@ekairos/domain": "^1.22.37",
-    "@ekairos/events": "^1.22.37",
-    "@ekairos/sandbox": "^1.22.37",
+    "@ekairos/domain": "^1.22.38",
+    "@ekairos/events": "^1.22.38",
+    "@ekairos/sandbox": "^1.22.38",
     "@instantdb/admin": "0.22.158",
     "@instantdb/core": "0.22.142",
     "ai": "^5.0.44",