npm - @ekairos/dataset - Versions diffs - 1.22.92-beta.development.0 → 1.22.94-beta.development.0 - Mend

@ekairos/dataset 1.22.92-beta.development.0 → 1.22.94-beta.development.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

package/dist/builder/persistence.js +16 -0
package/dist/completeDataset.steps.js +48 -0
package/dist/file/file-dataset.agent.js +5 -0
package/dist/file/prompts.js +18 -5
package/dist/index.d.ts +1 -0
package/dist/index.js +1 -0
package/dist/notation.d.ts +163 -0
package/dist/notation.js +408 -0
package/dist/proposeNotation.tool.d.ts +42 -0
package/dist/proposeNotation.tool.js +142 -0
package/dist/query/queryDomain.step.js +11 -0
package/dist/schema.d.ts +2 -0
package/dist/schema.js +2 -0
package/dist/service.d.ts +4 -0
package/dist/service.js +18 -0
package/dist/transform/prompts.js +9 -4
package/dist/transform/transform-dataset.agent.js +5 -0
package/package.json +4 -4

package/dist/builder/persistence.js CHANGED Viewed

@@ -1,5 +1,6 @@
 import { DatasetService } from "../service.js";
 import { datasetDomain } from "../schema.js";
+import { verifyDatasetNotation } from "../notation.js";
 import { datasetGetByIdStep, datasetPreviewRowsStep, datasetReadOneStep, datasetReadRowsStep, } from "../dataset/steps.js";
 import { inferDatasetSchema, validateRows } from "./schemaInference.js";
 import { rowsToJsonl } from "./rows.js";
@@ -77,6 +78,21 @@ export async function materializeRowsToDataset(runtime, params) {
     if (!statusResult.ok) {
         throw new Error(statusResult.error);
     }
+    // verify the latest formal notation (if any was proposed) against the
+    // materialized rows — informative only, never blocks the build
+    try {
+        const existing = await service.getDatasetById(params.datasetId);
+        const notation = (existing.ok ? existing.data?.notation : null);
+        if (notation && Array.isArray(notation.predicates) && notation.predicates.length > 0) {
+            await service.updateDatasetNotation({
+                datasetId: params.datasetId,
+                notation: verifyDatasetNotation(notation, params.rows),
+            });
+        }
+    }
+    catch {
+        // notation verification must never affect the build result
+    }
     return params.datasetId;
 }
 export async function uploadInlineTextResource(runtime, datasetId, resource) {

package/dist/completeDataset.steps.js CHANGED Viewed

@@ -1,5 +1,6 @@
 import Ajv from "ajv";
 import { getDatasetOutputPath } from "./datasetFiles.js";
+import { verifyDatasetNotation } from "./notation.js";
 import { DatasetService } from "./service.js";
 import { getDatasetRuntimeDb } from "./dataset/steps.js";
 import { readDatasetSandboxFileStep, readDatasetSandboxTextFileStep, runDatasetSandboxCommandStep, } from "./sandbox/steps.js";
@@ -176,6 +177,19 @@ export async function persistDatasetStep({ runtime, datasetId, sandboxId, summar
     }
     console.log(`[Dataset ${datasetId}] Dataset marked as COMPLETED (${totalValidRows} valid rows)`);
     console.log(`[Dataset ${datasetId}] ========================================`);
+    // Formal-notation verification: arithmetic checks of the latest notation
+    // against the produced rows. Informative only — a failure here never
+    // affects the dataset completion result.
+    try {
+        await verifyNotationAgainstJsonl({
+            service,
+            datasetId,
+            jsonlBase64: fileRead.contentBase64,
+        });
+    }
+    catch (error) {
+        console.error(`[Dataset ${datasetId}] notation verification skipped:`, error instanceof Error ? error.message : String(error));
+    }
     return {
         success: true,
         status: "completed",
@@ -187,6 +201,40 @@ export async function persistDatasetStep({ runtime, datasetId, sandboxId, summar
         dataFileId: uploadResult.data.fileId,
     };
 }
+const NOTATION_VERIFY_MAX_ROWS = 50000;
+async function verifyNotationAgainstJsonl(params) {
+    const existing = await params.service.getDatasetById(params.datasetId);
+    const notation = (existing.ok ? existing.data?.notation : null);
+    if (!notation || !Array.isArray(notation.predicates) || notation.predicates.length === 0) {
+        return;
+    }
+    const rows = [];
+    const content = Buffer.from(params.jsonlBase64, "base64").toString("utf-8");
+    for (const line of content.split("\n")) {
+        const trimmed = line.trim();
+        if (!trimmed)
+            continue;
+        try {
+            const parsed = JSON.parse(trimmed);
+            if (parsed && parsed.type === "row") {
+                rows.push(parsed.data);
+            }
+        }
+        catch {
+            // malformed lines were already handled by schema validation
+        }
+        if (rows.length >= NOTATION_VERIFY_MAX_ROWS)
+            break;
+    }
+    const verified = verifyDatasetNotation(notation, rows);
+    await params.service.updateDatasetNotation({
+        datasetId: params.datasetId,
+        notation: verified,
+    });
+    const failed = (verified.checks ?? []).filter((check) => check.status === "failed");
+    console.log(`[Dataset ${params.datasetId}] notation v${verified.version} ${verified.status}` +
+        (failed.length ? ` (${failed.length} predicados violados)` : ""));
+}
 function resolveExecutionStoragePath(outputPath, datasetId) {
     const normalized = String(outputPath ?? "").replace(/\\/g, "/");
     const marker = "/tmp/ekairos/contexts/";

package/dist/file/file-dataset.agent.js CHANGED Viewed

@@ -3,6 +3,7 @@ import { createClearDatasetTool } from "../clearDataset.tool.js";
 import { createCompleteDatasetTool, didCompleteDatasetSucceed, getDatasetFatalFailure, } from "../completeDataset.tool.js";
 import { datasetGetByIdStep } from "../dataset/steps.js";
 import { createExecuteCommandTool } from "../executeCommand.tool.js";
+import { createProposeNotationTool } from "../proposeNotation.tool.js";
 import { createGenerateSchemaTool } from "./generateSchema.tool.js";
 import { buildFileDatasetPromptStep, initializeFileParseSandboxStep, } from "./file-dataset.steps.js";
 import { createDatasetId } from "../id.js";
@@ -151,6 +152,10 @@ function createFileParseContextDefinition(params) {
                 sandboxId,
                 runtime,
             }),
+            proposeNotation: createProposeNotationTool({
+                datasetId,
+                runtime,
+            }),
         };
         if (!existingSchema) {
             actions.generateSchema = createGenerateSchemaTool({

package/dist/file/prompts.js CHANGED Viewed

@@ -238,9 +238,21 @@ function buildInstructions(context) {
         .ele("Action").txt("Review the FilePreview section in Context to understand the file structure").up()
         .ele("Note").txt("FilePreview contains: TotalRows (total data rows), Metadata (file properties with JSON output), Head (first N raw file lines), Tail (last N lines if present), Mid (middle sample for large files). Each section shows Description, Script (full Python code), Command, Stdout (raw content), Stderr. This allows you to understand the exact file format.").up()
         .up();
+    xml = xml
+        .ele("Step", { number: "2", name: "Propose Formal Notation (PLAN FIRST)" })
+        .ele("Action").txt("Call proposeNotation with the INITIAL formal definition of the dataset as a set, derived from the file preview: D = { r | r ∈ File ∧ <constraints> } in LaTeX, the symbols it binds (sets, variables, functions) and the predicates every row will satisfy").up()
+        .ele("Requirements")
+        .ele("Requirement").txt("The notation is your PLANNING artifact: it comes BEFORE the schema and BEFORE any parsing code. The LaTeX that explains the dataset matters more than the code that produces it").up()
+        .ele("Requirement").txt("Use set-builder notation, quantifiers and arithmetic in LaTeX (e.g. D = \\{(c, q, p) \\mid q \\in \\mathbb{Z}^{+},\\; p \\in \\mathbb{R}_{\\geq 0}\\})").up()
+        .ele("Requirement").txt("Declare every discovered set and variable as a symbol with a one-line meaning").up()
+        .ele("Requirement").txt("Give predicates a machine-checkable checkJson whenever the claim is arithmetic (row counts, field types, ranges, uniqueness, aggregates); leave semantic-only claims without checkJson").up()
+        .ele("Requirement").txt("ITERATE: every time the analysis discovers a new set, variable, constraint or correction (new columns, unexpected types, excluded sections), call proposeNotation again with the refined notation and the reason. The notation is not definitive — discovery is the point").up()
+        .ele("Requirement").txt("Before calling completeDataset, call proposeNotation one last time with final=true so the notation describes EXACTLY the dataset you produced; its checkable predicates will be verified arithmetically against the rows").up()
+        .up()
+        .up();
     if (hasProvidedSchema) {
         xml = xml
-            .ele("Step", { number: "2", name: "Use Provided Schema" })
+            .ele("Step", { number: "3", name: "Use Provided Schema" })
             .ele("Action").txt("Use the provided schema as the output contract for every row in output.jsonl").up()
             .ele("Requirements")
             .ele("Requirement").txt("Every output row must conform exactly to the provided schema").up()
@@ -255,7 +267,7 @@ function buildInstructions(context) {
     }
     else {
         xml = xml
-            .ele("Step", { number: "2", name: "Generate JSON Schema" })
+            .ele("Step", { number: "3", name: "Generate JSON Schema" })
             .ele("Action").txt("Call generateSchema to create a JSON Schema for a SINGLE DATA RECORD (one row of data)").up()
             .ele("Requirements")
             .ele("Requirement").txt("Schema describes ONE DATA RECORD structure only (type: object, not array)").up()
@@ -267,7 +279,7 @@ function buildInstructions(context) {
             .up();
     }
     xml = xml
-        .ele("Step", { number: "3", name: "Generate Dataset JSONL" })
+        .ele("Step", { number: "4", name: "Generate Dataset JSONL" })
         .ele("Action").txt(`Use executeCommand to parse the file and generate output.jsonl in the dataset workstation`).up()
         .ele("Requirements")
         .ele("Requirement").txt("Parse ALL data rows/records from the file (exclude header sections and metadata)").up()
@@ -279,12 +291,13 @@ function buildInstructions(context) {
         .ele("Requirement").txt("Use descriptive scriptName in snake_case (e.g., 'parse_csv_to_jsonl')").up()
         .up()
         .up()
-        .ele("Step", { number: "4", name: "Complete and Validate" })
-        .ele("Action").txt("Call completeDataset to validate the dataset").up()
+        .ele("Step", { number: "5", name: "Complete and Validate" })
+        .ele("Action").txt("Call proposeNotation with final=true (refined to match the produced rows), then call completeDataset to validate the dataset").up()
         .ele("Behavior").txt("Validates that output.jsonl exists and all records conform to the schema stored in database. Returns success:false with validation details if validation fails. If validation fails, inspect validation errors, rewrite output.jsonl, and call completeDataset again. Do not stop until completeDataset returns success:true.").up()
         .up()
         .up()
         .ele("Rules")
+        .ele("Rule").txt("The formal notation (proposeNotation) is the planning artifact: propose it first, iterate it on every discovery, finalize it before completion. The LaTeX explains the dataset; the code merely produces it").up()
         .ele("Rule").txt("Schema defines ONE DATA RECORD structure (not array, not header)").up()
         .ele("Rule").txt("Schema property names are authoritative. Never translate or rename keys such as itemName, quantity, or unit into the input language").up()
         .ele("Rule").txt("Original/input language applies to extracted values only, not to JSON object keys").up()

package/dist/index.d.ts CHANGED Viewed

@@ -2,6 +2,7 @@ export * from "./dataset.js";
 export * from "./contextWorkspace.js";
 export * from "./domain.js";
 export * from "./materializeDataset.tool.js";
+export * from "./notation.js";
 export * from "./schema.js";
 export * from "./service.js";
 export { registerFileParseContext } from "./file/file-dataset.agent.js";

package/dist/index.js CHANGED Viewed

@@ -2,6 +2,7 @@ export * from "./dataset.js";
 export * from "./contextWorkspace.js";
 export * from "./domain.js";
 export * from "./materializeDataset.tool.js";
+export * from "./notation.js";
 export * from "./schema.js";
 export * from "./service.js";
 export { registerFileParseContext } from "./file/file-dataset.agent.js";

package/dist/notation.d.ts ADDED Viewed

@@ -0,0 +1,163 @@
+/**
+ * Formal notation for datasets.
+ *
+ * A dataset is the materialization of a set defined by formal notation:
+ * LaTeX (set-builder, relational algebra, quantified predicates) that
+ * EXPLAINS the data — what sets it draws from, what variables it binds,
+ * what constraints every row satisfies. The notation is the planning
+ * artifact: it starts as a proposal from the first look at the resources
+ * and is ITERATED as the analysis discovers new sets, variables and
+ * invariants. The final notation describes the produced dataset and its
+ * machine-checkable predicates are verified with plain arithmetic over
+ * the actual rows (propositional combinations supported).
+ *
+ * Verification is informative, never blocking: a dataset completes the
+ * same way it always did; the notation carries its own verified/violated
+ * state alongside.
+ */
+export type DatasetNotationSymbolKind = "set" | "variable" | "function" | "constant" | "predicate";
+export type DatasetNotationSymbol = {
+    /** plain identifier, e.g. "D", "w", "Orders" */
+    name: string;
+    /** LaTeX for the symbol, e.g. "\\mathcal{D}" (defaults to the name) */
+    latex?: string;
+    kind: DatasetNotationSymbolKind;
+    description: string;
+};
+export type NotationCmpOp = "=" | "!=" | "<" | "<=" | ">" | ">=";
+/**
+ * Machine-checkable claims about the dataset, evaluated with plain
+ * arithmetic over the rows. Field access supports dot-paths into nested
+ * records ("company.taxId"). Leaf checks are dataset-level propositions;
+ * and/or/not/implies compose them propositionally.
+ */
+export type NotationCheck = {
+    kind: "row_count";
+    op: NotationCmpOp;
+    value: number;
+} | {
+    kind: "field_type";
+    field: string;
+    type: "number" | "integer" | "string" | "boolean";
+    allowNull?: boolean;
+} | {
+    kind: "field_range";
+    field: string;
+    min?: number;
+    max?: number;
+} | {
+    kind: "field_in";
+    field: string;
+    values: Array<string | number | boolean>;
+} | {
+    kind: "field_nonnull";
+    field: string;
+} | {
+    kind: "field_matches";
+    field: string;
+    pattern: string;
+} | {
+    kind: "unique";
+    fields: string[];
+} | {
+    kind: "aggregate";
+    fn: "sum" | "count" | "min" | "max" | "avg";
+    /** omit for fn = "count" */
+    field?: string;
+    op: NotationCmpOp;
+    value: number;
+    /** absolute tolerance for float comparison (default 1e-9) */
+    tolerance?: number;
+} | {
+    kind: "and";
+    checks: NotationCheck[];
+} | {
+    kind: "or";
+    checks: NotationCheck[];
+} | {
+    kind: "not";
+    check: NotationCheck;
+} | {
+    kind: "implies";
+    if: NotationCheck;
+    then: NotationCheck;
+};
+export type DatasetNotationPredicate = {
+    /** stable id within the notation, e.g. "p1", "rowCount" */
+    id: string;
+    description: string;
+    /** the claim in LaTeX, e.g. "\\forall r \\in D:\\; r.amount > 0" */
+    latex: string;
+    /** machine-checkable form; absent = semantic-only claim (not verified) */
+    check?: NotationCheck;
+};
+export type DatasetNotationCheckResult = {
+    predicateId: string;
+    status: "passed" | "failed" | "skipped";
+    detail?: string;
+};
+export type DatasetNotationRevision = {
+    version: number;
+    latex: string;
+    /** why this revision happened — the discovery that triggered it */
+    reason: string;
+    at: number;
+};
+export type DatasetNotationStatus = "proposed" | "refined" | "final" | "verified" | "violated";
+export type DatasetNotation = {
+    version: number;
+    status: DatasetNotationStatus;
+    /** the main definition: the dataset as a set, in LaTeX */
+    latex: string;
+    symbols: DatasetNotationSymbol[];
+    predicates: DatasetNotationPredicate[];
+    checks?: DatasetNotationCheckResult[];
+    verifiedAt?: number;
+    history: DatasetNotationRevision[];
+};
+export type NotationRevisionInput = {
+    latex: string;
+    symbols?: DatasetNotationSymbol[];
+    predicates?: DatasetNotationPredicate[];
+    reason: string;
+    /** "final" marks the notation as describing the produced dataset */
+    final?: boolean;
+};
+/**
+ * Iterate the notation: every revision bumps the version and appends to
+ * history, so the discovery trail (sets/variables found along the way)
+ * stays visible.
+ */
+export declare function reviseDatasetNotation(previous: DatasetNotation | null | undefined, input: NotationRevisionInput): DatasetNotation;
+/** escape an identifier for use inside \text{} */
+export declare function latexIdentifier(name: string): string;
+type JsonSchemaLike = {
+    title?: string;
+    schema?: Record<string, any>;
+    properties?: Record<string, any>;
+};
+/**
+ * A query-backed dataset has a complete deterministic description: the
+ * dataset is the image of a known query over a known domain. No model is
+ * involved — the notation and its checkable predicates derive mechanically
+ * from the query, the inferred schema and the produced row count.
+ */
+export declare function inferQueryNotation(params: {
+    entityNames: string[];
+    rowCount: number;
+    schema?: JsonSchemaLike | null;
+    explanation?: string;
+}): DatasetNotation;
+type CheckOutcome = {
+    ok: boolean;
+    detail: string;
+};
+export declare function evaluateNotationCheck(rows: any[], check: NotationCheck): CheckOutcome;
+/**
+ * Verify a notation against produced rows. Pure arithmetic — never throws.
+ * Predicates without a machine-checkable form are reported as "skipped"
+ * (they remain semantic claims). Returns the notation with check results
+ * and a verified/violated status.
+ */
+export declare function verifyDatasetNotation(notation: DatasetNotation, rows: any[]): DatasetNotation;
+export {};

package/dist/notation.js ADDED Viewed

@@ -0,0 +1,408 @@
+/**
+ * Formal notation for datasets.
+ *
+ * A dataset is the materialization of a set defined by formal notation:
+ * LaTeX (set-builder, relational algebra, quantified predicates) that
+ * EXPLAINS the data — what sets it draws from, what variables it binds,
+ * what constraints every row satisfies. The notation is the planning
+ * artifact: it starts as a proposal from the first look at the resources
+ * and is ITERATED as the analysis discovers new sets, variables and
+ * invariants. The final notation describes the produced dataset and its
+ * machine-checkable predicates are verified with plain arithmetic over
+ * the actual rows (propositional combinations supported).
+ *
+ * Verification is informative, never blocking: a dataset completes the
+ * same way it always did; the notation carries its own verified/violated
+ * state alongside.
+ */
+/**
+ * Iterate the notation: every revision bumps the version and appends to
+ * history, so the discovery trail (sets/variables found along the way)
+ * stays visible.
+ */
+export function reviseDatasetNotation(previous, input) {
+    const version = (previous?.version ?? 0) + 1;
+    const revision = {
+        version,
+        latex: input.latex,
+        reason: input.reason,
+        at: Date.now(),
+    };
+    return {
+        version,
+        status: input.final ? "final" : previous ? "refined" : "proposed",
+        latex: input.latex,
+        symbols: input.symbols ?? previous?.symbols ?? [],
+        predicates: input.predicates ?? previous?.predicates ?? [],
+        history: [...(previous?.history ?? []), revision],
+    };
+}
+/* ── LaTeX helpers ──────────────────────────────────────────────── */
+/** escape an identifier for use inside \text{} */
+export function latexIdentifier(name) {
+    return `\\text{${String(name).replace(/([#$%&_{}])/g, "\\$1")}}`;
+}
+function latexFieldType(type) {
+    if (type === "number")
+        return "\\mathbb{R}";
+    if (type === "integer")
+        return "\\mathbb{Z}";
+    if (type === "boolean")
+        return "\\{\\top,\\bot\\}";
+    return "\\Sigma^{*}";
+}
+const JSON_SCHEMA_KEYWORDS = new Set([
+    "type",
+    "title",
+    "description",
+    "required",
+    "items",
+    "additionalProperties",
+]);
+function schemaProperties(schema) {
+    const root = (schema?.schema ?? schema ?? {});
+    if (root.properties && typeof root.properties === "object") {
+        return root.properties;
+    }
+    // flat shape from query inference: { fieldName: "type", ... }
+    const flat = {};
+    for (const [key, value] of Object.entries(root)) {
+        if (JSON_SCHEMA_KEYWORDS.has(key))
+            continue;
+        if (typeof value === "string")
+            flat[key] = value;
+        else if (value && typeof value === "object" && typeof value.type === "string") {
+            flat[key] = value;
+        }
+    }
+    return flat;
+}
+/**
+ * A query-backed dataset has a complete deterministic description: the
+ * dataset is the image of a known query over a known domain. No model is
+ * involved — the notation and its checkable predicates derive mechanically
+ * from the query, the inferred schema and the produced row count.
+ */
+export function inferQueryNotation(params) {
+    const sources = params.entityNames.length ? params.entityNames : ["Domain"];
+    const sourceSymbols = sources.map((name) => ({
+        name,
+        latex: latexIdentifier(name),
+        kind: "set",
+        description: `Entidad de origen ${name}`,
+    }));
+    const union = sources.map((name) => latexIdentifier(name)).join(" \\cup ");
+    const latex = `\\mathcal{D} = \\left\\{\\, r \\;\\middle|\\; r \\in Q\\!\\left(${union}\\right) \\right\\}`;
+    const properties = schemaProperties(params.schema);
+    const predicates = [
+        {
+            id: "cardinality",
+            description: `El dataset tiene exactamente ${params.rowCount} filas`,
+            latex: `|\\mathcal{D}| = ${params.rowCount}`,
+            check: { kind: "row_count", op: "=", value: params.rowCount },
+        },
+    ];
+    for (const [field, raw] of Object.entries(properties)) {
+        const type = typeof raw === "string" ? raw : String(raw?.type ?? "");
+        if (!["number", "integer", "boolean", "string"].includes(type))
+            continue;
+        predicates.push({
+            id: `type_${field}`,
+            description: `Toda fila tiene ${field} de tipo ${type} (o nulo)`,
+            latex: `\\forall r \\in \\mathcal{D}:\\; r.${latexIdentifier(field)} \\in ${latexFieldType(type)} \\cup \\{\\varnothing\\}`,
+            check: {
+                kind: "field_type",
+                field,
+                type: type,
+                allowNull: true,
+            },
+        });
+    }
+    return reviseDatasetNotation(null, {
+        latex,
+        symbols: [
+            {
+                name: "D",
+                latex: "\\mathcal{D}",
+                kind: "set",
+                description: params.explanation?.trim() || "Dataset materializado",
+            },
+            {
+                name: "Q",
+                kind: "function",
+                description: "Consulta InstaQL aplicada al dominio",
+            },
+            ...sourceSymbols,
+        ],
+        predicates,
+        reason: "Notación determinística derivada de la consulta al dominio",
+        final: true,
+    });
+}
+/* ── arithmetic evaluation ──────────────────────────────────────── */
+function readPath(row, path) {
+    let current = row;
+    for (const segment of String(path).split(".")) {
+        if (current === null || current === undefined)
+            return undefined;
+        current = current[segment];
+    }
+    return current;
+}
+function compare(op, left, right, tolerance = 0) {
+    switch (op) {
+        case "=":
+            return Math.abs(left - right) <= tolerance;
+        case "!=":
+            return Math.abs(left - right) > tolerance;
+        case "<":
+            return left < right;
+        case "<=":
+            return left <= right + tolerance;
+        case ">":
+            return left > right;
+        case ">=":
+            return left >= right - tolerance;
+    }
+}
+export function evaluateNotationCheck(rows, check) {
+    switch (check.kind) {
+        case "row_count": {
+            const ok = compare(check.op, rows.length, check.value);
+            return { ok, detail: `|D| = ${rows.length} ${check.op} ${check.value}` };
+        }
+        case "field_type": {
+            let failures = 0;
+            let firstFailure = "";
+            for (const row of rows) {
+                const value = readPath(row, check.field);
+                if (value === null || value === undefined) {
+                    if (check.allowNull)
+                        continue;
+                    failures += 1;
+                    if (!firstFailure)
+                        firstFailure = "null";
+                    continue;
+                }
+                const okValue = check.type === "number"
+                    ? typeof value === "number" && Number.isFinite(value)
+                    : check.type === "integer"
+                        ? typeof value === "number" && Number.isInteger(value)
+                        : check.type === "boolean"
+                            ? typeof value === "boolean"
+                            : typeof value === "string";
+                if (!okValue) {
+                    failures += 1;
+                    if (!firstFailure)
+                        firstFailure = JSON.stringify(value)?.slice(0, 40) ?? "?";
+                }
+            }
+            return {
+                ok: failures === 0,
+                detail: failures === 0
+                    ? `∀r: ${check.field} : ${check.type}`
+                    : `${failures}/${rows.length} filas violan ${check.field} : ${check.type} (ej: ${firstFailure})`,
+            };
+        }
+        case "field_range": {
+            let failures = 0;
+            for (const row of rows) {
+                const value = readPath(row, check.field);
+                if (typeof value !== "number" || Number.isNaN(value))
+                    continue;
+                if (check.min !== undefined && value < check.min)
+                    failures += 1;
+                else if (check.max !== undefined && value > check.max)
+                    failures += 1;
+            }
+            const bounds = [
+                check.min !== undefined ? `≥ ${check.min}` : "",
+                check.max !== undefined ? `≤ ${check.max}` : "",
+            ]
+                .filter(Boolean)
+                .join(" ∧ ");
+            return {
+                ok: failures === 0,
+                detail: failures === 0
+                    ? `∀r: ${check.field} ${bounds}`
+                    : `${failures}/${rows.length} filas fuera de rango en ${check.field}`,
+            };
+        }
+        case "field_in": {
+            const allowed = new Set(check.values.map((value) => JSON.stringify(value)));
+            let failures = 0;
+            for (const row of rows) {
+                const value = readPath(row, check.field);
+                if (value === null || value === undefined)
+                    continue;
+                if (!allowed.has(JSON.stringify(value)))
+                    failures += 1;
+            }
+            return {
+                ok: failures === 0,
+                detail: failures === 0
+                    ? `∀r: ${check.field} ∈ {${check.values.join(", ")}}`
+                    : `${failures}/${rows.length} filas con ${check.field} fuera del conjunto`,
+            };
+        }
+        case "field_nonnull": {
+            let failures = 0;
+            for (const row of rows) {
+                const value = readPath(row, check.field);
+                if (value === null || value === undefined || value === "")
+                    failures += 1;
+            }
+            return {
+                ok: failures === 0,
+                detail: failures === 0
+                    ? `∀r: ${check.field} ≠ ∅`
+                    : `${failures}/${rows.length} filas con ${check.field} vacío`,
+            };
+        }
+        case "field_matches": {
+            let regex;
+            try {
+                regex = new RegExp(check.pattern);
+            }
+            catch {
+                return { ok: false, detail: `patrón inválido: ${check.pattern}` };
+            }
+            let failures = 0;
+            for (const row of rows) {
+                const value = readPath(row, check.field);
+                if (typeof value !== "string")
+                    continue;
+                if (!regex.test(value))
+                    failures += 1;
+            }
+            return {
+                ok: failures === 0,
+                detail: failures === 0
+                    ? `∀r: ${check.field} ~ /${check.pattern}/`
+                    : `${failures}/${rows.length} filas no matchean /${check.pattern}/`,
+            };
+        }
+        case "unique": {
+            const seen = new Set();
+            let duplicates = 0;
+            for (const row of rows) {
+                const key = JSON.stringify(check.fields.map((field) => readPath(row, field)));
+                if (seen.has(key))
+                    duplicates += 1;
+                else
+                    seen.add(key);
+            }
+            return {
+                ok: duplicates === 0,
+                detail: duplicates === 0
+                    ? `(${check.fields.join(", ")}) es clave`
+                    : `${duplicates} duplicados sobre (${check.fields.join(", ")})`,
+            };
+        }
+        case "aggregate": {
+            const values = [];
+            for (const row of rows) {
+                if (check.fn === "count" && !check.field)
+                    continue;
+                const value = readPath(row, String(check.field));
+                if (typeof value === "number" && Number.isFinite(value))
+                    values.push(value);
+            }
+            let actual;
+            switch (check.fn) {
+                case "count":
+                    actual = check.field ? values.length : rows.length;
+                    break;
+                case "sum":
+                    actual = values.reduce((total, value) => total + value, 0);
+                    break;
+                case "min":
+                    actual = values.length ? Math.min(...values) : Number.NaN;
+                    break;
+                case "max":
+                    actual = values.length ? Math.max(...values) : Number.NaN;
+                    break;
+                case "avg":
+                    actual = values.length
+                        ? values.reduce((total, value) => total + value, 0) / values.length
+                        : Number.NaN;
+                    break;
+            }
+            const tolerance = check.tolerance ?? 1e-9;
+            const ok = Number.isFinite(actual) && compare(check.op, actual, check.value, tolerance);
+            return {
+                ok,
+                detail: `${check.fn}(${check.field ?? "*"}) = ${Number.isFinite(actual) ? actual : "∅"} ${check.op} ${check.value}`,
+            };
+        }
+        case "and": {
+            const results = check.checks.map((inner) => evaluateNotationCheck(rows, inner));
+            return {
+                ok: results.every((result) => result.ok),
+                detail: results.map((result) => result.detail).join(" ∧ "),
+            };
+        }
+        case "or": {
+            const results = check.checks.map((inner) => evaluateNotationCheck(rows, inner));
+            return {
+                ok: results.some((result) => result.ok),
+                detail: results.map((result) => result.detail).join(" ∨ "),
+            };
+        }
+        case "not": {
+            const result = evaluateNotationCheck(rows, check.check);
+            return { ok: !result.ok, detail: `¬(${result.detail})` };
+        }
+        case "implies": {
+            const antecedent = evaluateNotationCheck(rows, check.if);
+            if (!antecedent.ok) {
+                return { ok: true, detail: `(${antecedent.detail}) → ⊤ (antecedente falso)` };
+            }
+            const consequent = evaluateNotationCheck(rows, check.then);
+            return {
+                ok: consequent.ok,
+                detail: `(${antecedent.detail}) → (${consequent.detail})`,
+            };
+        }
+    }
+}
+/**
+ * Verify a notation against produced rows. Pure arithmetic — never throws.
+ * Predicates without a machine-checkable form are reported as "skipped"
+ * (they remain semantic claims). Returns the notation with check results
+ * and a verified/violated status.
+ */
+export function verifyDatasetNotation(notation, rows) {
+    const checks = [];
+    let failed = 0;
+    for (const predicate of notation.predicates ?? []) {
+        if (!predicate.check) {
+            checks.push({ predicateId: predicate.id, status: "skipped" });
+            continue;
+        }
+        try {
+            const outcome = evaluateNotationCheck(rows, predicate.check);
+            checks.push({
+                predicateId: predicate.id,
+                status: outcome.ok ? "passed" : "failed",
+                detail: outcome.detail,
+            });
+            if (!outcome.ok)
+                failed += 1;
+        }
+        catch (error) {
+            checks.push({
+                predicateId: predicate.id,
+                status: "failed",
+                detail: `error de evaluación: ${String(error).slice(0, 120)}`,
+            });
+            failed += 1;
+        }
+    }
+    return {
+        ...notation,
+        checks,
+        status: failed === 0 ? "verified" : "violated",
+        verifiedAt: Date.now(),
+    };
+}

package/dist/proposeNotation.tool.d.ts ADDED Viewed

@@ -0,0 +1,42 @@
+interface ProposeNotationToolParams { // Arguments accepted by createProposeNotationTool.
+    datasetId: string; // id of the dataset the created tool is bound to
+    runtime: any; // typed `any` here — NOTE(review): presumably the runtime object exposing use(); confirm against the implementation
+}
+/**
+ * proposeNotation — declare or ITERATE the formal notation of the dataset.
+ *
+ * The notation is the planning artifact: call it FIRST with the initial
+ * set definition derived from the resources, and call it AGAIN whenever
+ * the analysis discovers new sets, variables, constraints or corrections.
+ * Every call appends a revision (the discovery trail is preserved). Mark
+ * the last call with final=true so the notation describes the produced
+ * dataset; its checkable predicates get verified arithmetically after
+ * completion.
+ *
+ * @param params.datasetId Dataset the tool is bound to.
+ * @param params.runtime Runtime handed to the tool (typed `any`).
+ * @returns An `ai` Tool. Its input is the proposed notation: `latex`,
+ *   `symbols`, `predicates` (each with an optional `checkJson` string),
+ *   `reason` and an optional `final` flag. Its output is either
+ *   `{ success, error }` or `{ success, version, status }` with an
+ *   optional `warning`.
+ */
+export declare function createProposeNotationTool({ datasetId, runtime }: ProposeNotationToolParams): import("ai").Tool<{
+    latex: string;
+    symbols: {
+        name: string;
+        kind: "function" | "set" | "variable" | "constant" | "predicate";
+        description: string;
+        latex?: string | undefined;
+    }[];
+    predicates: {
+        id: string;
+        description: string;
+        latex: string;
+        checkJson?: string | undefined;
+    }[];
+    reason: string;
+    final?: boolean | undefined;
+}, {
+    success: boolean;
+    error: string;
+} | {
+    warning?: string | undefined;
+    success: boolean;
+    version: number;
+    status: import("./notation.js").DatasetNotationStatus;
+    error?: undefined;
+}>;
+export {};

package/dist/proposeNotation.tool.js ADDED Viewed

@@ -0,0 +1,142 @@
+import { tool } from "ai";
+import { z } from "zod";
+import { DatasetService } from "./service.js";
+import { datasetDomain } from "./schema.js";
+import { reviseDatasetNotation, } from "./notation.js";
+const symbolSchema = z.object({ // Input shape of one symbol bound by the notation.
+    name: z.string().describe("Plain identifier, e.g. 'D', 'Orders', 'w'"),
+    latex: z // optional; this schema only marks it optional and does not itself apply the default mentioned in the description
+        .string()
+        .optional()
+        .describe("LaTeX for the symbol, e.g. '\\\\mathcal{D}' (defaults to the name)"),
+    kind: z.enum(["set", "variable", "function", "constant", "predicate"]), // closed list of symbol kinds
+    description: z.string().describe("What this symbol denotes in the data"),
+});
+const predicateSchema = z.object({ // Input shape of one claim (predicate) about the dataset.
+    id: z.string().describe("Stable id, e.g. 'p1', 'cardinality'"),
+    description: z.string().describe("The claim in plain language"),
+    latex: z
+        .string()
+        .describe("The claim in LaTeX, e.g. '\\\\forall r \\\\in D: r.amount > 0'"),
+    checkJson: z // accepted as JSON text, not as a nested object; this schema does not validate the JSON itself
+        .string()
+        .optional()
+        .describe([ // the fragments below are joined with single spaces into one description string
+        "OPTIONAL machine-checkable form of the claim as a JSON string, verified",
+        "with plain arithmetic over the produced rows. Shapes:",
+        '{"kind":"row_count","op":"=","value":124}',
+        '{"kind":"field_type","field":"amount","type":"number","allowNull":true}',
+        '{"kind":"field_range","field":"amount","min":0}',
+        '{"kind":"field_in","field":"status","values":["paid","void"]}',
+        '{"kind":"field_nonnull","field":"orderId"}',
+        '{"kind":"field_matches","field":"sku","pattern":"^[A-Z0-9-]+$"}',
+        '{"kind":"unique","fields":["orderId"]}',
+        '{"kind":"aggregate","fn":"sum","field":"amount","op":">=","value":0}',
+        'Propositional composition: {"kind":"and"|"or","checks":[...]},',
+        '{"kind":"not","check":...}, {"kind":"implies","if":...,"then":...}.',
+        "Fields support dot-paths into nested records (company.taxId).",
+        "Omit for claims that are semantic only.",
+    ].join(" ")),
+});
+async function getDatasetService(runtime) {
+    const scoped = await runtime.use(datasetDomain);
+    return new DatasetService(scoped.db);
+}
+/**
+ * proposeNotation — declare or ITERATE the formal notation of the dataset.
+ *
+ * The notation is the planning artifact: call it FIRST with the initial
+ * set definition derived from the resources, and call it AGAIN whenever
+ * the analysis discovers new sets, variables, constraints or corrections.
+ * Every call appends a revision (the discovery trail is preserved). Mark
+ * the last call with final=true so the notation describes the produced
+ * dataset; its checkable predicates get verified arithmetically after
+ * completion.
+ */
+export function createProposeNotationTool({ datasetId, runtime }) {
+    return tool({
+        description: [
+            "Declare or refine the FORMAL NOTATION of the dataset: the dataset as a",
+            "set defined in LaTeX (set-builder, relational algebra, quantified",
+            "predicates) plus the symbols it binds and the predicates every row",
+            "satisfies. This is your PLANNING artifact — propose it before writing",
+            "any code, and revise it whenever the analysis discovers new sets,",
+            "variables or invariants. The latest final notation is verified",
+            "arithmetically against the produced rows (non-blocking).",
+        ].join(" "),
+        inputSchema: z.object({
+            latex: z
+                .string()
+                .describe("Main definition of the dataset as a set, in LaTeX. Example: 'D = \\\\{(w,r,t) \\\\mid t = \\\\sum_{o \\\\in Orders} o.amount,\\\\; o.status = paid\\\\}'"),
+            symbols: z.array(symbolSchema).describe("Symbols bound by the notation"),
+            predicates: z
+                .array(predicateSchema)
+                .describe("Claims about the dataset; include machine-checkable forms when possible"),
+            reason: z
+                .string()
+                .describe("What discovery triggered this revision (or 'initial proposal')"),
+            final: z
+                .boolean()
+                .optional()
+                .describe("true when this notation describes the dataset you are about to complete"),
+        }),
+        execute: async ({ latex, symbols, predicates, reason, final }) => {
+            try {
+                const service = await getDatasetService(runtime);
+                const existing = await service.getDatasetById(datasetId);
+                const previous = (existing.ok ? existing.data?.notation : null);
+                const parsedPredicates = [];
+                const checkErrors = [];
+                for (const predicate of predicates) {
+                    let check;
+                    if (predicate.checkJson) {
+                        try {
+                            check = JSON.parse(predicate.checkJson);
+                            if (!check || typeof check !== "object" || !("kind" in check)) {
+                                throw new Error("check must be an object with a 'kind'");
+                            }
+                        }
+                        catch (error) {
+                            checkErrors.push(`predicate ${predicate.id}: invalid checkJson (${String(error).slice(0, 80)})`);
+                            check = undefined;
+                        }
+                    }
+                    parsedPredicates.push({
+                        id: predicate.id,
+                        description: predicate.description,
+                        latex: predicate.latex,
+                        ...(check ? { check } : {}),
+                    });
+                }
+                const notation = reviseDatasetNotation(previous, {
+                    latex,
+                    symbols: symbols,
+                    predicates: parsedPredicates,
+                    reason,
+                    final,
+                });
+                const update = await service.updateDatasetNotation({ datasetId, notation });
+                if (!update.ok) {
+                    return { success: false, error: update.error };
+                }
+                console.log(`[Dataset ${datasetId}] notation v${notation.version} (${notation.status}): ${reason}`);
+                return {
+                    success: true,
+                    version: notation.version,
+                    status: notation.status,
+                    ...(checkErrors.length
+                        ? {
+                            warning: `some checks were dropped: ${checkErrors.join("; ")}`,
+                        }
+                        : {}),
+                };
+            }
+            catch (error) {
+                return {
+                    success: false,
+                    error: error instanceof Error ? error.message : String(error),
+                };
+            }
+        },
+    });
+}

package/dist/query/queryDomain.step.js CHANGED Viewed

@@ -1,5 +1,6 @@
 import { DatasetService } from "../service.js";
 import { createDatasetId } from "../id.js";
+import { inferQueryNotation, verifyDatasetNotation } from "../notation.js";
 function normalizeRows(result) {
     if (!result || typeof result !== "object")
         return [];
@@ -61,6 +62,15 @@ export async function queryDomainStep(params) {
     const rows = normalizeRows(queryResult);
     const previewRows = rows.slice(0, 20);
     const schema = inferSchema(rows);
+    // query-backed datasets carry a fully deterministic formal notation:
+    // the set definition, its symbols and its checkable predicates derive
+    // mechanically from the query + rows; verification is immediate
+    const notation = verifyDatasetNotation(inferQueryNotation({
+        entityNames: Object.keys(params.query ?? {}),
+        rowCount: rows.length,
+        schema,
+        explanation: params.explanation,
+    }), rows);
     const createRes = await service.createDataset({
         id: datasetId,
         title: params.title ?? "domain.query",
@@ -68,6 +78,7 @@ export async function queryDomainStep(params) {
         instructions: params.explanation,
         analysis: { explanation: params.explanation, query: params.query },
         schema,
+        notation,
         createdAt: Date.now(),
         updatedAt: Date.now(),
     });

package/dist/schema.d.ts CHANGED Viewed

@@ -11,6 +11,8 @@ declare const entities: {
         instructions: import("@instantdb/core").DataAttrDef<string, false, false, false>;
         analysis: import("@instantdb/core").DataAttrDef<any, false, false, false>;
         schema: import("@instantdb/core").DataAttrDef<any, false, false, false>;
+        /** formal notation (LaTeX + checkable predicates) describing the set */
+        notation: import("@instantdb/core").DataAttrDef<any, false, false, false>;
         calculatedTotalRows: import("@instantdb/core").DataAttrDef<number, false, false, false>;
         actualGeneratedRowCount: import("@instantdb/core").DataAttrDef<number, false, false, false>;
     }, {}, void>;

package/dist/schema.js CHANGED Viewed

@@ -13,6 +13,8 @@ const entities = {
         instructions: i.string().optional(),
         analysis: i.json().optional(),
         schema: i.json().optional(),
+        /** formal notation (LaTeX + checkable predicates) describing the set */
+        notation: i.json().optional(),
         calculatedTotalRows: i.number().optional(),
         actualGeneratedRowCount: i.number().optional(),
     }),

package/dist/service.d.ts CHANGED Viewed

@@ -47,6 +47,10 @@ export declare class DatasetService {
         schema: any;
         status?: string;
     }): Promise<ServiceResult<void>>;
+    updateDatasetNotation(params: {
+        datasetId: string;
+        notation: Record<string, any>;
+    }): Promise<ServiceResult<void>>;
     updateDatasetStatus(params: {
         datasetId: string;
         status: string;

package/dist/service.js CHANGED Viewed

@@ -214,6 +214,24 @@ export class DatasetService {
             return { ok: false, error: message };
         }
     }
+    async updateDatasetNotation(params) {
+        try {
+            const resolved = await this.resolveDatasetEntityId(params.datasetId);
+            if (!resolved.ok)
+                return resolved;
+            await this.db.transact([
+                this.db.tx.dataset_datasets[resolved.data].update({
+                    notation: params.notation,
+                    updatedAt: Date.now(),
+                })
+            ]);
+            return { ok: true, data: undefined };
+        }
+        catch (error) {
+            const message = error instanceof Error ? error.message : String(error);
+            return { ok: false, error: message };
+        }
+    }
     async updateDatasetStatus(params) {
         try {
             const resolved = await this.resolveDatasetEntityId(params.datasetId);

package/dist/transform/prompts.js CHANGED Viewed

@@ -102,11 +102,15 @@ function buildInstructions(context) {
         .ele("Action").txt(`Review ContextResources and any InputPreviews to understand current record structures, evidence, fields, shapes and edge cases. ${multipleInputsNote}`).up()
         .ele("Note").txt("ContextResources DescriptorJson may include inline text, metadata, previewRows, or other visible evidence. Treat that visible content as already available context. Do not use executeCommand only to reread it.").up()
         .up()
-        .ele("Step", { number: "2", name: "Plan Mapping" })
+        .ele("Step", { number: "2", name: "Propose Formal Notation (PLAN FIRST)" })
+        .ele("Action").txt("Call proposeNotation with the formal definition of the OUTPUT dataset as a set derived from the input sets: e.g. D = \\pi_{fields}(\\sigma_{condition}(A \\bowtie B)) or set-builder with quantifiers, in LaTeX. Declare the input sets, bound variables and the predicates every output row satisfies.").up()
+        .ele("Note").txt("The notation is the planning artifact and comes BEFORE the transformation: it states which sets you draw from, how they combine (join, filter, project, aggregate) and which arithmetic invariants the output keeps (e.g. totals preserved across the transformation). Give predicates a machine-checkable checkJson whenever the claim is arithmetic (row counts, ranges, uniqueness, aggregates). ITERATE the notation whenever inspection of the inputs reveals new sets, variables or corrections, and call proposeNotation with final=true just before completing — it will be verified arithmetically against the produced rows.").up()
+        .up()
+        .ele("Step", { number: "3", name: "Plan Mapping" })
         .ele("Action").txt("Plan a deterministic mapping from input data fields to the output schema fields (normalize names, types, and formats).").up()
         .ele("Note").txt("If fields are missing, set defaults; if types differ, coerce consistently. When working with multiple inputs, decide how to combine or relate them. Output field names must remain exactly as declared by the output schema.").up()
         .up()
-        .ele("Step", { number: "3", name: "Transform" })
+        .ele("Step", { number: "4", name: "Transform" })
         .ele("Action").txt("For single-object output, use completeObject with the final object. For row output, use replaceRows with the final rows. Use executeCommand only when command execution is necessary, not merely convenient.").up()
         .ele("Requirement").txt("Do not call completeObject until you have constructed the complete data object. completeObject requires data; a summary-only call is invalid and wastes a model iteration.").up()
         .ele("Requirement").txt("Command execution is necessary only when the final output cannot be produced directly from the provided context, resource descriptors, or previews, and requires running code to inspect, parse, aggregate, join, or compute over files/resources.").up()
@@ -120,12 +124,13 @@ function buildInstructions(context) {
         .ele("Requirement").txt("Do not print large data to stdout; only progress and summaries.").up()
         .ele("Requirement").txt("Do not install packages, download dependencies, or access the network from executeCommand. Use only the available runtime and standard library unless a dependency is already present.").up()
         .up()
-        .ele("Step", { number: "4", name: "Validate and Complete" })
-        .ele("Action").txt("When using completeObject or replaceRows, no separate completeDataset call is needed. When using executeCommand, call completeDataset to validate against the output schema and mark as completed.").up()
+        .ele("Step", { number: "5", name: "Validate and Complete" })
+        .ele("Action").txt("Call proposeNotation with final=true (refined to match the produced output), then: when using completeObject or replaceRows, no separate completeDataset call is needed. When using executeCommand, call completeDataset to validate against the output schema and mark as completed.").up()
         .ele("Behavior").txt("If any completion tool returns success:false, inspect validation details, repair the output, and call the appropriate completion tool again. Do not stop until a completion tool returns success:true.").up()
         .up()
         .up()
         .ele("Rules")
+        .ele("Rule").txt("The formal notation (proposeNotation) is the planning artifact: propose it before transforming, iterate it on every discovery, finalize it before completing. The LaTeX explains the dataset; the code merely produces it.").up()
         .ele("Rule").txt("Output must strictly match the output schema for each record in data.").up()
         .ele("Rule").txt("OutputSchema property names are authoritative. Field names are a technical contract; only field values may preserve input language.").up()
         .ele("Rule").txt("Use the cheapest correct tool. completeObject and replaceRows are low-cost completion tools. executeCommand is a high-cost computation tool and requires an explicit commandDescription.").up()

package/dist/transform/transform-dataset.agent.js CHANGED Viewed

@@ -4,6 +4,7 @@ import { createCompleteDatasetTool, didCompleteDatasetSucceed, getDatasetFatalFa
 import { datasetUpdateSchemaStep } from "../dataset/steps.js";
 import { getDatasetOutputPath } from "../datasetFiles.js";
 import { createExecuteCommandTool } from "../executeCommand.tool.js";
+import { createProposeNotationTool } from "../proposeNotation.tool.js";
 import { createCompleteObjectTool, createReplaceRowsTool, } from "../writeDatasetRows.tool.js";
 import { buildTransformDatasetPromptStep, } from "./transform-dataset.steps.js";
 import { createDatasetId } from "../id.js";
@@ -136,6 +137,10 @@ function createTransformDatasetContextDefinition(params) {
                 sandboxId,
                 runtime,
             }),
+            proposeNotation: createProposeNotationTool({
+                datasetId,
+                runtime,
+            }),
         };
     })
         .shouldContinue(({ reactionEvent }) => {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@ekairos/dataset",
-  "version": "1.22.92-beta.development.0",
+  "version": "1.22.94-beta.development.0",
   "description": "Pulzar Dataset Tools",
   "type": "module",
   "main": "dist/index.js",
@@ -65,9 +65,9 @@
     "test:ai-sdk:instant": "vitest run -c vitest.codex.config.mts src/tests/materializeDataset.ai-sdk.instant.test.ts"
   },
   "dependencies": {
-    "@ekairos/domain": "^1.22.92-beta.development.0",
-    "@ekairos/events": "^1.22.92-beta.development.0",
-    "@ekairos/sandbox": "^1.22.92-beta.development.0",
+    "@ekairos/domain": "^1.22.94-beta.development.0",
+    "@ekairos/events": "^1.22.94-beta.development.0",
+    "@ekairos/sandbox": "^1.22.94-beta.development.0",
     "@instantdb/admin": "0.22.158",
     "@instantdb/core": "0.22.142",
     "ai": "^5.0.44",