npm - goldenmatch - Versions diffs - 0.1.0 - Mend

goldenmatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (162) hide show

package/README.md +140 -0
package/dist/cli.cjs +6079 -0
package/dist/cli.cjs.map +1 -0
package/dist/cli.d.cts +1 -0
package/dist/cli.d.ts +1 -0
package/dist/cli.js +6076 -0
package/dist/cli.js.map +1 -0
package/dist/core/index.cjs +8449 -0
package/dist/core/index.cjs.map +1 -0
package/dist/core/index.d.cts +1972 -0
package/dist/core/index.d.ts +1972 -0
package/dist/core/index.js +8318 -0
package/dist/core/index.js.map +1 -0
package/dist/index.cjs +8449 -0
package/dist/index.cjs.map +1 -0
package/dist/index.d.cts +2 -0
package/dist/index.d.ts +2 -0
package/dist/index.js +8318 -0
package/dist/index.js.map +1 -0
package/dist/node/backends/score-worker.cjs +934 -0
package/dist/node/backends/score-worker.cjs.map +1 -0
package/dist/node/backends/score-worker.d.cts +14 -0
package/dist/node/backends/score-worker.d.ts +14 -0
package/dist/node/backends/score-worker.js +932 -0
package/dist/node/backends/score-worker.js.map +1 -0
package/dist/node/index.cjs +11430 -0
package/dist/node/index.cjs.map +1 -0
package/dist/node/index.d.cts +554 -0
package/dist/node/index.d.ts +554 -0
package/dist/node/index.js +11277 -0
package/dist/node/index.js.map +1 -0
package/dist/types-DhUdX5Rc.d.cts +304 -0
package/dist/types-DhUdX5Rc.d.ts +304 -0
package/examples/01-basic-dedupe.ts +60 -0
package/examples/02-match-two-datasets.ts +48 -0
package/examples/03-csv-file-pipeline.ts +62 -0
package/examples/04-string-scoring.ts +63 -0
package/examples/05-custom-config.ts +94 -0
package/examples/06-probabilistic-fs.ts +72 -0
package/examples/07-pprl-privacy.ts +76 -0
package/examples/08-streaming.ts +79 -0
package/examples/09-llm-scorer.ts +79 -0
package/examples/10-explain.ts +60 -0
package/examples/11-evaluate.ts +61 -0
package/examples/README.md +53 -0
package/package.json +66 -0
package/src/cli.ts +372 -0
package/src/core/ann-blocker.ts +593 -0
package/src/core/api.ts +220 -0
package/src/core/autoconfig.ts +363 -0
package/src/core/autofix.ts +102 -0
package/src/core/blocker.ts +655 -0
package/src/core/cluster.ts +699 -0
package/src/core/compare-clusters.ts +176 -0
package/src/core/config/loader.ts +869 -0
package/src/core/cross-encoder.ts +614 -0
package/src/core/data.ts +430 -0
package/src/core/domain.ts +277 -0
package/src/core/embedder.ts +562 -0
package/src/core/evaluate.ts +156 -0
package/src/core/explain.ts +352 -0
package/src/core/golden.ts +524 -0
package/src/core/graph-er.ts +371 -0
package/src/core/index.ts +314 -0
package/src/core/ingest.ts +112 -0
package/src/core/learned-blocking.ts +305 -0
package/src/core/lineage.ts +221 -0
package/src/core/llm/budget.ts +258 -0
package/src/core/llm/cluster.ts +542 -0
package/src/core/llm/scorer.ts +396 -0
package/src/core/match-one.ts +95 -0
package/src/core/matchkey.ts +97 -0
package/src/core/memory/corrections.ts +179 -0
package/src/core/memory/learner.ts +218 -0
package/src/core/memory/store.ts +114 -0
package/src/core/pipeline.ts +366 -0
package/src/core/pprl/protocol.ts +216 -0
package/src/core/probabilistic.ts +511 -0
package/src/core/profiler.ts +212 -0
package/src/core/quality.ts +197 -0
package/src/core/review-queue.ts +177 -0
package/src/core/scorer.ts +855 -0
package/src/core/sensitivity.ts +196 -0
package/src/core/standardize.ts +279 -0
package/src/core/streaming.ts +128 -0
package/src/core/transforms.ts +599 -0
package/src/core/types.ts +570 -0
package/src/core/validate.ts +243 -0
package/src/index.ts +8 -0
package/src/node/a2a/server.ts +470 -0
package/src/node/api/server.ts +412 -0
package/src/node/backends/duckdb.ts +130 -0
package/src/node/backends/score-worker.ts +41 -0
package/src/node/backends/workers.ts +212 -0
package/src/node/config-file.ts +66 -0
package/src/node/connectors/base.ts +57 -0
package/src/node/connectors/bigquery.ts +61 -0
package/src/node/connectors/databricks.ts +69 -0
package/src/node/connectors/file.ts +350 -0
package/src/node/connectors/hubspot.ts +62 -0
package/src/node/connectors/index.ts +43 -0
package/src/node/connectors/salesforce.ts +93 -0
package/src/node/connectors/snowflake.ts +73 -0
package/src/node/db/postgres.ts +173 -0
package/src/node/db/sync.ts +103 -0
package/src/node/dedupe-file.ts +156 -0
package/src/node/index.ts +89 -0
package/src/node/mcp/server.ts +940 -0
package/src/node/tui/app.ts +756 -0
package/src/node/tui/index.ts +6 -0
package/src/node/tui/widgets.ts +128 -0
package/tests/parity/scorer-ground-truth.test.ts +118 -0
package/tests/smoke.test.ts +46 -0
package/tests/unit/a2a-server.test.ts +175 -0
package/tests/unit/ann-blocker.test.ts +117 -0
package/tests/unit/api-server.test.ts +239 -0
package/tests/unit/api.test.ts +77 -0
package/tests/unit/autoconfig.test.ts +103 -0
package/tests/unit/autofix.test.ts +71 -0
package/tests/unit/blocker.test.ts +164 -0
package/tests/unit/buildBlocksAsync.test.ts +63 -0
package/tests/unit/cluster.test.ts +213 -0
package/tests/unit/compare-clusters.test.ts +42 -0
package/tests/unit/config-loader.test.ts +301 -0
package/tests/unit/connectors-base.test.ts +48 -0
package/tests/unit/cross-encoder-model.test.ts +198 -0
package/tests/unit/cross-encoder.test.ts +173 -0
package/tests/unit/db-connectors.test.ts +37 -0
package/tests/unit/domain.test.ts +80 -0
package/tests/unit/embedder.test.ts +151 -0
package/tests/unit/evaluate.test.ts +85 -0
package/tests/unit/explain.test.ts +73 -0
package/tests/unit/golden.test.ts +97 -0
package/tests/unit/graph-er.test.ts +173 -0
package/tests/unit/hnsw-ann.test.ts +283 -0
package/tests/unit/hubspot-connector.test.ts +118 -0
package/tests/unit/ingest.test.ts +97 -0
package/tests/unit/learned-blocking.test.ts +134 -0
package/tests/unit/lineage.test.ts +135 -0
package/tests/unit/match-one.test.ts +129 -0
package/tests/unit/matchkey.test.ts +97 -0
package/tests/unit/mcp-server.test.ts +183 -0
package/tests/unit/memory.test.ts +119 -0
package/tests/unit/pipeline.test.ts +118 -0
package/tests/unit/pprl-protocol.test.ts +381 -0
package/tests/unit/probabilistic.test.ts +494 -0
package/tests/unit/profiler.test.ts +68 -0
package/tests/unit/review-queue.test.ts +68 -0
package/tests/unit/salesforce-connector.test.ts +148 -0
package/tests/unit/scorer.test.ts +301 -0
package/tests/unit/sensitivity.test.ts +154 -0
package/tests/unit/standardize.test.ts +84 -0
package/tests/unit/streaming.test.ts +82 -0
package/tests/unit/transforms.test.ts +208 -0
package/tests/unit/tui-widgets.test.ts +42 -0
package/tests/unit/tui.test.ts +24 -0
package/tests/unit/validate.test.ts +145 -0
package/tests/unit/workers-parallel.test.ts +99 -0
package/tests/unit/workers.test.ts +74 -0
package/tsconfig.json +25 -0
package/tsup.config.ts +37 -0
package/vitest.config.ts +11 -0

package/src/core/explain.ts ADDED Viewed

@@ -0,0 +1,352 @@
+/**
+ * explain.ts — Natural language explanations for match decisions.
+ * Ports `goldenmatch/core/explain.py` (+ parts of `explainer.py`).
+ *
+ * Template-based, zero LLM cost. Produces human-readable summaries of why
+ * two records matched, plus cluster-level summaries.
+ *
+ * Edge-safe: no `node:` imports.
+ */
+import type {
+  Row,
+  MatchkeyConfig,
+  MatchkeyField,
+  ClusterInfo,
+} from "./types.js";
+import { scoreField, asString } from "./scorer.js";
+import { pairKey } from "./cluster.js";
+import { applyTransforms } from "./transforms.js";
+// ---------------------------------------------------------------------------
+// Score descriptors
+// ---------------------------------------------------------------------------
+const SCORE_DESCRIPTORS: ReadonlyArray<readonly [number, string]> = [ // [minimum score, label] bands, highest minimum first
+  [0.95, "identical"],
+  [0.85, "very similar"],
+  [0.7, "similar"],
+  [0.5, "somewhat similar"],
+  [0.3, "weakly similar"],
+  [0.0, "different"], // catch-all band for any score >= 0
+]; // describeScore scans top-down and returns the first band whose minimum the score meets, so order matters
+const SCORER_NAMES: Readonly<Record<string, string>> = { // scorer id -> human-readable phrase used in explanation text
+  jaro_winkler: "string similarity",
+  levenshtein: "edit distance",
+  token_sort: "token similarity",
+  soundex_match: "phonetic match",
+  exact: "exact match",
+  ensemble: "best-of-multiple",
+  dice: "Dice coefficient",
+  jaccard: "Jaccard similarity",
+  embedding: "semantic similarity",
+  record_embedding: "record similarity",
+}; // describeScorer falls back to the raw scorer id for names not listed here
+/**
+ * Translate a numeric score into the label of the first descriptor band
+ * whose minimum it reaches; falls back to "different" when no band applies
+ * (e.g. a negative or NaN score).
+ */
+function describeScore(score: number): string {
+  const band = SCORE_DESCRIPTORS.find(([floor]) => score >= floor);
+  return band === undefined ? "different" : band[1];
+}
+/**
+ * Return the human-readable phrase for a scorer id, or the id itself when
+ * no phrase is registered.
+ *
+ * The lookup is restricted to the table's own keys: a bare index on a plain
+ * object also resolves inherited members (for example "constructor" or
+ * "toString"), which would hand back a function instead of a string.
+ */
+function describeScorer(name: string): string {
+  if (!Object.prototype.hasOwnProperty.call(SCORER_NAMES, name)) return name;
+  return SCORER_NAMES[name] ?? name;
+}
+// ---------------------------------------------------------------------------
+// Public types
+// ---------------------------------------------------------------------------
+export interface FieldScoreDetail { // per-field comparison result for one pair of rows
+  readonly field: string; // matchkey field (column) name
+  readonly scorer: string; // scorer id configured for the field
+  readonly valueA: string | null; // row A's value after the field's transforms
+  readonly valueB: string | null; // row B's value after the field's transforms
+  readonly score: number | null; // scorer output; null is classified as "missing"
+  readonly weight: number; // field weight copied from the matchkey config
+  readonly diffType: "identical" | "similar" | "different" | "missing" | "unknown"; // classifyDiff in this file never yields "unknown"
+}
+export interface PairExplanation { // result of explainPair
+  readonly score: number; // weight-averaged score over fields with a non-null score (0 when none)
+  readonly fieldScores: Readonly<Record<string, number | null>>; // field name -> score
+  readonly explanation: string; // single-line summary with whitespace collapsed
+  readonly confidence: "high" | "medium" | "low"; // >= 0.9 high, >= 0.75 medium, otherwise low
+  readonly reasoning: readonly string[]; // one phrase per field, ordered by score * weight descending
+  readonly details: readonly FieldScoreDetail[]; // per-field details in matchkey field order
+}
+export interface ClusterExplanation { // result of explainCluster
+  readonly clusterId: number; // id passed in by the caller
+  readonly size: number; // copied from cluster.size
+  readonly confidence: number; // copied from cluster.confidence
+  readonly quality: string; // copied from cluster.clusterQuality
+  readonly summary: string; // template sentence(s) describing the cluster
+  readonly strongestField: string | null; // field with the highest average pair score; null for singletons or when nothing could be scored
+  readonly weakestLink: readonly [number, number] | null; // cluster.bottleneckPair; null for singletons
+}
+// ---------------------------------------------------------------------------
+// Formatting helpers
+// ---------------------------------------------------------------------------
+/**
+ * Render a field value for display inside an explanation: "[null]" for
+ * null/undefined, otherwise the trimmed text, shortened to 37 characters
+ * plus "..." once it exceeds 40 characters.
+ */
+function fmtVal(v: string | null): string {
+  if (v == null) return "[null]";
+  const text = String(v).trim();
+  return text.length <= 40 ? text : `${text.slice(0, 37)}...`;
+}
+/**
+ * Bucket a field score: null -> "missing", >= 0.99 -> "identical",
+ * >= 0.7 -> "similar", anything else -> "different".
+ */
+function classifyDiff(
+  score: number | null,
+): "identical" | "similar" | "different" | "missing" {
+  if (score === null) return "missing";
+  return score >= 0.99 ? "identical" : score >= 0.7 ? "similar" : "different";
+}
+/**
+ * Map an overall score to a confidence band: >= 0.9 "high",
+ * >= 0.75 "medium", everything else "low".
+ */
+function confidenceBand(score: number): "high" | "medium" | "low" {
+  return score >= 0.9 ? "high" : score >= 0.75 ? "medium" : "low";
+}
+// ---------------------------------------------------------------------------
+// Per-field scoring (used by both pair and cluster explanation)
+// ---------------------------------------------------------------------------
+/**
+ * Score a single matchkey field for a pair of rows.
+ * Both raw values are stringified, run through the field's transforms and
+ * compared with the field's scorer; the result carries the transformed
+ * values, the score, the field weight and the diff classification.
+ */
+function scoreFieldDetail(
+  rowA: Row,
+  rowB: Row,
+  field: MatchkeyField,
+): FieldScoreDetail {
+  const column = field.field;
+  const textA = asString(rowA[column]);
+  const textB = asString(rowB[column]);
+  const valueA = applyTransforms(textA, field.transforms);
+  const valueB = applyTransforms(textB, field.transforms);
+  const score = scoreField(valueA, valueB, field.scorer);
+  const diffType = classifyDiff(score);
+  return {
+    field: column,
+    scorer: field.scorer,
+    valueA,
+    valueB,
+    score,
+    weight: field.weight,
+    diffType,
+  };
+}
+/**
+ * Weighted mean of the field scores, ignoring fields whose score is null.
+ * Returns 0 when the usable weights add up to zero (including when no field
+ * has a score).
+ */
+function aggregateScore(details: readonly FieldScoreDetail[]): number {
+  const scored = details.filter((d) => d.score !== null);
+  const totalWeight = scored.reduce((acc, d) => acc + d.weight, 0);
+  if (totalWeight === 0) return 0;
+  const weighted = scored.reduce((acc, d) => acc + (d.score ?? 0) * d.weight, 0);
+  return weighted / totalWeight;
+}
+// ---------------------------------------------------------------------------
+// Public: explainPair
+// ---------------------------------------------------------------------------
+/**
+ * Explain, in plain language, how two rows compare under a matchkey.
+ *
+ * Every configured field is scored with its own scorer, the field scores are
+ * combined into a weight-averaged overall score, and one phrase per field is
+ * produced in descending order of contribution (score * weight).
+ *
+ * @param rowA - First row of the pair.
+ * @param rowB - Second row of the pair.
+ * @param mk - Matchkey config supplying the fields, scorers and weights.
+ * @returns Overall score, per-field scores and details, the phrases, a
+ *   one-line explanation and a confidence band.
+ */
+export function explainPair(
+  rowA: Row,
+  rowB: Row,
+  mk: MatchkeyConfig,
+): PairExplanation {
+  const details = mk.fields.map((f) => scoreFieldDetail(rowA, rowB, f));
+  const overall = aggregateScore(details);
+  // Largest contribution first; a null score contributes zero.
+  const contribution = (d: FieldScoreDetail): number => (d.score ?? 0) * d.weight;
+  const ranked = details.slice().sort((x, y) => contribution(y) - contribution(x));
+  // Phrase describing the outcome for a single field.
+  const phraseFor = (d: FieldScoreDetail): string => {
+    if (d.diffType === "missing") return `${d.field} missing on one side`;
+    // Past the "missing" case a null score behaves like 0 in every comparison.
+    const s = d.score ?? 0;
+    const shownA = fmtVal(d.valueA);
+    if (d.diffType === "identical" || s >= 0.99) {
+      return `${d.field} match exactly (${shownA})`;
+    }
+    const shownB = fmtVal(d.valueB);
+    const scorerDesc = describeScorer(d.scorer);
+    if (s >= 0.8) {
+      return `${d.field} are ${describeScore(s)} (${shownA} ~ ${shownB}, ${scorerDesc} ${s.toFixed(2)})`;
+    }
+    if (s > 0) {
+      return `${d.field} differ (${shownA} vs ${shownB}, ${scorerDesc} ${s.toFixed(2)})`;
+    }
+    return `${d.field} do not match (${shownA} vs ${shownB})`;
+  };
+  // Walk the ranked fields once: collect phrases and track the lowest score
+  // strictly below 1.0 (the first such field wins ties).
+  const reasoning: string[] = [];
+  let weakest: FieldScoreDetail | null = null;
+  let weakestScore = 1.0;
+  for (const d of ranked) {
+    if (d.score !== null && d.score < weakestScore) {
+      weakest = d;
+      weakestScore = d.score;
+    }
+    reasoning.push(phraseFor(d));
+  }
+  // Top-line sentence; the weakest field is only called out below 0.8.
+  const header = `Match (${describeScore(overall)}, score ${overall.toFixed(2)}):`;
+  const weakestNote =
+    weakest !== null && weakestScore < 0.8
+      ? ` Weakest signal: ${weakest.field}.`
+      : "";
+  const explanation = `${header} ${reasoning.join("; ")}.${weakestNote}`
+    .replace(/\s+/g, " ")
+    .trim();
+  // Field name -> score lookup, filled in matchkey field order.
+  const fieldScores: Record<string, number | null> = {};
+  details.forEach((d) => {
+    fieldScores[d.field] = d.score;
+  });
+  return {
+    score: overall,
+    fieldScores,
+    explanation,
+    confidence: confidenceBand(overall),
+    reasoning,
+    details,
+  };
+}
+// ---------------------------------------------------------------------------
+// Public: explainCluster
+// ---------------------------------------------------------------------------
+/**
+ * Produce a template summary for a cluster: size, confidence, pair-score
+ * range and average, weakest link, and an oversize warning.
+ * Mirrors `explain_cluster_nl` in Python.
+ *
+ * @param clusterId - Identifier echoed back on the result.
+ * @param cluster - Cluster metadata (size, confidence, pair scores,
+ *   bottleneck pair, oversized flag, quality label, members).
+ * @param rows - Rows used to look members up when finding the strongest field.
+ * @param mk - Matchkey whose fields are scored to find the strongest field.
+ * @returns The cluster explanation; clusters of size <= 1 get a fixed
+ *   singleton summary with no strongest field or weakest link.
+ */
+export function explainCluster(
+  clusterId: number,
+  cluster: ClusterInfo,
+  rows: readonly Row[],
+  mk: MatchkeyConfig,
+): ClusterExplanation {
+  const size = cluster.size;
+  const confidence = cluster.confidence;
+  const pairScores = cluster.pairScores;
+  if (size <= 1) {
+    return {
+      clusterId,
+      size,
+      confidence,
+      quality: cluster.clusterQuality,
+      summary: "Singleton cluster with 1 record.",
+      strongestField: null,
+      weakestLink: null,
+    };
+  }
+  // Score statistics, gathered in a single pass. Spreading every score into
+  // Math.min/Math.max passes one call argument per pair, which can exceed the
+  // engine's argument limit (RangeError) on clusters with very many pairs.
+  let count = 0;
+  let total = 0;
+  let lowest = Infinity;
+  let highest = -Infinity;
+  pairScores.forEach((s) => {
+    count += 1;
+    total += s;
+    lowest = Math.min(lowest, s);
+    highest = Math.max(highest, s);
+  });
+  // With no pair scores at all, every statistic reports 0.
+  const minScore = count > 0 ? lowest : 0;
+  const maxScore = count > 0 ? highest : 0;
+  const avgScore = count > 0 ? total / count : 0;
+  const parts: string[] = [];
+  parts.push(
+    `Cluster of ${size} records ` +
+      `(confidence ${confidence.toFixed(2)}, ` +
+      `scores ${minScore.toFixed(2)}-${maxScore.toFixed(2)}, ` +
+      `avg ${avgScore.toFixed(2)}).`,
+  );
+  if (cluster.bottleneckPair !== null) {
+    const [a, b] = cluster.bottleneckPair;
+    // A bottleneck pair without a recorded score is reported as 0.
+    const bpScore = pairScores.get(pairKey(a, b)) ?? 0;
+    parts.push(
+      `Weakest link: records ${a} and ${b} (score ${bpScore.toFixed(2)}).`,
+    );
+  }
+  if (cluster.oversized) {
+    parts.push("WARNING: cluster exceeds max size limit.");
+  }
+  // Identify the strongest field by averaging per-field scores across member pairs.
+  const strongestField = computeStrongestField(cluster, rows, mk);
+  return {
+    clusterId,
+    size,
+    confidence,
+    quality: cluster.clusterQuality,
+    summary: parts.join(" "),
+    strongestField,
+    weakestLink: cluster.bottleneckPair,
+  };
+}
+/**
+ * Find the matchkey field with the highest average score across every pair
+ * of cluster members.
+ *
+ * Members are resolved through each row's numeric `__row_id__`; pairs with a
+ * member that cannot be resolved are skipped, as are null field scores.
+ * Ties go to the field that appears first in the matchkey config.
+ *
+ * @returns The winning field name, or null when the matchkey has no fields
+ *   or no field produced a score.
+ */
+function computeStrongestField(
+  cluster: ClusterInfo,
+  rows: readonly Row[],
+  mk: MatchkeyConfig,
+): string | null {
+  if (mk.fields.length === 0) return null;
+  const rowById = new Map<number, Row>();
+  for (const r of rows) {
+    const id = r["__row_id__"];
+    if (typeof id === "number") rowById.set(id, r);
+  }
+  // A Map rather than a plain object: field names come from user config, and
+  // a Map treats every key (including "__proto__") as ordinary data while
+  // iterating in insertion order, i.e. matchkey config order.
+  const fieldSums = new Map<string, { sum: number; count: number }>();
+  for (const f of mk.fields) {
+    fieldSums.set(f.field, { sum: 0, count: 0 });
+  }
+  // Score every unordered pair of members.
+  const members = cluster.members;
+  for (let i = 0; i < members.length; i++) {
+    for (let j = i + 1; j < members.length; j++) {
+      const rowA = rowById.get(members[i]!);
+      const rowB = rowById.get(members[j]!);
+      if (!rowA || !rowB) continue;
+      for (const f of mk.fields) {
+        const d = scoreFieldDetail(rowA, rowB, f);
+        if (d.score === null) continue;
+        const entry = fieldSums.get(f.field);
+        if (entry === undefined) continue;
+        entry.sum += d.score;
+        entry.count += 1;
+      }
+    }
+  }
+  // Pick the highest average; strict ">" keeps the earliest field on ties.
+  let best: string | null = null;
+  let bestAvg = -1;
+  for (const [name, { sum, count }] of fieldSums) {
+    if (count === 0) continue;
+    const avg = sum / count;
+    if (avg > bestAvg) {
+      bestAvg = avg;
+      best = name;
+    }
+  }
+  return best;
+}