goldenmatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. package/README.md +140 -0
  2. package/dist/cli.cjs +6079 -0
  3. package/dist/cli.cjs.map +1 -0
  4. package/dist/cli.d.cts +1 -0
  5. package/dist/cli.d.ts +1 -0
  6. package/dist/cli.js +6076 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/core/index.cjs +8449 -0
  9. package/dist/core/index.cjs.map +1 -0
  10. package/dist/core/index.d.cts +1972 -0
  11. package/dist/core/index.d.ts +1972 -0
  12. package/dist/core/index.js +8318 -0
  13. package/dist/core/index.js.map +1 -0
  14. package/dist/index.cjs +8449 -0
  15. package/dist/index.cjs.map +1 -0
  16. package/dist/index.d.cts +2 -0
  17. package/dist/index.d.ts +2 -0
  18. package/dist/index.js +8318 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/node/backends/score-worker.cjs +934 -0
  21. package/dist/node/backends/score-worker.cjs.map +1 -0
  22. package/dist/node/backends/score-worker.d.cts +14 -0
  23. package/dist/node/backends/score-worker.d.ts +14 -0
  24. package/dist/node/backends/score-worker.js +932 -0
  25. package/dist/node/backends/score-worker.js.map +1 -0
  26. package/dist/node/index.cjs +11430 -0
  27. package/dist/node/index.cjs.map +1 -0
  28. package/dist/node/index.d.cts +554 -0
  29. package/dist/node/index.d.ts +554 -0
  30. package/dist/node/index.js +11277 -0
  31. package/dist/node/index.js.map +1 -0
  32. package/dist/types-DhUdX5Rc.d.cts +304 -0
  33. package/dist/types-DhUdX5Rc.d.ts +304 -0
  34. package/examples/01-basic-dedupe.ts +60 -0
  35. package/examples/02-match-two-datasets.ts +48 -0
  36. package/examples/03-csv-file-pipeline.ts +62 -0
  37. package/examples/04-string-scoring.ts +63 -0
  38. package/examples/05-custom-config.ts +94 -0
  39. package/examples/06-probabilistic-fs.ts +72 -0
  40. package/examples/07-pprl-privacy.ts +76 -0
  41. package/examples/08-streaming.ts +79 -0
  42. package/examples/09-llm-scorer.ts +79 -0
  43. package/examples/10-explain.ts +60 -0
  44. package/examples/11-evaluate.ts +61 -0
  45. package/examples/README.md +53 -0
  46. package/package.json +66 -0
  47. package/src/cli.ts +372 -0
  48. package/src/core/ann-blocker.ts +593 -0
  49. package/src/core/api.ts +220 -0
  50. package/src/core/autoconfig.ts +363 -0
  51. package/src/core/autofix.ts +102 -0
  52. package/src/core/blocker.ts +655 -0
  53. package/src/core/cluster.ts +699 -0
  54. package/src/core/compare-clusters.ts +176 -0
  55. package/src/core/config/loader.ts +869 -0
  56. package/src/core/cross-encoder.ts +614 -0
  57. package/src/core/data.ts +430 -0
  58. package/src/core/domain.ts +277 -0
  59. package/src/core/embedder.ts +562 -0
  60. package/src/core/evaluate.ts +156 -0
  61. package/src/core/explain.ts +352 -0
  62. package/src/core/golden.ts +524 -0
  63. package/src/core/graph-er.ts +371 -0
  64. package/src/core/index.ts +314 -0
  65. package/src/core/ingest.ts +112 -0
  66. package/src/core/learned-blocking.ts +305 -0
  67. package/src/core/lineage.ts +221 -0
  68. package/src/core/llm/budget.ts +258 -0
  69. package/src/core/llm/cluster.ts +542 -0
  70. package/src/core/llm/scorer.ts +396 -0
  71. package/src/core/match-one.ts +95 -0
  72. package/src/core/matchkey.ts +97 -0
  73. package/src/core/memory/corrections.ts +179 -0
  74. package/src/core/memory/learner.ts +218 -0
  75. package/src/core/memory/store.ts +114 -0
  76. package/src/core/pipeline.ts +366 -0
  77. package/src/core/pprl/protocol.ts +216 -0
  78. package/src/core/probabilistic.ts +511 -0
  79. package/src/core/profiler.ts +212 -0
  80. package/src/core/quality.ts +197 -0
  81. package/src/core/review-queue.ts +177 -0
  82. package/src/core/scorer.ts +855 -0
  83. package/src/core/sensitivity.ts +196 -0
  84. package/src/core/standardize.ts +279 -0
  85. package/src/core/streaming.ts +128 -0
  86. package/src/core/transforms.ts +599 -0
  87. package/src/core/types.ts +570 -0
  88. package/src/core/validate.ts +243 -0
  89. package/src/index.ts +8 -0
  90. package/src/node/a2a/server.ts +470 -0
  91. package/src/node/api/server.ts +412 -0
  92. package/src/node/backends/duckdb.ts +130 -0
  93. package/src/node/backends/score-worker.ts +41 -0
  94. package/src/node/backends/workers.ts +212 -0
  95. package/src/node/config-file.ts +66 -0
  96. package/src/node/connectors/base.ts +57 -0
  97. package/src/node/connectors/bigquery.ts +61 -0
  98. package/src/node/connectors/databricks.ts +69 -0
  99. package/src/node/connectors/file.ts +350 -0
  100. package/src/node/connectors/hubspot.ts +62 -0
  101. package/src/node/connectors/index.ts +43 -0
  102. package/src/node/connectors/salesforce.ts +93 -0
  103. package/src/node/connectors/snowflake.ts +73 -0
  104. package/src/node/db/postgres.ts +173 -0
  105. package/src/node/db/sync.ts +103 -0
  106. package/src/node/dedupe-file.ts +156 -0
  107. package/src/node/index.ts +89 -0
  108. package/src/node/mcp/server.ts +940 -0
  109. package/src/node/tui/app.ts +756 -0
  110. package/src/node/tui/index.ts +6 -0
  111. package/src/node/tui/widgets.ts +128 -0
  112. package/tests/parity/scorer-ground-truth.test.ts +118 -0
  113. package/tests/smoke.test.ts +46 -0
  114. package/tests/unit/a2a-server.test.ts +175 -0
  115. package/tests/unit/ann-blocker.test.ts +117 -0
  116. package/tests/unit/api-server.test.ts +239 -0
  117. package/tests/unit/api.test.ts +77 -0
  118. package/tests/unit/autoconfig.test.ts +103 -0
  119. package/tests/unit/autofix.test.ts +71 -0
  120. package/tests/unit/blocker.test.ts +164 -0
  121. package/tests/unit/buildBlocksAsync.test.ts +63 -0
  122. package/tests/unit/cluster.test.ts +213 -0
  123. package/tests/unit/compare-clusters.test.ts +42 -0
  124. package/tests/unit/config-loader.test.ts +301 -0
  125. package/tests/unit/connectors-base.test.ts +48 -0
  126. package/tests/unit/cross-encoder-model.test.ts +198 -0
  127. package/tests/unit/cross-encoder.test.ts +173 -0
  128. package/tests/unit/db-connectors.test.ts +37 -0
  129. package/tests/unit/domain.test.ts +80 -0
  130. package/tests/unit/embedder.test.ts +151 -0
  131. package/tests/unit/evaluate.test.ts +85 -0
  132. package/tests/unit/explain.test.ts +73 -0
  133. package/tests/unit/golden.test.ts +97 -0
  134. package/tests/unit/graph-er.test.ts +173 -0
  135. package/tests/unit/hnsw-ann.test.ts +283 -0
  136. package/tests/unit/hubspot-connector.test.ts +118 -0
  137. package/tests/unit/ingest.test.ts +97 -0
  138. package/tests/unit/learned-blocking.test.ts +134 -0
  139. package/tests/unit/lineage.test.ts +135 -0
  140. package/tests/unit/match-one.test.ts +129 -0
  141. package/tests/unit/matchkey.test.ts +97 -0
  142. package/tests/unit/mcp-server.test.ts +183 -0
  143. package/tests/unit/memory.test.ts +119 -0
  144. package/tests/unit/pipeline.test.ts +118 -0
  145. package/tests/unit/pprl-protocol.test.ts +381 -0
  146. package/tests/unit/probabilistic.test.ts +494 -0
  147. package/tests/unit/profiler.test.ts +68 -0
  148. package/tests/unit/review-queue.test.ts +68 -0
  149. package/tests/unit/salesforce-connector.test.ts +148 -0
  150. package/tests/unit/scorer.test.ts +301 -0
  151. package/tests/unit/sensitivity.test.ts +154 -0
  152. package/tests/unit/standardize.test.ts +84 -0
  153. package/tests/unit/streaming.test.ts +82 -0
  154. package/tests/unit/transforms.test.ts +208 -0
  155. package/tests/unit/tui-widgets.test.ts +42 -0
  156. package/tests/unit/tui.test.ts +24 -0
  157. package/tests/unit/validate.test.ts +145 -0
  158. package/tests/unit/workers-parallel.test.ts +99 -0
  159. package/tests/unit/workers.test.ts +74 -0
  160. package/tsconfig.json +25 -0
  161. package/tsup.config.ts +37 -0
  162. package/vitest.config.ts +11 -0
@@ -0,0 +1,135 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import {
3
+ buildLineage,
4
+ lineageToJson,
5
+ lineageFromJson,
6
+ } from "../../src/core/lineage.js";
7
+ import { runDedupePipeline } from "../../src/core/pipeline.js";
8
+ import {
9
+ makeConfig,
10
+ makeMatchkeyConfig,
11
+ makeMatchkeyField,
12
+ } from "../../src/core/types.js";
13
+ import type { Row } from "../../src/core/types.js";
14
+
15
+ function buildTinyDedupeResult() {
16
+ // Exact match on lowercased email; the first two rows share an address.
17
+ const emailField = makeMatchkeyField({
18
+ field: "email",
19
+ transforms: ["lowercase"],
20
+ scorer: "exact",
21
+ });
22
+ const emailKey = makeMatchkeyConfig({
23
+ name: "email_exact",
24
+ type: "exact",
25
+ fields: [emailField],
26
+ });
27
+ const people: Row[] = [
28
+ { email: "a@x.com", name: "Alice Brown" },
29
+ { email: "a@x.com", name: "Alice B." },
30
+ { email: "b@x.com", name: "Bob Smith" },
31
+ ];
32
+ const config = makeConfig({ matchkeys: [emailKey] });
33
+ return runDedupePipeline(people, config);
34
+ }
35
+
36
+ describe("buildLineage", () => {
37
+ it("produces one edge per cluster in the DedupeResult", () => {
38
+ const result = buildTinyDedupeResult();
39
+ const bundle = buildLineage(result);
40
+ // There is at least one multi-member cluster -> at least one edge.
41
+ expect(bundle.edges.length).toBeGreaterThan(0);
42
+ expect(bundle.recordCount).toBe(bundle.edges.length);
43
+ });
44
+
45
+ it("edges carry cluster_id, source_row_ids, golden_row_id, and field provenance", () => {
46
+ const result = buildTinyDedupeResult();
47
+ const bundle = buildLineage(result);
48
+ const edge = bundle.edges[0]!;
49
+ expect(typeof edge.clusterId).toBe("number");
50
+ expect(Array.isArray(edge.sourceRowIds)).toBe(true);
51
+ expect(edge.sourceRowIds.length).toBeGreaterThanOrEqual(2);
52
+ expect(typeof edge.goldenRowId).toBe("number");
53
+ // Field provenance should include non-internal fields like email and name.
54
+ const keys = Object.keys(edge.fieldProvenance);
55
+ expect(keys.length).toBeGreaterThan(0);
56
+ for (const k of keys) {
57
+ const entry = edge.fieldProvenance[k]!;
58
+ expect(typeof entry.sourceRowId).toBe("number");
59
+ expect(typeof entry.strategy).toBe("string");
60
+ expect(typeof entry.confidence).toBe("number");
61
+ }
62
+ });
63
+
64
+ it("does not emit provenance entries for internal __-prefixed keys", () => {
65
+ const result = buildTinyDedupeResult();
66
+ const bundle = buildLineage(result);
67
+ for (const edge of bundle.edges) {
68
+ for (const k of Object.keys(edge.fieldProvenance)) {
69
+ expect(k.startsWith("__")).toBe(false);
70
+ }
71
+ }
72
+ });
73
+
74
+ it("defaultStrategy override propagates into field provenance", () => {
75
+ const result = buildTinyDedupeResult();
76
+ const bundle = buildLineage(result, { defaultStrategy: "first_non_null" });
77
+ const edge = bundle.edges[0];
78
+ // Assert presence explicitly so the test cannot pass without checking anything.
79
+ expect(edge).toBeDefined();
80
+ const anyEntry = Object.values(edge!.fieldProvenance)[0];
81
+ expect(anyEntry?.strategy).toBe("first_non_null");
82
+ });
83
+
84
+ it("does not render naturalLanguage by default", () => {
85
+ const result = buildTinyDedupeResult();
86
+ const bundle = buildLineage(result);
87
+ for (const edge of bundle.edges) {
88
+ expect(edge.naturalLanguage).toBeUndefined();
89
+ }
90
+ });
91
+
92
+ it("renders natural language when naturalLanguage: true", () => {
93
+ const result = buildTinyDedupeResult();
94
+ const bundle = buildLineage(result, { naturalLanguage: true });
95
+ expect(bundle.edges.length).toBeGreaterThan(0);
96
+ const edge = bundle.edges[0]!;
97
+ expect(edge.naturalLanguage).toBeDefined();
98
+ expect(edge.naturalLanguage).toMatch(/Cluster \d+/);
99
+ expect(edge.naturalLanguage).toMatch(/merged \d+ source records/);
100
+ expect(edge.naturalLanguage).toMatch(/golden row -?\d+/);
101
+ // Strongest contribution should mention a real field name — our fixture
102
+ // has `email` and `name` columns, and internal __ keys are filtered out.
103
+ expect(edge.naturalLanguage).toMatch(/Strongest contribution: (email|name)/);
104
+ });
105
+
106
+ it("naturalLanguage is a non-empty string on every edge", () => {
107
+ // Forcing a zero-field edge through buildLineage would be contrived, so that
108
+ // path is NOT exercised here; this only checks that every edge built from the
109
+ // normal fixture carries a rendered, non-empty description.
110
+ const result = buildTinyDedupeResult();
111
+ const bundle = buildLineage(result, { naturalLanguage: true });
112
+ expect(bundle.edges.length).toBeGreaterThan(0);
113
+ for (const edge of bundle.edges) {
114
+ expect(typeof edge.naturalLanguage).toBe("string");
115
+ expect((edge.naturalLanguage as string).length).toBeGreaterThan(0);
116
+ }
117
+ });
118
+ });
119
+
120
+ describe("lineageToJson / lineageFromJson", () => {
121
+ it("round-trips a lineage bundle", () => {
122
+ const result = buildTinyDedupeResult();
123
+ const original = buildLineage(result);
124
+ const json = lineageToJson(original);
125
+ const parsed = lineageFromJson(json);
126
+ expect(parsed.edges.length).toBe(original.edges.length);
127
+ expect(parsed.recordCount).toBe(original.recordCount);
128
+ expect(parsed.timestamp).toBe(original.timestamp);
129
+ });
130
+
131
+ it("lineageFromJson throws on malformed input", () => {
132
+ expect(() => lineageFromJson("{}")).toThrow(/Invalid lineage bundle/);
133
+ expect(() => lineageFromJson("null")).toThrow(/Invalid lineage bundle/);
134
+ });
135
+ });
@@ -0,0 +1,129 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import { matchOne, findExactMatchesOne } from "../../src/core/match-one.js";
3
+ import { makeMatchkeyConfig, makeMatchkeyField } from "../../src/core/types.js";
4
+ import type { Row } from "../../src/core/types.js";
5
+
6
+ function r(rowId: number, name: string, email?: string): Row {
7
+ return email === undefined ? { __row_id__: rowId, name } : { __row_id__: rowId, name, email };
8
+ } // the email key is left off entirely when no email is supplied
9
+
10
+ describe("matchOne", () => {
11
+ it("returns matches sorted by descending score, above threshold", () => {
12
+ const record: Row = { name: "John Smith" };
13
+ const rows: Row[] = [
14
+ r(0, "John Smith"),
15
+ r(1, "Jon Smith"),
16
+ r(2, "Zxqwer Zxqwer"),
17
+ ];
18
+ const mk = makeMatchkeyConfig({
19
+ name: "name",
20
+ type: "weighted",
21
+ threshold: 0.7,
22
+ fields: [
23
+ makeMatchkeyField({
24
+ field: "name",
25
+ scorer: "jaro_winkler",
26
+ transforms: ["lowercase"],
27
+ }),
28
+ ],
29
+ });
30
+ const hits = matchOne(record, rows, mk);
31
+ expect(hits.length).toBeGreaterThanOrEqual(2);
32
+ // Sorted: first hit is the exact match on row 0 (score 1.0).
33
+ expect(hits[0]!.rowId).toBe(0);
34
+ expect(hits[0]!.score).toBeCloseTo(1.0, 5);
35
+ // All hits respect threshold.
36
+ for (const h of hits) expect(h.score).toBeGreaterThanOrEqual(0.7);
37
+ // Very different row not included.
38
+ expect(hits.some((h) => h.rowId === 2)).toBe(false);
39
+ });
40
+
41
+ it("returns empty array on empty dataset", () => {
42
+ const mk = makeMatchkeyConfig({
43
+ name: "n",
44
+ type: "weighted",
45
+ fields: [makeMatchkeyField({ field: "name", scorer: "jaro_winkler" })],
46
+ });
47
+ const hits = matchOne({ name: "Alice" }, [], mk);
48
+ expect(hits).toEqual([]);
49
+ });
50
+
51
+ it("returns empty when nothing is above threshold", () => {
52
+ const record: Row = { name: "Alice Brown" };
53
+ const rows: Row[] = [r(0, "Zoltan Xiong"), r(1, "Yuri Nakamura")];
54
+ const mk = makeMatchkeyConfig({
55
+ name: "n",
56
+ type: "weighted",
57
+ threshold: 0.95,
58
+ fields: [makeMatchkeyField({ field: "name", scorer: "jaro_winkler" })],
59
+ });
60
+ const hits = matchOne(record, rows, mk);
61
+ expect(hits).toEqual([]);
62
+ });
63
+
64
+ it("threshold defaults to 0 when unset (returns all rows)", () => {
65
+ const record: Row = { name: "Alice" };
66
+ const rows: Row[] = [r(0, "Alice"), r(1, "Zoltan")];
67
+ // Construct without a threshold so matchOne's default-of-0 kicks in.
68
+ const mkNoThreshold = {
69
+ name: "n",
70
+ type: "weighted",
71
+ fields: [makeMatchkeyField({ field: "name", scorer: "jaro_winkler" })],
72
+ } as unknown as Parameters<typeof matchOne>[2];
73
+ const hits = matchOne(record, rows, mkNoThreshold);
74
+ expect(hits.length).toBe(2);
75
+ });
76
+ });
77
+
78
+ describe("findExactMatchesOne", () => {
79
+ it("finds exact composite-key matches only, with score 1.0", () => {
80
+ const record: Row = { email: "alice@example.com" };
81
+ const rows: Row[] = [
82
+ { __row_id__: 0, email: "alice@example.com" },
83
+ { __row_id__: 1, email: "Alice@Example.com" }, // matches after lowercase
84
+ { __row_id__: 2, email: "bob@example.com" },
85
+ ];
86
+ const mk = makeMatchkeyConfig({
87
+ name: "email_exact",
88
+ type: "exact",
89
+ fields: [
90
+ makeMatchkeyField({
91
+ field: "email",
92
+ transforms: ["lowercase"],
93
+ scorer: "exact",
94
+ }),
95
+ ],
96
+ });
97
+ const hits = findExactMatchesOne(record, rows, mk);
98
+ const ids = hits.map((h) => h.rowId).sort((a, b) => a - b); // numeric, not lexicographic
99
+ expect(ids).toEqual([0, 1]);
100
+ for (const h of hits) expect(h.score).toBe(1.0);
101
+ });
102
+
103
+ it("returns empty array when probe has null transform for any field", () => {
104
+ const record: Row = { email: null };
105
+ const rows: Row[] = [{ __row_id__: 0, email: "alice@example.com" }];
106
+ const mk = makeMatchkeyConfig({
107
+ name: "email_exact",
108
+ type: "exact",
109
+ fields: [makeMatchkeyField({ field: "email", transforms: [], scorer: "exact" })],
110
+ });
111
+ const hits = findExactMatchesOne(record, rows, mk);
112
+ expect(hits).toEqual([]);
113
+ });
114
+
115
+ it("skips rows where any field transforms to null", () => {
116
+ const record: Row = { email: "alice@example.com" };
117
+ const rows: Row[] = [
118
+ { __row_id__: 0, email: null },
119
+ { __row_id__: 1, email: "alice@example.com" },
120
+ ];
121
+ const mk = makeMatchkeyConfig({
122
+ name: "email_exact",
123
+ type: "exact",
124
+ fields: [makeMatchkeyField({ field: "email", scorer: "exact" })],
125
+ });
126
+ const hits = findExactMatchesOne(record, rows, mk);
127
+ expect(hits.map((h) => h.rowId)).toEqual([1]);
128
+ });
129
+ });
@@ -0,0 +1,97 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import {
3
+ computeMatchkeyValue,
4
+ computeMatchkeys,
5
+ addRowIds,
6
+ addSourceColumn,
7
+ } from "../../src/core/index.js";
8
+ import type { MatchkeyConfig, Row } from "../../src/core/index.js";
9
+
10
+ describe("computeMatchkeyValue", () => {
11
+ it("single field", () => {
12
+ const row: Row = { email: "john@example.com" };
13
+ const mk: MatchkeyConfig = {
14
+ name: "email",
15
+ type: "exact",
16
+ fields: [{ field: "email", transforms: ["lowercase"], scorer: "exact", weight: 1.0 }],
17
+ };
18
+ expect(computeMatchkeyValue(row, mk)).toBe("john@example.com");
19
+ });
20
+
21
+ it("multiple fields joined with ||", () => {
22
+ const row: Row = { first: "John", last: "Smith" };
23
+ const mk: MatchkeyConfig = {
24
+ name: "name",
25
+ type: "exact",
26
+ fields: [
27
+ { field: "first", transforms: [], scorer: "exact", weight: 1.0 },
28
+ { field: "last", transforms: [], scorer: "exact", weight: 1.0 },
29
+ ],
30
+ };
31
+ expect(computeMatchkeyValue(row, mk)).toBe("John||Smith");
32
+ });
33
+
34
+ it("null field returns null matchkey", () => {
35
+ const row: Row = { email: null };
36
+ const mk: MatchkeyConfig = {
37
+ name: "email",
38
+ type: "exact",
39
+ fields: [{ field: "email", transforms: [], scorer: "exact", weight: 1.0 }],
40
+ };
41
+ expect(computeMatchkeyValue(row, mk)).toBe(null);
42
+ });
43
+
44
+ it("applies transform chain", () => {
45
+ const row: Row = { email: " JOHN@X.COM " };
46
+ const mk: MatchkeyConfig = {
47
+ name: "email",
48
+ type: "exact",
49
+ fields: [
50
+ { field: "email", transforms: ["lowercase", "strip"], scorer: "exact", weight: 1.0 },
51
+ ],
52
+ };
53
+ expect(computeMatchkeyValue(row, mk)).toBe("john@x.com");
54
+ });
55
+ });
56
+
57
+ describe("computeMatchkeys", () => {
58
+ it("adds __mk_{name}__ columns", () => {
59
+ const rows: Row[] = [{ email: "a@x.com" }, { email: "b@x.com" }];
60
+ const mks: MatchkeyConfig[] = [
61
+ {
62
+ name: "email_mk",
63
+ type: "exact",
64
+ fields: [{ field: "email", transforms: [], scorer: "exact", weight: 1.0 }],
65
+ },
66
+ ];
67
+ const out = computeMatchkeys(rows, mks);
68
+ expect(out[0]!.__mk_email_mk__).toBe("a@x.com");
69
+ expect(out[1]!.__mk_email_mk__).toBe("b@x.com");
70
+ });
71
+ });
72
+
73
+ describe("addRowIds", () => {
74
+ it("adds sequential __row_id__ starting at 0", () => {
75
+ const rows: Row[] = [{ a: 1 }, { a: 2 }];
76
+ const out = addRowIds(rows);
77
+ expect(out[0]!.__row_id__).toBe(0);
78
+ expect(out[1]!.__row_id__).toBe(1);
79
+ });
80
+
81
+ it("supports offset", () => {
82
+ const rows: Row[] = [{ a: 1 }, { a: 2 }];
83
+ const out = addRowIds(rows, 10);
84
+ expect(out[0]!.__row_id__).toBe(10);
85
+ expect(out[1]!.__row_id__).toBe(11);
86
+ });
87
+ });
88
+
89
+ describe("addSourceColumn", () => {
90
+ it("adds __source__ to every row", () => {
91
+ const rows: Row[] = [{ a: 1 }, { a: 2 }];
92
+ const out = addSourceColumn(rows, "crm");
93
+ for (const r of out) {
94
+ expect(r.__source__).toBe("crm");
95
+ }
96
+ });
97
+ });
@@ -0,0 +1,183 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import { TOOLS, handleTool } from "../../src/node/mcp/server.js";
3
+
4
+ describe("MCP server — TOOLS metadata", () => {
5
+ it("exports a non-empty array of tool definitions", () => {
6
+ expect(Array.isArray(TOOLS)).toBe(true);
7
+ expect(TOOLS.length).toBeGreaterThan(0);
8
+ });
9
+
10
+ it("each tool has name, description, inputSchema", () => {
11
+ for (const tool of TOOLS) {
12
+ expect(typeof tool.name).toBe("string");
13
+ expect(tool.name.length).toBeGreaterThan(0);
14
+ expect(typeof tool.description).toBe("string");
15
+ expect(tool.description.length).toBeGreaterThan(0);
16
+ expect(tool.inputSchema).toBeTypeOf("object");
17
+ expect(tool.inputSchema).not.toBeNull();
18
+ }
19
+ });
20
+
21
+ it("every tool name is unique", () => {
22
+ const names = TOOLS.map((t) => t.name);
23
+ const unique = new Set(names);
24
+ expect(unique.size).toBe(names.length);
25
+ });
26
+
27
+ it("includes core tools (dedupe, score_strings, profile, etc.)", () => {
28
+ const names = new Set(TOOLS.map((t) => t.name));
29
+ for (const expected of [
30
+ "dedupe",
31
+ "match",
32
+ "score_strings",
33
+ "explain_pair",
34
+ "profile",
35
+ "list_scorers",
36
+ "list_transforms",
37
+ "list_strategies",
38
+ "server_info",
39
+ ]) {
40
+ expect(names.has(expected)).toBe(true);
41
+ }
42
+ });
43
+ });
44
+
45
+ describe("MCP server — handleTool dispatcher", () => {
46
+ it("score_strings with jaro_winkler scores John~Jon near 0.94", async () => {
47
+ const result = (await handleTool("score_strings", {
48
+ a: "John",
49
+ b: "Jon",
50
+ scorer: "jaro_winkler",
51
+ })) as { score: number; scorer: string };
52
+ expect(result).toMatchObject({ scorer: "jaro_winkler" });
53
+ expect(typeof result.score).toBe("number");
54
+ expect(result.score).toBeGreaterThan(0.9);
55
+ expect(result.score).toBeLessThanOrEqual(1.0);
56
+ });
57
+
58
+ it("score_strings with missing scorer defaults to jaro_winkler", async () => {
59
+ const result = (await handleTool("score_strings", {
60
+ a: "x",
61
+ b: "y",
62
+ })) as { score: number; scorer: string };
63
+ expect(result.scorer).toBe("jaro_winkler");
64
+ expect(typeof result.score).toBe("number");
65
+ });
66
+
67
+ it("explain_pair returns an NL explanation", async () => {
68
+ const result = (await handleTool("explain_pair", {
69
+ row_a: { name: "John Smith", email: "j@x.com" },
70
+ row_b: { name: "Jon Smith", email: "j@x.com" },
71
+ fields: [
72
+ { field: "name", scorer: "jaro_winkler", weight: 1.0 },
73
+ { field: "email", scorer: "exact", weight: 1.0 },
74
+ ],
75
+ })) as {
76
+ score: number;
77
+ confidence: number;
78
+ explanation: string;
79
+ field_scores: unknown;
80
+ };
81
+ expect(typeof result.score).toBe("number");
82
+ // confidence may be numeric or categorical ("high"/"medium"/"low")
83
+ expect(["number", "string"]).toContain(typeof result.confidence);
84
+ expect(typeof result.explanation).toBe("string");
85
+ expect(result.explanation.length).toBeGreaterThan(0);
86
+ expect(result.field_scores).toBeDefined();
87
+ });
88
+
89
+ it("profile with a nonexistent path returns { error } rather than throwing", async () => {
90
+ // Note: profile tool takes a `path`. Rows-based profile is only via /profile REST.
91
+ // Verify that profile without a valid path returns an error object (not throws).
92
+ const result = (await handleTool("profile", { path: "nonexistent_file_xyz.csv" })) as {
93
+ error?: string;
94
+ };
95
+ expect(typeof result).toBe("object");
96
+ expect(result).not.toBeNull();
97
+ // A missing file cannot be profiled, so the error shape is the only valid outcome.
98
+ expect(typeof result.error).toBe("string");
99
+ });
100
+
101
+ it("list_scorers returns an array of scorer names", async () => {
102
+ const result = (await handleTool("list_scorers", {})) as { scorers: string[] };
103
+ expect(Array.isArray(result.scorers)).toBe(true);
104
+ expect(result.scorers.length).toBeGreaterThan(0);
105
+ expect(result.scorers).toContain("jaro_winkler");
106
+ });
107
+
108
+ it("list_transforms returns array", async () => {
109
+ const result = (await handleTool("list_transforms", {})) as { transforms: string[] };
110
+ expect(Array.isArray(result.transforms)).toBe(true);
111
+ expect(result.transforms.length).toBeGreaterThan(0);
112
+ });
113
+
114
+ it("list_strategies returns array", async () => {
115
+ const result = (await handleTool("list_strategies", {})) as { strategies: string[] };
116
+ expect(Array.isArray(result.strategies)).toBe(true);
117
+ expect(result.strategies.length).toBeGreaterThan(0);
118
+ });
119
+
120
+ it("server_info returns metadata with tool_count", async () => {
121
+ const result = (await handleTool("server_info", {})) as {
122
+ name: string;
123
+ tool_count: number;
124
+ };
125
+ expect(result.name).toBe("goldenmatch-js");
126
+ expect(result.tool_count).toBe(TOOLS.length);
127
+ });
128
+
129
+ it("unknown tool returns { error } rather than throwing", async () => {
130
+ const result = (await handleTool("nonexistent_tool_xyz", {})) as { error: string };
131
+ expect(typeof result).toBe("object");
132
+ expect(typeof result.error).toBe("string");
133
+ expect(result.error).toMatch(/unknown/i);
134
+ });
135
+
136
+ it("path traversal via '..' is rejected (error, not crash)", async () => {
137
+ const result = (await handleTool("read_file", {
138
+ file_path: "../../../etc/passwd",
139
+ path: "../../../etc/passwd",
140
+ })) as { error?: string };
141
+ expect(typeof result).toBe("object");
142
+ expect(typeof result.error).toBe("string");
143
+ expect(result.error).toMatch(/outside|not a|no such|enoent/i);
144
+ });
145
+
146
+ it("absolute path outside cwd is rejected", async () => {
147
+ // Pick a path guaranteed to be outside cwd on both Windows and POSIX.
148
+ const outsidePath =
149
+ process.platform === "win32" ? "C:\\Windows\\System32\\drivers\\etc\\hosts" : "/etc/passwd";
150
+ const result = (await handleTool("read_file", { path: outsidePath })) as {
151
+ error?: string;
152
+ };
153
+ expect(typeof result).toBe("object");
154
+ expect(typeof result.error).toBe("string");
155
+ expect(result.error).toMatch(/outside|enoent|no such|not found/i);
156
+ });
157
+
158
+ it("write_csv with non-array rows returns error (not crash)", async () => {
159
+ const result = (await handleTool("write_csv", {
160
+ path: "some_output.csv",
161
+ rows: "not-an-array",
162
+ })) as { error?: string };
163
+ expect(typeof result).toBe("object");
164
+ expect(typeof result.error).toBe("string");
165
+ });
166
+
167
+ it("score_pair with missing rows returns error", async () => {
168
+ const result = (await handleTool("score_pair", {
169
+ fields: [{ field: "name" }],
170
+ })) as { error?: string };
171
+ expect(typeof result.error).toBe("string");
172
+ });
173
+
174
+ it("score_pair with valid inputs returns a score", async () => {
175
+ const result = (await handleTool("score_pair", {
176
+ row_a: { name: "John" },
177
+ row_b: { name: "Jon" },
178
+ fields: [{ field: "name", scorer: "jaro_winkler", weight: 1.0 }],
179
+ })) as { score: number; field_count: number };
180
+ expect(typeof result.score).toBe("number");
181
+ expect(result.field_count).toBe(1);
182
+ });
183
+ });
@@ -0,0 +1,119 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import { MemoryStore, MemoryLearner } from "../../src/core/index.js";
3
+ import type { Correction, MatchkeyConfig } from "../../src/core/index.js";
4
+
5
+ // Test factory: only ids, verdict and score vary; the other metadata is fixed.
6
+ function makeCorrection(
7
+ rowIdA: number,
8
+ rowIdB: number,
9
+ verdict: "match" | "no_match",
10
+ score: number,
11
+ ): Correction {
12
+ const feature = "overall";
13
+ const trust = 0.9;
14
+ const source = "test";
15
+ const timestamp = Date.now();
16
+ return {
17
+ rowIdA, rowIdB, verdict,
18
+ feature, score, timestamp,
19
+ trust, source,
20
+ };
21
+ }
22
+
23
+ describe("MemoryStore", () => {
24
+ it("add + list + count", () => {
25
+ const store = new MemoryStore();
26
+ expect(store.count()).toBe(0);
27
+ store.add(makeCorrection(1, 2, "match", 0.9));
28
+ store.add(makeCorrection(3, 4, "no_match", 0.3));
29
+ expect(store.count()).toBe(2);
30
+ expect(store.list().length).toBe(2);
31
+ });
32
+
33
+ it("listMatches and listNonMatches", () => {
34
+ const store = new MemoryStore();
35
+ store.add(makeCorrection(1, 2, "match", 0.9));
36
+ store.add(makeCorrection(3, 4, "no_match", 0.3));
37
+ expect(store.listMatches().length).toBe(1);
38
+ expect(store.listNonMatches().length).toBe(1);
39
+ });
40
+
41
+ it("clear resets the store", () => {
42
+ const store = new MemoryStore();
43
+ store.add(makeCorrection(1, 2, "match", 0.9));
44
+ store.clear();
45
+ expect(store.count()).toBe(0);
46
+ });
47
+
48
+ it("upsert with higher trust replaces existing", () => {
49
+ const store = new MemoryStore();
50
+ const c1: Correction = {
51
+ rowIdA: 1,
52
+ rowIdB: 2,
53
+ verdict: "match",
54
+ feature: "name",
55
+ score: 0.9,
56
+ timestamp: 1000,
57
+ trust: 0.5,
58
+ source: "a",
59
+ };
60
+ const c2: Correction = { ...c1, trust: 0.9, source: "b", timestamp: 2000 };
61
+ store.upsert(c1);
62
+ store.upsert(c2);
63
+ expect(store.count()).toBe(1);
64
+ expect(store.list()[0]!.source).toBe("b");
65
+ });
66
+ });
67
+
68
+ describe("MemoryLearner", () => {
69
+ it("tunes threshold when given >= 10 corrections with mixed verdicts", () => {
70
+ const baseline: MatchkeyConfig = {
71
+ name: "m",
72
+ type: "weighted",
73
+ threshold: 0.85,
74
+ fields: [{ field: "name", transforms: [], scorer: "jaro_winkler", weight: 1.0 }],
75
+ };
76
+ const corrections: Correction[] = [];
77
+ // 10 positives with score > 0.8, 10 negatives with score < 0.7
78
+ for (let i = 0; i < 10; i++) {
79
+ corrections.push(makeCorrection(i, i + 100, "match", 0.85));
80
+ corrections.push(makeCorrection(i + 200, i + 300, "no_match", 0.4));
81
+ }
82
+ const learner = new MemoryLearner();
83
+ const params = learner.learn(corrections, baseline);
84
+ expect(params.correctionCount).toBe(20);
85
+ expect(params.threshold).not.toBeUndefined();
86
+ // Learned threshold must land in the tolerant [0.5, 0.95] band asserted below.
87
+ expect(params.threshold!).toBeGreaterThanOrEqual(0.5);
88
+ expect(params.threshold!).toBeLessThanOrEqual(0.95);
89
+ });
90
+
91
+ it("returns no threshold when fewer than 10 corrections", () => {
92
+ const baseline: MatchkeyConfig = {
93
+ name: "m",
94
+ type: "weighted",
95
+ threshold: 0.85,
96
+ fields: [{ field: "name", transforms: [], scorer: "jaro_winkler", weight: 1.0 }],
97
+ };
98
+ const corrections = [makeCorrection(1, 2, "match", 0.9)];
99
+ const learner = new MemoryLearner();
100
+ const params = learner.learn(corrections, baseline);
101
+ expect(params.threshold).toBeUndefined();
102
+ });
103
+
104
+ it("returns no threshold when all verdicts are the same", () => {
105
+ const baseline: MatchkeyConfig = {
106
+ name: "m",
107
+ type: "weighted",
108
+ threshold: 0.85,
109
+ fields: [{ field: "name", transforms: [], scorer: "jaro_winkler", weight: 1.0 }],
110
+ };
111
+ const corrections: Correction[] = [];
112
+ for (let i = 0; i < 15; i++) {
113
+ corrections.push(makeCorrection(i, i + 100, "match", 0.9));
114
+ }
115
+ const learner = new MemoryLearner();
116
+ const params = learner.learn(corrections, baseline);
117
+ expect(params.threshold).toBeUndefined();
118
+ });
119
+ });