goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,412 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* api/server.ts -- GoldenMatch REST API server (node:http).
|
|
3
|
+
*
|
|
4
|
+
* Node-only: uses node:http, node:path. NOT edge-safe.
|
|
5
|
+
*
|
|
6
|
+
* Endpoints:
|
|
7
|
+
* GET /health - liveness check
|
|
8
|
+
* POST /dedupe - dedupe a batch of rows (JSON body)
|
|
9
|
+
* POST /match - match target vs reference
|
|
10
|
+
* POST /score - score two strings
|
|
11
|
+
* POST /explain - explain a pair
|
|
12
|
+
* POST /profile - profile a batch of rows
|
|
13
|
+
* POST /clusters - return clusters from dedupe
|
|
14
|
+
* GET /reviews - list pending review items
|
|
15
|
+
* POST /reviews/decide - accept/reject a review item
* POST /reviews/enqueue - add a candidate pair to the review queue
|
|
16
|
+
*
|
|
17
|
+
* Ports ideas from goldenmatch/api/server.py.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
import {
|
|
21
|
+
createServer,
|
|
22
|
+
type IncomingMessage,
|
|
23
|
+
type ServerResponse,
|
|
24
|
+
} from "node:http";
|
|
25
|
+
import { resolve, isAbsolute, sep } from "node:path";
|
|
26
|
+
import { dedupe, match, scoreStrings } from "../../core/api.js";
|
|
27
|
+
import type { Row } from "../../core/types.js";
|
|
28
|
+
import {
|
|
29
|
+
makeMatchkeyConfig,
|
|
30
|
+
makeMatchkeyField,
|
|
31
|
+
} from "../../core/types.js";
|
|
32
|
+
import { explainPair } from "../../core/explain.js";
|
|
33
|
+
import { profileRows } from "../../core/profiler.js";
|
|
34
|
+
|
|
35
|
+
// ---------------------------------------------------------------------------
|
|
36
|
+
// In-memory review queue
|
|
37
|
+
// ---------------------------------------------------------------------------
|
|
38
|
+
|
|
39
|
+
/**
 * A candidate record pair held in the in-memory review queue.
 *
 * All fields are fixed once the item is queued except `status` and
 * `decidedAt`, which `ReviewQueue.decide` updates in place.
 */
interface ReviewItem {
  /** Key under which the item is stored in the queue. */
  readonly id: string;
  /** Numeric id of the first record of the pair, as supplied by the caller. */
  readonly idA: number;
  /** Numeric id of the second record of the pair, as supplied by the caller. */
  readonly idB: number;
  /** Score attached to the pair when it was queued. */
  readonly score: number;
  /** Row data for the first record. */
  readonly rowA: Row;
  /** Row data for the second record. */
  readonly rowB: Row;
  /** Review state; every item starts out as "pending". */
  status: "pending" | "accepted" | "rejected";
  /** ISO-8601 timestamp of the decision; absent while the item is pending. */
  decidedAt?: string;
}
|
|
49
|
+
|
|
50
|
+
/**
 * In-memory review queue: a map from item id to `ReviewItem`.
 *
 * Nothing is persisted; the contents live only as long as the process.
 */
class ReviewQueue {
  private items = new Map<string, ReviewItem>();

  /**
   * Store a pair as a pending review item and return the stored record.
   *
   * When no `id` is given the key defaults to `"<idA>:<idB>"`. An existing
   * item with the same key is replaced by the new pending record.
   */
  enqueue(item: Omit<ReviewItem, "status" | "id"> & { id?: string }): ReviewItem {
    const { idA, idB, score, rowA, rowB } = item;
    const key = item.id ?? `${idA}:${idB}`;
    const entry: ReviewItem = { id: key, idA, idB, score, rowA, rowB, status: "pending" };
    this.items.set(key, entry);
    return entry;
  }

  /** Items still awaiting a decision, in insertion order. */
  pending(): ReviewItem[] {
    const open: ReviewItem[] = [];
    for (const entry of this.items.values()) {
      if (entry.status === "pending") open.push(entry);
    }
    return open;
  }

  /**
   * Record a decision on the item stored under `id`.
   *
   * @returns the updated item, or `null` when no item has that id.
   */
  decide(id: string, accept: boolean): ReviewItem | null {
    const entry = this.items.get(id);
    if (entry === undefined) return null;
    entry.status = accept ? "accepted" : "rejected";
    entry.decidedAt = new Date().toISOString();
    return entry;
  }

  /** Every stored item regardless of status, in insertion order. */
  all(): ReviewItem[] {
    return Array.from(this.items.values());
  }
}

/** Process-wide queue instance used by the review endpoints. */
const reviewQueue = new ReviewQueue();
|
|
86
|
+
|
|
87
|
+
// ---------------------------------------------------------------------------
|
|
88
|
+
// Helpers
|
|
89
|
+
// ---------------------------------------------------------------------------
|
|
90
|
+
|
|
91
|
+
/**
 * Resolve `raw` to an absolute path and ensure it stays inside the current
 * working directory.
 *
 * @param raw - Absolute path, or a path relative to `process.cwd()`.
 * @returns The normalized absolute path.
 * @throws Error when the resolved path lies outside the working directory.
 */
export function sanitizePath(raw: string): string {
  const resolved = isAbsolute(raw) ? resolve(raw) : resolve(process.cwd(), raw);
  const cwd = resolve(process.cwd());
  // Compare against "cwd + separator" so cwd="/app/foo" does NOT accept
  // "/app/foobar". A filesystem root ("/" or "C:\") already ends with the
  // separator; appending another one would produce "//" and reject every
  // path when the process runs from the root directory.
  const boundary = cwd.endsWith(sep) ? cwd : cwd + sep;
  if (resolved !== cwd && !resolved.startsWith(boundary)) {
    throw new Error(`Path '${raw}' is outside the working directory`);
  }
  return resolved;
}
|
|
100
|
+
|
|
101
|
+
/**
 * Read the whole request body and decode it as UTF-8.
 *
 * The raw bytes of all chunks are concatenated before decoding. Decoding
 * chunk by chunk would corrupt any multi-byte character whose bytes are
 * split across two chunks.
 *
 * @param req - Incoming request (any async iterable of Buffer/string chunks).
 * @returns The body text; an empty string when the request has no body.
 */
async function readBody(req: IncomingMessage): Promise<string> {
  const parts: Buffer[] = [];
  for await (const chunk of req) {
    // Chunks are strings only when an encoding was set on the stream.
    parts.push(typeof chunk === "string" ? Buffer.from(chunk, "utf8") : (chunk as Buffer));
  }
  return Buffer.concat(parts).toString("utf8");
}
|
|
108
|
+
|
|
109
|
+
/**
 * Read the request body and parse it as a JSON object.
 *
 * An empty body yields `{}`. Any parse failure, and any JSON value that is
 * not a plain object (null, array, primitive), is reported as an Error whose
 * message starts with "invalid JSON body: ".
 */
async function readJsonBody(req: IncomingMessage): Promise<Record<string, unknown>> {
  const text = await readBody(req);
  if (text === "") return {};

  let value: unknown;
  let failure: string | undefined;
  try {
    value = JSON.parse(text);
  } catch (err) {
    failure = err instanceof Error ? err.message : String(err);
  }

  const isObject = value !== null && typeof value === "object" && !Array.isArray(value);
  if (failure === undefined && !isObject) {
    failure = "request body must be a JSON object";
  }
  if (failure !== undefined) {
    throw new Error(`invalid JSON body: ${failure}`);
  }
  return value as Record<string, unknown>;
}
|
|
123
|
+
|
|
124
|
+
/**
 * Send `data` as a JSON response and end it.
 *
 * @param res - Response to write to.
 * @param status - HTTP status code.
 * @param data - Value serialized with `JSON.stringify`.
 */
function sendJson(res: ServerResponse, status: number, data: unknown): void {
  res.statusCode = status;
  res.setHeader("Content-Type", "application/json");
  const payload = JSON.stringify(data);
  res.end(payload);
}
|
|
129
|
+
|
|
130
|
+
/**
 * Validate that `v` is an array of row objects and return it typed as `Row[]`.
 *
 * Every element must be a non-null, non-array object, which is what the error
 * message promises. Rejecting bad elements here gives callers a clear message
 * instead of a failure deeper in the matching pipeline.
 *
 * @param v - Untrusted value taken from a request body.
 * @param label - Field name used in the error message.
 * @throws Error when `v` is not an array or contains a non-object element.
 */
function asRowArray(v: unknown, label: string): Row[] {
  const isRowLike = (item: unknown): boolean =>
    typeof item === "object" && item !== null && !Array.isArray(item);
  if (!Array.isArray(v) || !v.every(isRowLike)) {
    throw new Error(`${label} must be an array of objects`);
  }
  return v as Row[];
}
|
|
134
|
+
|
|
135
|
+
/**
 * Shorthand matching options read from a request body and passed on to
 * `dedupe` / `match`. Every field is optional.
 */
interface ShorthandOpts {
  /** Column names given under the request's `exact` key. */
  exact?: readonly string[];
  /** Numeric value per column name, from the request's `fuzzy` object. */
  fuzzy?: Readonly<Record<string, number>>;
  /** Column names given under the request's `blocking` key. */
  blocking?: readonly string[];
  /** Numeric `threshold` from the request. */
  threshold?: number;
}
|
|
141
|
+
|
|
142
|
+
/**
 * Pull the shorthand matching options out of a parsed request body.
 *
 * Only well-formed values are copied; anything else is ignored, not rejected:
 * - `exact` / `blocking`: arrays, with each element converted to a string.
 * - `fuzzy`: a plain object. An entry is kept only when its value is a finite
 *   number or a non-blank string that parses to one. Values such as `null`,
 *   `""`, `true` or `[]` are dropped, not coerced to 0 or 1.
 * - `threshold`: copied only when it is a number.
 *
 * @param body - Parsed JSON request body.
 * @returns The options that were present and valid.
 */
function extractShorthand(body: Record<string, unknown>): ShorthandOpts {
  const out: ShorthandOpts = {};
  const { exact, blocking, fuzzy, threshold } = body;

  if (Array.isArray(exact)) out.exact = exact.map(String);
  if (Array.isArray(blocking)) out.blocking = blocking.map(String);

  if (fuzzy !== null && typeof fuzzy === "object" && !Array.isArray(fuzzy)) {
    const perField: Record<string, number> = {};
    for (const [field, value] of Object.entries(fuzzy as Record<string, unknown>)) {
      // Number() maps null, "", [] and booleans to 0/1, so coerce strings only.
      const n =
        typeof value === "number"
          ? value
          : typeof value === "string" && value.trim() !== ""
            ? Number(value)
            : Number.NaN;
      if (Number.isFinite(n)) perField[field] = n;
    }
    out.fuzzy = perField;
  }

  if (typeof threshold === "number") out.threshold = threshold;
  return out;
}
|
|
162
|
+
|
|
163
|
+
// ---------------------------------------------------------------------------
|
|
164
|
+
// Route handlers
|
|
165
|
+
// ---------------------------------------------------------------------------
|
|
166
|
+
|
|
167
|
+
/** Signature shared by all per-route handlers. */
type ApiRouteHandler = (req: IncomingMessage, res: ServerResponse) => Promise<void>;

/** One entry of the routing table: exact method + exact pathname. */
interface ApiRoute {
  readonly method: string;
  readonly path: string;
  readonly run: ApiRouteHandler;
}

/** GET /health -- liveness check. */
async function healthRoute(_req: IncomingMessage, res: ServerResponse): Promise<void> {
  sendJson(res, 200, { status: "ok", service: "goldenmatch-js" });
}

/** POST /dedupe -- dedupe `rows` and return stats, golden records, dupes and uniques. */
async function dedupeRoute(req: IncomingMessage, res: ServerResponse): Promise<void> {
  const payload = await readJsonBody(req);
  const records = asRowArray(payload["rows"], "rows");
  const outcome = dedupe(records, extractShorthand(payload));
  const { stats } = outcome;
  sendJson(res, 200, {
    stats: {
      total_records: stats.totalRecords,
      total_clusters: stats.totalClusters,
      match_rate: stats.matchRate,
      matched_records: stats.matchedRecords,
      unique_records: stats.uniqueRecords,
    },
    golden_records: outcome.goldenRecords,
    dupes: outcome.dupes,
    unique: outcome.unique,
  });
}

/** POST /match -- match `target` rows against `reference` rows. */
async function matchRoute(req: IncomingMessage, res: ServerResponse): Promise<void> {
  const payload = await readJsonBody(req);
  const target = asRowArray(payload["target"], "target");
  const reference = asRowArray(payload["reference"], "reference");
  const options = extractShorthand(payload);
  // Each row is copied and tagged with the side it came from.
  const tagWith = (source: string) => (row: Row) => ({ ...row, __source__: source });
  const { matched, unmatched, stats } = match(
    target.map(tagWith("target")),
    reference.map(tagWith("reference")),
    options,
  );
  sendJson(res, 200, { matched, unmatched, stats });
}

/** POST /score -- score strings `a` and `b` with the requested scorer. */
async function scoreRoute(req: IncomingMessage, res: ServerResponse): Promise<void> {
  const payload = await readJsonBody(req);
  const left = String(payload["a"] ?? "");
  const right = String(payload["b"] ?? "");
  const requested = payload["scorer"];
  const scorer = typeof requested === "string" ? requested : "jaro_winkler";
  sendJson(res, 200, { scorer, score: scoreStrings(left, right, scorer) });
}

/** Turn one element of the /explain `fields` array into a matchkey field. */
function parseExplainField(entry: unknown) {
  if (!entry || typeof entry !== "object") {
    throw new Error("each field must be an object");
  }
  const { field, transforms, scorer, weight } = entry as Record<string, unknown>;
  if (typeof field !== "string") {
    throw new Error("each field needs a 'field' property");
  }
  return makeMatchkeyField({
    field,
    transforms: Array.isArray(transforms)
      ? (transforms as unknown[]).map(String)
      : ["lowercase", "strip"],
    scorer: typeof scorer === "string" ? scorer : "jaro_winkler",
    weight: typeof weight === "number" ? weight : 1.0,
  });
}

/** POST /explain -- explain how `row_a` and `row_b` score on an ad-hoc weighted matchkey. */
async function explainRoute(req: IncomingMessage, res: ServerResponse): Promise<void> {
  const payload = await readJsonBody(req);
  const rowA = payload["row_a"] as Row | undefined;
  const rowB = payload["row_b"] as Row | undefined;
  if (!(rowA && rowB)) throw new Error("row_a and row_b are required");

  const rawFields = payload["fields"];
  if (!Array.isArray(rawFields)) {
    throw new Error("fields must be an array");
  }
  const fields = rawFields.map((entry) => parseExplainField(entry));

  const rawThreshold = payload["threshold"];
  const threshold = typeof rawThreshold === "number" ? rawThreshold : 0.85;
  const mk = makeMatchkeyConfig({ name: "adhoc", type: "weighted", fields, threshold });

  const { score, confidence, explanation, fieldScores } = explainPair(rowA, rowB, mk);
  sendJson(res, 200, { score, confidence, explanation, field_scores: fieldScores });
}

/** POST /profile -- per-column profile of `rows`. */
async function profileRoute(req: IncomingMessage, res: ServerResponse): Promise<void> {
  const payload = await readJsonBody(req);
  const profile = profileRows(asRowArray(payload["rows"], "rows"));
  sendJson(res, 200, {
    row_count: profile.rowCount,
    columns: profile.columns.map((col) => ({
      name: col.name,
      inferred_type: col.inferredType,
      null_count: col.nullCount,
      null_rate: col.nullRate,
      distinct_count: col.distinctCount,
      cardinality_ratio: col.cardinalityRatio,
      avg_length: col.avgLength,
      max_length: col.maxLength,
      sample_values: col.sampleValues,
    })),
  });
}

/** POST /clusters -- dedupe `rows` and return only the cluster summaries. */
async function clustersRoute(req: IncomingMessage, res: ServerResponse): Promise<void> {
  const payload = await readJsonBody(req);
  const records = asRowArray(payload["rows"], "rows");
  const outcome = dedupe(records, extractShorthand(payload));
  const clusters = Array.from(outcome.clusters.entries(), ([clusterId, info]) => ({
    cluster_id: clusterId,
    size: info.size,
    confidence: info.confidence,
    quality: info.clusterQuality,
    members: info.members,
  }));
  sendJson(res, 200, { cluster_count: clusters.length, clusters });
}

/** GET /reviews -- list the pending review items. */
async function listReviewsRoute(_req: IncomingMessage, res: ServerResponse): Promise<void> {
  sendJson(res, 200, { pending: reviewQueue.pending() });
}

/** POST /reviews/decide -- accept or reject the review item named by `id`. */
async function decideReviewRoute(req: IncomingMessage, res: ServerResponse): Promise<void> {
  const payload = await readJsonBody(req);
  const id = String(payload["id"] ?? "");
  const accept = !!payload["accept"];
  if (id === "") throw new Error("id is required");
  const decided = reviewQueue.decide(id, accept);
  if (!decided) {
    sendJson(res, 404, { error: `review item ${id} not found` });
    return;
  }
  sendJson(res, 200, { decided });
}

/** POST /reviews/enqueue -- add a pair to the review queue. */
async function enqueueReviewRoute(req: IncomingMessage, res: ServerResponse): Promise<void> {
  const payload = await readJsonBody(req);
  const idA = Number(payload["id_a"]);
  const idB = Number(payload["id_b"]);
  const score = Number(payload["score"]);
  const rowA = payload["row_a"] as Row | undefined;
  const rowB = payload["row_b"] as Row | undefined;
  if (!(Number.isFinite(idA) && Number.isFinite(idB) && rowA && rowB)) {
    throw new Error("id_a, id_b, row_a, row_b are required");
  }
  const item = reviewQueue.enqueue({
    idA,
    idB,
    // A missing or non-numeric score is stored as 0.
    score: Number.isFinite(score) ? score : 0,
    rowA,
    rowB,
  });
  sendJson(res, 200, { item });
}

/** Routing table; the first entry matching both method and pathname wins. */
const API_ROUTES: readonly ApiRoute[] = [
  { method: "GET", path: "/health", run: healthRoute },
  { method: "POST", path: "/dedupe", run: dedupeRoute },
  { method: "POST", path: "/match", run: matchRoute },
  { method: "POST", path: "/score", run: scoreRoute },
  { method: "POST", path: "/explain", run: explainRoute },
  { method: "POST", path: "/profile", run: profileRoute },
  { method: "POST", path: "/clusters", run: clustersRoute },
  { method: "GET", path: "/reviews", run: listReviewsRoute },
  { method: "POST", path: "/reviews/decide", run: decideReviewRoute },
  { method: "POST", path: "/reviews/enqueue", run: enqueueReviewRoute },
];

/**
 * Dispatch one HTTP request to the matching route handler.
 *
 * Unknown method/path combinations get a 404 JSON body. Any error thrown
 * while handling a route, including invalid request bodies, is reported as
 * a 500 with `{ error: <message> }`. URL parsing happens before the
 * try-block, so a failure there rejects the returned promise instead.
 */
async function handleRequest(
  req: IncomingMessage,
  res: ServerResponse,
): Promise<void> {
  const { pathname } = new URL(req.url ?? "/", `http://${req.headers.host ?? "localhost"}`);
  const method = req.method ?? "GET";

  try {
    const route = API_ROUTES.find(
      (candidate) => candidate.path === pathname && candidate.method === method,
    );
    if (route === undefined) {
      sendJson(res, 404, { error: `Not found: ${method} ${pathname}` });
      return;
    }
    await route.run(req, res);
  } catch (err) {
    sendJson(res, 500, { error: err instanceof Error ? err.message : String(err) });
  }
}
|
|
366
|
+
|
|
367
|
+
// ---------------------------------------------------------------------------
|
|
368
|
+
// Public: startApiServer
|
|
369
|
+
// ---------------------------------------------------------------------------
|
|
370
|
+
|
|
371
|
+
/** Options accepted by `startApiServer`. */
export interface StartApiOptions {
  /** TCP port to listen on; `startApiServer` falls back to 8000. */
  readonly port?: number;
  /** Host/interface to bind; `startApiServer` falls back to "127.0.0.1". */
  readonly host?: string;
}
|
|
375
|
+
|
|
376
|
+
/**
 * Start the REST API server and begin listening.
 *
 * Listens on http://127.0.0.1:8000 unless `options` overrides host or port.
 * A rejection from `handleRequest` is logged and, if no headers were sent
 * yet, answered with a 500 JSON error.
 *
 * @returns The http.Server, so callers (e.g. tests) can close it.
 */
export function startApiServer(options: StartApiOptions = {}): ReturnType<typeof createServer> {
  const listenPort = options.port ?? 8000;
  const listenHost = options.host ?? "127.0.0.1";

  // Last-resort reporter for errors that escaped handleRequest's own try/catch.
  const reportFailure = (res: ServerResponse, err: unknown): void => {
    const msg = err instanceof Error ? err.message : String(err);
    console.warn("Unhandled request error:", msg);
    try {
      if (res.headersSent) return;
      res.statusCode = 500;
      res.setHeader("Content-Type", "application/json");
      res.end(JSON.stringify({ error: msg }));
    } catch (writeErr) {
      // Response already committed or socket closed — log and move on.
      // eslint-disable-next-line no-console
      console.warn(
        "Failed to write API error response:",
        writeErr instanceof Error ? writeErr.message : String(writeErr),
      );
    }
  };

  const server = createServer((req, res) => {
    void handleRequest(req, res).catch((err: unknown) => {
      reportFailure(res, err);
    });
  });

  server.listen(listenPort, listenHost, () => {
    // eslint-disable-next-line no-console
    console.log(`GoldenMatch API listening on http://${listenHost}:${listenPort}`);
  });

  return server;
}
|
|
411
|
+
|
|
412
|
+
export { reviewQueue, ReviewQueue };
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* duckdb.ts -- Optional DuckDB connector for Node.
|
|
3
|
+
*
|
|
4
|
+
* Mirrors `goldenmatch.backends.duckdb_backend.DuckDBBackend` from Python.
|
|
5
|
+
*
|
|
6
|
+
* Peer dependency (NOT in package.json -- install on demand):
|
|
7
|
+
* npm install @duckdb/node-api
|
|
8
|
+
*
|
|
9
|
+
* The dep is loaded via `createRequire` so the package stays importable
|
|
10
|
+
* on edge runtimes and in environments without DuckDB.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { createRequire } from "node:module";
|
|
14
|
+
import type { Row } from "../../core/types.js";
|
|
15
|
+
|
|
16
|
+
/** Settings for `createDuckDBConnector`. */
export interface DuckDBConfig {
  /** Path of the database to open; `:memory:` is used when omitted. */
  readonly path?: string;
}
|
|
20
|
+
|
|
21
|
+
/** Operations offered by the connector that `createDuckDBConnector` returns. */
export interface DuckDBConnector {
  /** Read every row of `table`. */
  readTable(table: string): Promise<Row[]>;

  /** Run `sql` and return the resulting rows. */
  readQuery(sql: string): Promise<Row[]>;

  /**
   * Write `rows` into `table`.
   *
   * `schema` optionally maps column names to SQL type names.
   */
  writeTable(
    table: string,
    rows: readonly Row[],
    schema?: Readonly<Record<string, string>>,
  ): Promise<void>;

  /** Names of the tables in the database. */
  listTables(): Promise<string[]>;

  /** Release the underlying database handle. */
  close(): void;
}
|
|
32
|
+
|
|
33
|
+
/**
 * Create a DuckDB connector. Throws if `@duckdb/node-api` isn't installed.
 *
 * Async because the underlying DuckDB API is async-only (instance + connection
 * setup both return Promises).
 *
 * @param config - Optional settings; `config.path` defaults to `:memory:`.
 * @returns A connector bound to one DuckDB instance and one connection.
 * @throws Error when the optional `@duckdb/node-api` package cannot be loaded.
 */
export async function createDuckDBConnector(
  config: DuckDBConfig = {},
): Promise<DuckDBConnector> {
  // Loaded lazily through require() so importing this module never pulls in
  // the optional native dependency.
  const require = createRequire(import.meta.url);
  let duckdb: {
    DuckDBInstance: {
      create: (path: string) => Promise<{
        connect: () => Promise<DuckDBConnection>;
        closeSync?: () => void;
      }>;
    };
  };
  try {
    duckdb = require("@duckdb/node-api") as typeof duckdb;
  } catch {
    throw new Error(
      "'@duckdb/node-api' is required for DuckDB support. Install: npm install @duckdb/node-api",
    );
  }

  const path = config.path ?? ":memory:";
  const instance = await duckdb.DuckDBInstance.create(path);
  const conn = await instance.connect();

  // Quote an identifier for SQL, doubling any embedded double quotes.
  const escapeIdent = (s: string): string => `"${s.replace(/"/g, '""')}"`;

  return {
    /** Read every row of `table` (the name is quoted, not interpolated raw). */
    async readTable(table: string): Promise<Row[]> {
      const reader = await conn.runAndReadAll(`SELECT * FROM ${escapeIdent(table)}`);
      return reader.getRowObjects() as Row[];
    },

    /** Run caller-supplied SQL verbatim and return the result rows. */
    async readQuery(sql: string): Promise<Row[]> {
      const reader = await conn.runAndReadAll(sql);
      return reader.getRowObjects() as Row[];
    },

    /**
     * Create `table` if it does not exist and insert `rows` into it.
     *
     * Columns come from the keys of the first row; a column absent from
     * `schema` is created as VARCHAR. With no rows, the table is created from
     * `schema` alone (no-op when `schema` is missing or empty).
     *
     * NOTE(review): type names from `schema` are interpolated into the DDL
     * unescaped, so `schema` must come from trusted code.
     *
     * @throws Error when the first row has no columns.
     */
    async writeTable(
      table: string,
      rows: readonly Row[],
      schema?: Readonly<Record<string, string>>,
    ): Promise<void> {
      const tableIdent = escapeIdent(table);

      if (rows.length === 0) {
        // Empty -- create stub table from schema if provided, else no-op.
        const schemaEntries = schema !== undefined ? Object.entries(schema) : [];
        // `CREATE TABLE t ()` is a syntax error, so skip a column-less schema.
        if (schemaEntries.length > 0) {
          const colDefs = schemaEntries
            .map(([c, t]) => `${escapeIdent(c)} ${t}`)
            .join(", ");
          await conn.run(`CREATE TABLE IF NOT EXISTS ${tableIdent} (${colDefs})`);
        }
        return;
      }

      const first = rows[0]!;
      const cols = Object.keys(first);
      if (cols.length === 0) {
        throw new Error(`Cannot write table '${table}': the first row has no columns`);
      }
      const colDefs = cols
        .map((c) => `${escapeIdent(c)} ${schema?.[c] ?? "VARCHAR"}`)
        .join(", ");
      await conn.run(`CREATE TABLE IF NOT EXISTS ${tableIdent} (${colDefs})`);

      const placeholders = cols.map(() => "?").join(", ");
      const colList = cols.map(escapeIdent).join(", ");
      const prepared = await conn.prepare(
        `INSERT INTO ${tableIdent} (${colList}) VALUES (${placeholders})`,
      );

      for (const row of rows) {
        // A key missing from this row is inserted as NULL.
        const values = cols.map((c) => (row as Record<string, unknown>)[c] ?? null);
        if (typeof prepared.bind === "function") {
          // @duckdb/node-api binds parameters separately; its run() takes no
          // arguments, so values passed to run() would never be bound.
          prepared.bind(values);
          await prepared.run();
        } else {
          // Fallback for statement objects that accept values on run().
          await prepared.run(...values);
        }
      }
    },

    /** Table names as reported by `SHOW TABLES`. */
    async listTables(): Promise<string[]> {
      const reader = await conn.runAndReadAll("SHOW TABLES");
      const out = reader.getRowObjects() as Array<Record<string, unknown>>;
      return out.map((r) => String(r["name"] ?? ""));
    },

    /** Close the DuckDB instance when the driver exposes `closeSync`. */
    close(): void {
      instance.closeSync?.();
    },
  };
}

/** Internal: minimal shape of the @duckdb/node-api prepared statement we touch. */
interface DuckDBPreparedStatement {
  /** Bind positional parameter values; optional so older shapes still type-check. */
  bind?(values: unknown[]): void;
  run(...values: unknown[]): Promise<unknown>;
}

/** Internal: minimal shape of the @duckdb/node-api connection we touch. */
interface DuckDBConnection {
  runAndReadAll(sql: string): Promise<{ getRowObjects(): unknown[] }>;
  run(sql: string): Promise<unknown>;
  prepare(sql: string): Promise<DuckDBPreparedStatement>;
}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* score-worker.ts -- piscina worker that scores a single block.
|
|
3
|
+
*
|
|
4
|
+
* Invoked by piscina with input { block, mk, matchedPairs }.
|
|
5
|
+
* Returns the scored pairs for that block.
|
|
6
|
+
*
|
|
7
|
+
* Built as a separate tsup entry so it can be loaded by piscina from disk
|
|
8
|
+
* at runtime. The worker runs in its own V8 isolate (worker_thread) so CPU
|
|
9
|
+
* work here is truly parallel with the main thread and other workers.
|
|
10
|
+
*/
|
|
11
|
+
import { findFuzzyMatches } from "../../core/scorer.js";
|
|
12
|
+
import type {
|
|
13
|
+
BlockResult,
|
|
14
|
+
MatchkeyConfig,
|
|
15
|
+
PairKey,
|
|
16
|
+
ScoredPair,
|
|
17
|
+
} from "../../core/types.js";
|
|
18
|
+
|
|
19
|
+
/** Payload piscina hands to the score worker for one block. */
export interface ScoreWorkerInput {
  /** The block to score; its `rows` and `preScoredPairs` are used. */
  readonly block: BlockResult;
  /** Matchkey configuration to score with. */
  readonly mk: MatchkeyConfig;
  /** Serialized Set<PairKey> contents -- piscina can't transfer Sets. */
  readonly matchedPairs: readonly PairKey[];
}
|
|
25
|
+
|
|
26
|
+
/** Result returned by the score worker for one block. */
export interface ScoreWorkerOutput {
  /** Scored pairs found in the block. */
  readonly pairs: readonly ScoredPair[];
}
|
|
29
|
+
|
|
30
|
+
/**
 * Piscina entry point: score a single block.
 *
 * Rebuilds the exclusion set from the serialized `matchedPairs` array and
 * delegates to `findFuzzyMatches`.
 *
 * @param input - Block, matchkey config and already-matched pair keys.
 * @returns The scored pairs for the block.
 */
export default function scoreWorker(
  input: ScoreWorkerInput,
): ScoreWorkerOutput {
  const { block, mk, matchedPairs } = input;
  const alreadyMatched = new Set<PairKey>(matchedPairs);
  return {
    pairs: findFuzzyMatches(block.rows, mk, alreadyMatched, block.preScoredPairs),
  };
}
|