goldenmatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. package/README.md +140 -0
  2. package/dist/cli.cjs +6079 -0
  3. package/dist/cli.cjs.map +1 -0
  4. package/dist/cli.d.cts +1 -0
  5. package/dist/cli.d.ts +1 -0
  6. package/dist/cli.js +6076 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/core/index.cjs +8449 -0
  9. package/dist/core/index.cjs.map +1 -0
  10. package/dist/core/index.d.cts +1972 -0
  11. package/dist/core/index.d.ts +1972 -0
  12. package/dist/core/index.js +8318 -0
  13. package/dist/core/index.js.map +1 -0
  14. package/dist/index.cjs +8449 -0
  15. package/dist/index.cjs.map +1 -0
  16. package/dist/index.d.cts +2 -0
  17. package/dist/index.d.ts +2 -0
  18. package/dist/index.js +8318 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/node/backends/score-worker.cjs +934 -0
  21. package/dist/node/backends/score-worker.cjs.map +1 -0
  22. package/dist/node/backends/score-worker.d.cts +14 -0
  23. package/dist/node/backends/score-worker.d.ts +14 -0
  24. package/dist/node/backends/score-worker.js +932 -0
  25. package/dist/node/backends/score-worker.js.map +1 -0
  26. package/dist/node/index.cjs +11430 -0
  27. package/dist/node/index.cjs.map +1 -0
  28. package/dist/node/index.d.cts +554 -0
  29. package/dist/node/index.d.ts +554 -0
  30. package/dist/node/index.js +11277 -0
  31. package/dist/node/index.js.map +1 -0
  32. package/dist/types-DhUdX5Rc.d.cts +304 -0
  33. package/dist/types-DhUdX5Rc.d.ts +304 -0
  34. package/examples/01-basic-dedupe.ts +60 -0
  35. package/examples/02-match-two-datasets.ts +48 -0
  36. package/examples/03-csv-file-pipeline.ts +62 -0
  37. package/examples/04-string-scoring.ts +63 -0
  38. package/examples/05-custom-config.ts +94 -0
  39. package/examples/06-probabilistic-fs.ts +72 -0
  40. package/examples/07-pprl-privacy.ts +76 -0
  41. package/examples/08-streaming.ts +79 -0
  42. package/examples/09-llm-scorer.ts +79 -0
  43. package/examples/10-explain.ts +60 -0
  44. package/examples/11-evaluate.ts +61 -0
  45. package/examples/README.md +53 -0
  46. package/package.json +66 -0
  47. package/src/cli.ts +372 -0
  48. package/src/core/ann-blocker.ts +593 -0
  49. package/src/core/api.ts +220 -0
  50. package/src/core/autoconfig.ts +363 -0
  51. package/src/core/autofix.ts +102 -0
  52. package/src/core/blocker.ts +655 -0
  53. package/src/core/cluster.ts +699 -0
  54. package/src/core/compare-clusters.ts +176 -0
  55. package/src/core/config/loader.ts +869 -0
  56. package/src/core/cross-encoder.ts +614 -0
  57. package/src/core/data.ts +430 -0
  58. package/src/core/domain.ts +277 -0
  59. package/src/core/embedder.ts +562 -0
  60. package/src/core/evaluate.ts +156 -0
  61. package/src/core/explain.ts +352 -0
  62. package/src/core/golden.ts +524 -0
  63. package/src/core/graph-er.ts +371 -0
  64. package/src/core/index.ts +314 -0
  65. package/src/core/ingest.ts +112 -0
  66. package/src/core/learned-blocking.ts +305 -0
  67. package/src/core/lineage.ts +221 -0
  68. package/src/core/llm/budget.ts +258 -0
  69. package/src/core/llm/cluster.ts +542 -0
  70. package/src/core/llm/scorer.ts +396 -0
  71. package/src/core/match-one.ts +95 -0
  72. package/src/core/matchkey.ts +97 -0
  73. package/src/core/memory/corrections.ts +179 -0
  74. package/src/core/memory/learner.ts +218 -0
  75. package/src/core/memory/store.ts +114 -0
  76. package/src/core/pipeline.ts +366 -0
  77. package/src/core/pprl/protocol.ts +216 -0
  78. package/src/core/probabilistic.ts +511 -0
  79. package/src/core/profiler.ts +212 -0
  80. package/src/core/quality.ts +197 -0
  81. package/src/core/review-queue.ts +177 -0
  82. package/src/core/scorer.ts +855 -0
  83. package/src/core/sensitivity.ts +196 -0
  84. package/src/core/standardize.ts +279 -0
  85. package/src/core/streaming.ts +128 -0
  86. package/src/core/transforms.ts +599 -0
  87. package/src/core/types.ts +570 -0
  88. package/src/core/validate.ts +243 -0
  89. package/src/index.ts +8 -0
  90. package/src/node/a2a/server.ts +470 -0
  91. package/src/node/api/server.ts +412 -0
  92. package/src/node/backends/duckdb.ts +130 -0
  93. package/src/node/backends/score-worker.ts +41 -0
  94. package/src/node/backends/workers.ts +212 -0
  95. package/src/node/config-file.ts +66 -0
  96. package/src/node/connectors/base.ts +57 -0
  97. package/src/node/connectors/bigquery.ts +61 -0
  98. package/src/node/connectors/databricks.ts +69 -0
  99. package/src/node/connectors/file.ts +350 -0
  100. package/src/node/connectors/hubspot.ts +62 -0
  101. package/src/node/connectors/index.ts +43 -0
  102. package/src/node/connectors/salesforce.ts +93 -0
  103. package/src/node/connectors/snowflake.ts +73 -0
  104. package/src/node/db/postgres.ts +173 -0
  105. package/src/node/db/sync.ts +103 -0
  106. package/src/node/dedupe-file.ts +156 -0
  107. package/src/node/index.ts +89 -0
  108. package/src/node/mcp/server.ts +940 -0
  109. package/src/node/tui/app.ts +756 -0
  110. package/src/node/tui/index.ts +6 -0
  111. package/src/node/tui/widgets.ts +128 -0
  112. package/tests/parity/scorer-ground-truth.test.ts +118 -0
  113. package/tests/smoke.test.ts +46 -0
  114. package/tests/unit/a2a-server.test.ts +175 -0
  115. package/tests/unit/ann-blocker.test.ts +117 -0
  116. package/tests/unit/api-server.test.ts +239 -0
  117. package/tests/unit/api.test.ts +77 -0
  118. package/tests/unit/autoconfig.test.ts +103 -0
  119. package/tests/unit/autofix.test.ts +71 -0
  120. package/tests/unit/blocker.test.ts +164 -0
  121. package/tests/unit/buildBlocksAsync.test.ts +63 -0
  122. package/tests/unit/cluster.test.ts +213 -0
  123. package/tests/unit/compare-clusters.test.ts +42 -0
  124. package/tests/unit/config-loader.test.ts +301 -0
  125. package/tests/unit/connectors-base.test.ts +48 -0
  126. package/tests/unit/cross-encoder-model.test.ts +198 -0
  127. package/tests/unit/cross-encoder.test.ts +173 -0
  128. package/tests/unit/db-connectors.test.ts +37 -0
  129. package/tests/unit/domain.test.ts +80 -0
  130. package/tests/unit/embedder.test.ts +151 -0
  131. package/tests/unit/evaluate.test.ts +85 -0
  132. package/tests/unit/explain.test.ts +73 -0
  133. package/tests/unit/golden.test.ts +97 -0
  134. package/tests/unit/graph-er.test.ts +173 -0
  135. package/tests/unit/hnsw-ann.test.ts +283 -0
  136. package/tests/unit/hubspot-connector.test.ts +118 -0
  137. package/tests/unit/ingest.test.ts +97 -0
  138. package/tests/unit/learned-blocking.test.ts +134 -0
  139. package/tests/unit/lineage.test.ts +135 -0
  140. package/tests/unit/match-one.test.ts +129 -0
  141. package/tests/unit/matchkey.test.ts +97 -0
  142. package/tests/unit/mcp-server.test.ts +183 -0
  143. package/tests/unit/memory.test.ts +119 -0
  144. package/tests/unit/pipeline.test.ts +118 -0
  145. package/tests/unit/pprl-protocol.test.ts +381 -0
  146. package/tests/unit/probabilistic.test.ts +494 -0
  147. package/tests/unit/profiler.test.ts +68 -0
  148. package/tests/unit/review-queue.test.ts +68 -0
  149. package/tests/unit/salesforce-connector.test.ts +148 -0
  150. package/tests/unit/scorer.test.ts +301 -0
  151. package/tests/unit/sensitivity.test.ts +154 -0
  152. package/tests/unit/standardize.test.ts +84 -0
  153. package/tests/unit/streaming.test.ts +82 -0
  154. package/tests/unit/transforms.test.ts +208 -0
  155. package/tests/unit/tui-widgets.test.ts +42 -0
  156. package/tests/unit/tui.test.ts +24 -0
  157. package/tests/unit/validate.test.ts +145 -0
  158. package/tests/unit/workers-parallel.test.ts +99 -0
  159. package/tests/unit/workers.test.ts +74 -0
  160. package/tsconfig.json +25 -0
  161. package/tsup.config.ts +37 -0
  162. package/vitest.config.ts +11 -0
@@ -0,0 +1,554 @@
1
+ import { R as Row, G as GoldenMatchConfig, D as DedupeResult, a as MatchResult, B as BlockResult, M as MatchkeyConfig, P as PairKey, S as ScoredPair } from '../types-DhUdX5Rc.js';
2
+ export { b as BlockingConfig, c as BlockingKeyConfig, d as BudgetConfig, C as CanopyConfig, e as ClusterInfo, f as ClusterProvenance, g as ColumnValue, h as DedupeStats, i as DomainConfig, E as ExactMatchkey, F as FieldProvenance, j as GoldenFieldRule, k as GoldenRulesConfig, I as InputConfig, l as InputFileConfig, L as LLMScorerConfig, m as LearningConfig, n as MakeMatchkeyConfigInput, o as MatchkeyField, p as MemoryConfig, O as OutputConfig, q as ProbabilisticMatchkey, Q as QualityConfig, r as SortKeyField, s as StandardizationConfig, T as TransformConfig, V as VALID_SCORERS, t as VALID_STANDARDIZERS, u as VALID_STRATEGIES, v as VALID_TRANSFORMS, w as ValidationConfig, x as ValidationRuleConfig, W as WeightedMatchkey, y as getMatchkeys, z as makeBlockingConfig, A as makeConfig, H as makeGoldenRulesConfig, J as makeMatchkeyConfig, K as makeMatchkeyField, N as makeScoredPair } from '../types-DhUdX5Rc.js';
3
+ export { ANNBlocker, ANNBlockerBase, ANNBlockerOptions, AutoFixLog, AutoconfigOptions, BudgetSnapshot, BudgetTracker, BuildANNOptions, CCMSResult, ClusterExplanation, ColumnProfile, Correction, CreateANNBlockerOptions, CrossEncoderHttpError, CrossEncoderModel, CrossEncoderModelOptions, CrossEncoderOptions, CrossEncoderProvider, CrossEncoderReranker, DatasetProfile, DomainProfile, EMResult, Embedder, EmbedderError, EmbedderOptions, EmbedderProvider, EmbeddingResult, EvalResult, GatedResult, GraphERResult, HNSWANNBlocker, HNSWIndexLike, HNSWModule, HNSWOptions, LLMScoreResult, LearnedParams, LearnedPredicate, LearnedRules, LineageBundle, LineageEdge, MemoryLearner, MemoryStore, MemoryStoreConfig, PPRLConfig, PPRLResult, PairExplanation, QualityFinding, Relationship, ReviewItem, SensitivityResult, StreamProcessor, SweepParam, SweepPoint, TableSchema, TabularData, UnionFind, ValidationReport, ValidationRule, _resetCrossEncoderModelCache, addRowIds, addSourceColumn, addToCluster, applyColumnMap, applyCorrections, applyLearnedBlocks, applyStandardization, applyStandardizer, applyTransform, applyTransforms, asString, autoConfigurePPRL, autoConfigureRows, autoFixRows, buildANNBlocks, buildANNPairBlocks, buildAdaptiveBlocks, buildBlocks, buildBlocksAsync, buildClusters, buildComparisonVector, buildGoldenRecord, buildGoldenRecordWithProvenance, buildLineage, buildMst, buildMultiPassBlocks, buildStaticBlocks, compareClusters, computeClusterConfidence, computeMatchkeyValue, computeMatchkeys, concatRows, configToYaml, cosineSim, countTokensApprox, createANNBlocker, dedupe, detectDomain, diceCoefficient, ensembleScore, euclideanDist, evaluateClusters, evaluatePairs, explainCluster, explainPair, extractFeatures, findExactMatches, findExactMatchesOne, findFuzzyMatches, gatePairs, getClusterPairScores, getEmbedder, hashRow, indelDistance, indelSimilarity, isNullish, jaccardSimilarity, jaro, jaroWinkler, learnBlockingRules, levenshteinDistance, levenshteinSimilarity, 
lineageFromJson, lineageToJson, llmClusterPairs, llmScorePairs, loadGroundTruthPairs, match, matchOne, mergeField, metaphone, pairKey, parseConfig, parseConfigYaml, parsePairKey, profileRows, rerankPair, rerankTopPairs, runDedupePipeline, runGraphER, runMatchPipeline, runPPRL, runQualityCheck, runSensitivity, scanQuality, scoreBlocksSequential, scoreField, scoreMatrix, scorePair, scorePairRecord, scoreProbabilistic, scoreStrings, scoreStringsWithLlm, selectBestBlockingKey, soundex, soundexMatch, splitOversizedCluster, stabilityReport, toColumnValue, tokenSortRatio, trainEM, unmergeCluster, unmergeRecord, validateColumns, validateRows } from '../core/index.js';
4
+ import { createServer } from 'node:http';
5
+
6
+ /**
7
+ * file.ts -- CSV/JSON/JSONL file I/O connector.
8
+ *
9
+ * Node-only: uses node:fs, node:path. NOT edge-safe.
10
+ *
11
+ * CSV parser rules (CRITICAL):
12
+ * - Quoted fields preserve embedded commas and newlines
13
+ * - Doubled quotes inside quoted fields unescape to a single quote
14
+ * - Empty unquoted fields become null
15
+ * - Leading-zero strings (zip codes "01234", SSNs, phones) are NEVER
16
+ * coerced to numbers
17
+ * - Booleans "true"/"false" (case-insensitive) coerce to boolean
18
+ * - Numeric strings coerce to number only when fully numeric and not
19
+ * leading-zero
20
+ * - Supports both `\n` and `\r\n` line endings
21
+ */
22
+
23
+ /** Optional settings accepted by `readCsv`. */ interface ReadCsvOptions {
24
+ /** Field separator; pass "\t" to read TSV files. NOTE(review): the default is not visible in this declaration. */ readonly delimiter?: string;
25
+ /** When false, synthetic headers `col_0`, `col_1`, ... are used for the columns. */ readonly hasHeader?: boolean;
26
+ /** NOTE(review): presumably the text encoding used to decode the file -- confirm in file.ts. */ readonly encoding?: BufferEncoding;
27
+ }
28
+ /**
29
+ * Read a CSV (or TSV, via `delimiter: "\t"`) file from disk.
30
+ *
31
+ * Returns an array of `Row` objects (header -> coerced value). If
32
+ * `hasHeader` is false, synthetic headers `col_0`, `col_1`, ... are used.
33
+ */
34
+ declare function readCsv(path: string, options?: ReadCsvOptions): Row[];
35
+ /**
36
+ * Read a JSON or JSONL file.
37
+ *
38
+ * - `.json`: expects an array of objects at the top level.
39
+ * - `.jsonl` / `.ndjson`: one JSON object per line.
40
+ *
41
+ * The format is auto-detected from whether the first non-whitespace character is `[`.
42
+ */
43
+ declare function readJson(path: string): Row[];
44
+ /**
45
+ * Dispatch to readCsv / readJson based on file extension.
46
+ *
47
+ * Supported: `.csv`, `.tsv`, `.json`, `.jsonl`, `.ndjson`.
48
+ */
49
+ declare function readFile(path: string): Row[];
50
+ /** Optional settings accepted by `writeCsv`. */ interface WriteCsvOptions {
51
+ /** Output columns. When omitted, the union of keys from all rows is used, ordered by first appearance. */ readonly columns?: readonly string[];
52
+ /** Field separator. NOTE(review): the default is not visible in this declaration. */ readonly delimiter?: string;
53
+ }
54
+ /**
55
+ * Write rows to a CSV file. Creates parent directories as needed.
56
+ *
57
+ * If `columns` is not supplied, the union of keys from all rows is used,
58
+ * ordered by first appearance.
59
+ */
60
+ declare function writeCsv(path: string, rows: readonly Row[], options?: WriteCsvOptions): void;
61
+ /** Write rows to a JSON file as a pretty-printed array. */
62
+ declare function writeJson(path: string, rows: readonly Row[]): void;
63
+
64
+ /**
65
+ * dedupe-file.ts -- File-based convenience wrappers around dedupe() / match().
66
+ *
67
+ * Node-only: reads from disk, tags rows with __source__, delegates to the
68
+ * edge-safe core API.
69
+ */
70
+
71
+ /**
72
+ * File specification. Either a bare path (source name is derived from the
73
+ * file's basename without extension) or a tuple `[path, sourceName]`.
74
+ */
75
+ type FileSpec = string | readonly [string, string];
76
+ interface FileDedupeOptions {
77
+ /** Input files. Required when calling `dedupeFile(opts)`. */
78
+ readonly files?: readonly FileSpec[];
79
+ /** Full config -- takes precedence over shorthand fields below. */
80
+ readonly config?: GoldenMatchConfig;
81
+ readonly exact?: readonly string[];
82
+ readonly fuzzy?: Readonly<Record<string, number>>;
83
+ readonly blocking?: readonly string[];
84
+ readonly threshold?: number;
85
+ /** Enable LLM scorer for borderline pairs (not yet implemented in JS). */
86
+ readonly llmScorer?: boolean;
87
+ /** Write golden records to this path (.csv or .json). */
88
+ readonly outputPath?: string;
89
+ }
90
+ /**
91
+ * Deduplicate records across one or more files.
92
+ *
93
+ * Each file's rows are tagged with `__source__ = <sourceName>` before being
94
+ * concatenated and passed to `dedupe()`.
95
+ *
96
+ * @throws if no files are provided or any file cannot be read.
97
+ */
98
+ declare function dedupeFile(opts: FileDedupeOptions): DedupeResult;
99
+ /**
100
+ * Match target records against a reference file.
101
+ *
102
+ * Reads both files, tags with `__source__`, and delegates to `match()`.
103
+ * `opts.files` (if provided) is ignored in favor of the explicit paths.
104
+ */
105
+ declare function matchFiles(targetPath: string, referencePath: string, opts?: FileDedupeOptions): MatchResult;
106
+
107
+ /**
108
+ * config-file.ts -- YAML config loading/saving from disk.
109
+ *
110
+ * Node-only. Uses `createRequire` so the optional `yaml` peer dependency
111
+ * is resolved lazily without breaking edge-safe ESM builds.
112
+ */
113
+
114
+ /**
115
+ * Load and parse a YAML config file into a typed GoldenMatchConfig.
116
+ *
117
+ * @throws if the file cannot be read, `yaml` is not installed, or the
118
+ * document does not describe a valid config.
119
+ */
120
+ declare function loadConfigFile(path: string): GoldenMatchConfig;
121
+ /**
122
+ * Serialize a GoldenMatchConfig to YAML and write it to disk.
123
+ * Creates parent directories as needed.
124
+ */
125
+ declare function writeConfigFile(path: string, config: GoldenMatchConfig): void;
126
+
127
+ /**
128
+ * base.ts -- Base connector interface and registry.
129
+ *
130
+ * Mirrors goldenmatch.connectors.base from the Python package: a small
131
+ * abstraction over external data sources (Snowflake, BigQuery, etc.) that
132
+ * exposes connect/read/close lifecycle and a name-based registry.
133
+ */
134
+
135
+ interface ConnectorConfig {
136
+ readonly [key: string]: unknown;
137
+ }
138
+ /** Structured alternative to a query string for `BaseConnector.read`. */ interface ConnectorQuery {
138
+ /** Table to read. The HubSpot connector uses it to select the CRM object (e.g. "contacts"). */ readonly table: string;
139
+ /** NOTE(review): presumably restricts the returned columns -- confirm per connector. */ readonly columns?: readonly string[];
140
+ /** NOTE(review): presumably caps the number of rows returned -- confirm per connector. */ readonly limit?: number;
141
+ }
143
+ /** Connector contract: a connect/read/close lifecycle over an external data source. */ interface BaseConnector {
144
+ /** Name of the connector. NOTE(review): presumably the registry name -- confirm. */ readonly name: string;
145
+ connect(): Promise<void>;
146
+ /** Accepts either a query string or a structured `ConnectorQuery`; resolves to rows. */ read(query: string | ConnectorQuery): Promise<Row[]>;
147
+ close(): Promise<void>;
148
+ }
149
+ interface ConnectorFactory<C extends ConnectorConfig = ConnectorConfig> {
150
+ (config: C): BaseConnector;
151
+ }
152
+ /** Register `factory` in the name-based connector registry under `name`. NOTE(review): behavior when `name` is already registered is not visible here. */ declare function registerConnector(name: string, factory: ConnectorFactory): void;
153
+ /** Build a connector for `name` from `config`. NOTE(review): presumably invokes the factory registered under `name`; behavior for an unknown name is not visible here. */ declare function loadConnector<C extends ConnectorConfig = ConnectorConfig>(name: string, config: C): BaseConnector;
154
+ /** NOTE(review): presumably returns the names currently held in the connector registry -- confirm. */ declare function listConnectors(): readonly string[];
155
+
156
+ /**
157
+ * snowflake.ts -- Snowflake connector via the optional `snowflake-sdk` peer dep.
158
+ */
159
+
160
+ interface SnowflakeConfig {
161
+ readonly account: string;
162
+ readonly username: string;
163
+ readonly password?: string;
164
+ readonly privateKey?: string;
165
+ readonly warehouse?: string;
166
+ readonly database?: string;
167
+ readonly schema?: string;
168
+ readonly role?: string;
169
+ }
170
+ declare function createSnowflakeConnector(config: SnowflakeConfig): BaseConnector;
171
+
172
+ /**
173
+ * bigquery.ts -- Google BigQuery connector via the optional `@google-cloud/bigquery` peer dep.
174
+ */
175
+
176
+ interface BigQueryConfig {
177
+ readonly projectId: string;
178
+ readonly keyFilename?: string;
179
+ readonly credentials?: Readonly<Record<string, unknown>>;
180
+ readonly dataset?: string;
181
+ readonly location?: string;
182
+ }
183
+ declare function createBigQueryConnector(config: BigQueryConfig): BaseConnector;
184
+
185
+ /**
186
+ * databricks.ts -- Databricks SQL warehouse connector via the optional `@databricks/sql` peer dep.
187
+ */
188
+
189
+ interface DatabricksConfig {
190
+ readonly serverHostname: string;
191
+ readonly httpPath: string;
192
+ readonly token: string;
193
+ readonly catalog?: string;
194
+ readonly schema?: string;
195
+ }
196
+ declare function createDatabricksConnector(config: DatabricksConfig): BaseConnector;
197
+
198
+ /**
199
+ * salesforce.ts -- Salesforce connector using the REST API via fetch().
200
+ *
201
+ * No SDK dependency: stays edge-adjacent. Supports either a pre-issued
202
+ * accessToken or the OAuth 2.0 password grant flow.
203
+ */
204
+
205
+ /** Salesforce connection settings: supply either a pre-issued `accessToken` or credentials for the OAuth 2.0 password grant flow. */ interface SalesforceConfig {
206
+ readonly instanceUrl: string;
207
+ /** Pre-issued access token (the alternative to the password grant flow). */ readonly accessToken?: string;
208
+ /** NOTE(review): `clientId` through `securityToken` are presumably the password-grant credentials -- confirm in salesforce.ts. */ readonly clientId?: string;
209
+ readonly clientSecret?: string;
210
+ readonly username?: string;
211
+ readonly password?: string;
212
+ readonly securityToken?: string;
213
+ /** NOTE(review): REST API version; the default is not visible in this declaration. */ readonly apiVersion?: string;
214
+ }
215
+ declare function createSalesforceConnector(config: SalesforceConfig): BaseConnector;
216
+
217
+ /**
218
+ * hubspot.ts -- HubSpot CRM connector via REST API + fetch().
219
+ *
220
+ * `query.table` selects the HubSpot object: "contacts", "companies", "deals", etc.
221
+ * SQL strings are not supported -- use object queries.
222
+ */
223
+
224
+ interface HubSpotConfig {
225
+ readonly apiKey: string;
226
+ readonly apiBase?: string;
227
+ }
228
+ declare function createHubSpotConnector(config: HubSpotConfig): BaseConnector;
229
+
230
+ /**
231
+ * mcp/server.ts -- GoldenMatch MCP server (stdio transport, JSON-RPC).
232
+ *
233
+ * Node-only: uses node:fs, node:path, node:readline. NOT edge-safe.
234
+ *
235
+ * Exposes ~20 tools covering dedupe, match, scoring, explanation,
236
+ * profiling, auto-config (shorthand), evaluation, and listings.
237
+ *
238
+ * Every tool dispatch is wrapped in try/catch so a single failure never
239
+ * crashes the JSON-RPC loop; errors come back as `{ error: "<msg>" }`.
240
+ *
241
+ * Ports ideas from goldenmatch/mcp/server.py.
242
+ */
243
+
244
+ /** Descriptor of one tool exposed by the MCP server. */ interface Tool {
245
+ readonly name: string;
246
+ readonly description: string;
247
+ /** NOTE(review): presumably a JSON-Schema-style object describing the tool arguments -- confirm in mcp/server.ts. */ readonly inputSchema: Readonly<Record<string, unknown>>;
248
+ }
249
+ /** Tool descriptors for the MCP server (the module header mentions roughly 20 tools). */ declare const TOOLS: readonly Tool[];
250
+ /** Run the tool called `name` with `rawArgs`. NOTE(review): the module header says tool failures come back as `{ error: "<msg>" }` instead of crashing the loop -- confirm whether that wrapping happens inside this function. */ declare function handleTool(name: string, rawArgs: Record<string, unknown>): Promise<unknown>;
251
+ /**
252
+ * Start the MCP server reading JSON-RPC messages one per line from stdin
253
+ * and writing responses to stdout. Intended for Claude Desktop / any MCP
254
+ * client using stdio transport.
255
+ *
256
+ * Unknown methods return a JSON-RPC error. Bad JSON is logged to stderr
257
+ * (via console.warn) but does not crash the loop.
258
+ */
259
+ declare function startMcpServer(): void;
260
+
261
+ /**
262
+ * api/server.ts -- GoldenMatch REST API server (node:http).
263
+ *
264
+ * Node-only: uses node:http, node:path. NOT edge-safe.
265
+ *
266
+ * Endpoints:
267
+ * GET /health - liveness check
268
+ * POST /dedupe - dedupe a batch of rows (JSON body)
269
+ * POST /match - match target vs reference
270
+ * POST /score - score two strings
271
+ * POST /explain - explain a pair
272
+ * POST /profile - profile a batch of rows
273
+ * POST /clusters - return clusters from dedupe
274
+ * GET /reviews - list pending review items
275
+ * POST /reviews/decide - accept/reject a review item
276
+ *
277
+ * Ports ideas from goldenmatch/api/server.py.
278
+ */
279
+
280
+ /** Review item for a scored pair of rows; `status` records the accept/reject decision. */ interface ReviewItem {
281
+ readonly id: string;
282
+ readonly idA: number;
283
+ readonly idB: number;
284
+ readonly score: number;
285
+ readonly rowA: Row;
286
+ readonly rowB: Row;
287
+ /** Not readonly: `status` and `decidedAt` are the only mutable fields. */ status: "pending" | "accepted" | "rejected";
288
+ /** NOTE(review): presumably a timestamp string set when the item is decided -- format not visible here. */ decidedAt?: string;
289
+ }
290
+ /** Queue of `ReviewItem`s. NOTE(review): storage is a private field; whether it persists anywhere is not visible here. */ declare class ReviewQueue {
291
+ private items;
292
+ /** Add an item; the caller does not supply `status`, and `id` is optional. NOTE(review): presumably new items start as "pending". */ enqueue(item: Omit<ReviewItem, "status" | "id"> & {
293
+ id?: string;
294
+ }): ReviewItem;
295
+ /** NOTE(review): presumably the items whose status is "pending". */ pending(): ReviewItem[];
296
+ /** Accept or reject the item with `id`. NOTE(review): presumably returns null when no such item exists -- confirm. */ decide(id: string, accept: boolean): ReviewItem | null;
297
+ /** NOTE(review): presumably every item regardless of status. */ all(): ReviewItem[];
298
+ }
299
+ /** Listen address for `startApiServer`; the documented default is http://127.0.0.1:8000. */ interface StartApiOptions {
300
+ readonly port?: number;
301
+ readonly host?: string;
302
+ }
303
+ /**
304
+ * Start the REST API server.
305
+ * Default: http://127.0.0.1:8000.
306
+ *
307
+ * Returns the http.Server so tests can close it.
308
+ */
309
+ declare function startApiServer(options?: StartApiOptions): ReturnType<typeof createServer>;
310
+
311
+ /**
312
+ * a2a/server.ts -- GoldenMatch A2A (Agent-to-Agent) protocol server.
313
+ *
314
+ * Node-only: uses node:http, node:crypto. NOT edge-safe.
315
+ *
316
+ * Endpoints:
317
+ * GET /.well-known/agent.json - agent card (10+ skills)
318
+ * POST /tasks - create a task (skill + input)
319
+ * GET /tasks/{id} - fetch task status/result
320
+ *
321
+ * Ports ideas from goldenmatch/a2a/server.py. This is a simpler
322
+ * synchronous variant (no SSE streaming, no persistent store).
323
+ */
324
+
325
+ interface AgentSkill {
326
+ readonly name: string;
327
+ readonly description: string;
328
+ readonly inputModes: readonly string[];
329
+ readonly outputModes: readonly string[];
330
+ }
331
+ /** Agent card for the A2A server. NOTE(review): presumably the document served at `GET /.well-known/agent.json` -- confirm in a2a/server.ts. */ declare const AGENT_CARD: {
332
+ readonly name: string;
333
+ readonly description: string;
334
+ readonly version: string;
335
+ readonly provider: {
336
+ readonly organization: string;
337
+ readonly url: string;
338
+ };
339
+ readonly capabilities: Readonly<Record<string, boolean>>;
340
+ /** Skills advertised by the agent (the module header mentions 10+). */ readonly skills: readonly AgentSkill[];
341
+ };
342
+ interface StartA2aOptions {
343
+ readonly port?: number;
344
+ readonly host?: string;
345
+ }
346
+ /** Start the A2A server and return the node:http server instance. NOTE(review): the default port and host are not visible in this declaration. */ declare function startA2aServer(options?: StartA2aOptions): ReturnType<typeof createServer>;
347
+
348
+ /**
349
+ * workers.ts -- Concurrent and parallel block scoring for Node.
350
+ *
351
+ * Python uses ThreadPoolExecutor (rapidfuzz releases the GIL). JS is
352
+ * single-threaded by default; true parallelism requires `worker_threads`
353
+ * with serialization overhead that's only worth it for large blocks.
354
+ *
355
+ * This module ships two schedulers:
356
+ * - `scoreBlocksConcurrent` -- Promise.all batching on the main thread.
357
+ * No real parallelism, but zero setup cost and good for small/medium
358
+ * block counts.
359
+ * - `scoreBlocksParallel` -- piscina-backed worker pool for true CPU
360
+ * parallelism. Optional peer dep; falls back to `scoreBlocksConcurrent`
361
+ * when piscina isn't installed.
362
+ *
363
+ * Mirrors the shape of `goldenmatch.backends.ray_backend.score_blocks_ray`
364
+ * from the Python source, but stays inside one Node process.
365
+ */
366
+
367
+ interface WorkerPoolOptions {
368
+ /** Max blocks scored concurrently per batch. Defaults to 4. */
369
+ readonly batchSize?: number;
370
+ }
371
+ interface ParallelWorkerOptions {
372
+ /** Max worker threads. Defaults to min(8, max(2, blocks.length)). */
373
+ readonly maxThreads?: number;
374
+ /** Min worker threads kept warm. Defaults to 1. */
375
+ readonly minThreads?: number;
376
+ /** Idle timeout in ms before workers exit. Defaults to 1000. */
377
+ readonly idleTimeout?: number;
378
+ }
379
+ /**
380
+ * Score blocks with cooperative concurrency.
381
+ *
382
+ * - For 0 blocks: returns empty.
383
+ * - For <= 2 blocks: skips batching overhead and runs sequentially via
384
+ * `scoreBlocksSequential` (mirrors Python's small-block fast path).
385
+ * - Otherwise: schedules blocks in batches of `batchSize`, awaiting each
386
+ * batch with `Promise.all` so the event loop can interleave I/O.
387
+ *
388
+ * Note: `matchedPairs` is mutated as new pairs are discovered (consistent
389
+ * with `scoreBlocksSequential`). A frozen snapshot is used per block so
390
+ * concurrent batches see a stable exclusion set, matching Python's
391
+ * `score_blocks_parallel` contract.
392
+ */
393
+ declare function scoreBlocksConcurrent(blocks: readonly BlockResult[], mk: MatchkeyConfig, matchedPairs: Set<PairKey>, options?: WorkerPoolOptions): Promise<readonly ScoredPair[]>;
394
+ /**
395
+ * Score blocks in true parallel via piscina worker_threads.
396
+ *
397
+ * - For 0 blocks: returns empty.
398
+ * - For <= 2 blocks: runs sequentially (spinning up workers isn't worth it).
399
+ * - Otherwise: dispatches each block to a piscina worker that runs
400
+ * `findFuzzyMatches` in its own V8 isolate, giving true CPU parallelism.
401
+ *
402
+ * Falls back to `scoreBlocksConcurrent` with a console warning if piscina
403
+ * isn't installed (it's an optional peer dep).
404
+ *
405
+ * `matchedPairs` is mutated in place with newly discovered pairs, matching
406
+ * the contract of `scoreBlocksSequential` / `scoreBlocksConcurrent`.
407
+ */
408
+ declare function scoreBlocksParallel(blocks: readonly BlockResult[], mk: MatchkeyConfig, matchedPairs: Set<PairKey>, options?: ParallelWorkerOptions): Promise<readonly ScoredPair[]>;
409
+
410
+ /**
411
+ * duckdb.ts -- Optional DuckDB connector for Node.
412
+ *
413
+ * Mirrors `goldenmatch.backends.duckdb_backend.DuckDBBackend` from Python.
414
+ *
415
+ * Peer dependency (NOT in package.json -- install on demand):
416
+ * npm install @duckdb/node-api
417
+ *
418
+ * The dep is loaded via `createRequire` so the package stays importable
419
+ * on edge runtimes and in environments without DuckDB.
420
+ */
421
+
422
+ interface DuckDBConfig {
423
+ /** Database path. Defaults to `:memory:`. */
424
+ readonly path?: string;
425
+ }
426
+ /** Connector that `createDuckDBConnector` resolves to. */ interface DuckDBConnector {
427
+ readTable(table: string): Promise<Row[]>;
428
+ readQuery(sql: string): Promise<Row[]>;
429
+ /** NOTE(review): `schema` presumably maps column name to a DuckDB column type -- confirm in duckdb.ts. */ writeTable(table: string, rows: readonly Row[], schema?: Readonly<Record<string, string>>): Promise<void>;
430
+ listTables(): Promise<string[]>;
431
+ /** Synchronous, unlike the other methods, which all return Promises. */ close(): void;
432
+ }
433
+ /**
434
+ * Create a DuckDB connector. Throws if `@duckdb/node-api` isn't installed.
435
+ *
436
+ * Async because the underlying DuckDB API is async-only (instance + connection
437
+ * setup both return Promises).
438
+ */
439
+ declare function createDuckDBConnector(config?: DuckDBConfig): Promise<DuckDBConnector>;
440
+
441
+ /**
442
+ * postgres.ts -- Optional Postgres connector for Node.
443
+ *
444
+ * Mirrors `goldenmatch.db.connector.PostgresConnector` from Python.
445
+ *
446
+ * Peer dependency (NOT in package.json -- install on demand):
447
+ * npm install pg
448
+ *
449
+ * The dep is loaded via `createRequire` so the package stays importable
450
+ * on edge runtimes and in environments without Postgres.
451
+ */
452
+
453
+ interface PostgresConfig {
454
+ readonly connectionString?: string;
455
+ readonly host?: string;
456
+ readonly port?: number;
457
+ readonly database?: string;
458
+ readonly user?: string;
459
+ readonly password?: string;
460
+ readonly ssl?: boolean;
461
+ }
462
+ /** Options for `PostgresConnector.writeTable`. */ interface PostgresWriteOptions {
463
+ /** When set, the write uses `INSERT ... ON CONFLICT (primaryKey) DO UPDATE`. */ readonly upsert?: boolean;
464
+ /** Conflict column named in the upsert's `ON CONFLICT (...)` clause. */ readonly primaryKey?: string;
465
+ }
466
+ /** Connector returned by `createPostgresConnector`; `connect()` is required before any query. */ interface PostgresConnector {
467
+ connect(): Promise<void>;
468
+ /** Run `sql` with optional `params`. NOTE(review): `T` is caller-asserted; no row validation is visible here. */ query<T = Row>(sql: string, params?: readonly unknown[]): Promise<T[]>;
469
+ readTable(table: string): Promise<Row[]>;
470
+ /** Inserts are batched in chunks of 1000 rows; see `PostgresWriteOptions` for upsert behavior. */ writeTable(table: string, rows: readonly Row[], options?: PostgresWriteOptions): Promise<void>;
471
+ /** NOTE(review): the default `schema` is not visible in this declaration. */ listTables(schema?: string): Promise<string[]>;
472
+ close(): Promise<void>;
473
+ }
474
+ /**
475
+ * Create a Postgres connector. Throws if `pg` isn't installed.
476
+ *
477
+ * The returned connector requires `connect()` before any query. Inserts
478
+ * are batched in chunks of 1000 rows. When `options.upsert` is set, the
479
+ * write uses `INSERT ... ON CONFLICT (primaryKey) DO UPDATE`.
480
+ */
481
+ declare function createPostgresConnector(config: PostgresConfig): PostgresConnector;
482
+
483
+ /**
484
+ * sync.ts -- Postgres-backed dedupe sync + watch helpers.
485
+ *
486
+ * Mirrors `goldenmatch.db.sync` from Python: read source table, run
487
+ * dedupe, write golden + cluster tables back to Postgres.
488
+ */
489
+
490
+ interface SyncOptions {
491
+ readonly pg: PostgresConfig;
492
+ readonly sourceTable: string;
493
+ readonly goldenTable: string;
494
+ /** Optional table to write cluster summaries (cluster_id, members, size, ...). */
495
+ readonly clustersTable?: string;
496
+ readonly config: GoldenMatchConfig;
497
+ }
498
+ /**
499
+ * Run a single dedupe pass against Postgres.
500
+ *
501
+ * 1. Read all rows from `sourceTable`.
502
+ * 2. Dedupe via the core pipeline.
503
+ * 3. Write golden records to `goldenTable`.
504
+ * 4. Optionally write cluster summaries to `clustersTable`.
505
+ *
506
+ * Always closes the connection on the way out.
507
+ */
508
+ declare function syncDedupe(options: SyncOptions): Promise<DedupeResult>;
509
+ interface WatchSyncOptions extends SyncOptions {
510
+ /** Polling interval in ms. Defaults to 60_000 (1 minute). */
511
+ readonly intervalMs?: number;
512
+ }
513
+ /**
514
+ * Run `syncDedupe` on a recurring interval.
515
+ *
516
+ * Resolves to a `stop` function. Errors in any iteration are logged via
517
+ * `console.warn` so the loop keeps running; callers should monitor
518
+ * `onResult` to confirm forward progress.
519
+ */
520
+ declare function watchSync(options: WatchSyncOptions, onResult?: (result: DedupeResult) => void): Promise<() => void>;
521
+
522
+ /**
523
+ * app.ts -- GoldenMatch interactive TUI built on `ink` (React for CLIs).
524
+ *
525
+ * This module loads `ink` and `react` lazily via `createRequire` so the rest
526
+ * of the package stays usable without those optional peer dependencies.
527
+ *
528
+ * The UI mirrors the Python Textual TUI: 6 tabs (Data, Config, Matches,
529
+ * Golden, Boost, Export) with keyboard navigation [1..6], [Tab] to cycle,
530
+ * [r] to run dedupe, [q] / [Esc] to quit.
531
+ *
532
+ * Richer ink-ecosystem addons (ink-table, ink-select-input, ink-text-input,
533
+ * ink-spinner, ink-gradient) are optional peer deps loaded lazily via
534
+ * ./widgets.js. Each tab degrades gracefully to plain text when an addon is
535
+ * not installed.
536
+ *
537
+ * Implementation notes:
538
+ * - Uses React.createElement directly (no JSX) so we don't need a JSX
539
+ * transform in the existing tsup build.
540
+ * - The `ink` / `react` modules are typed as `any` at the boundary because
541
+ * they're optional peer deps; we don't want to require `@types/react`
542
+ * just to satisfy strict typecheck.
543
+ */
544
+
545
+ interface TuiOptions {
546
+ readonly files?: readonly string[];
547
+ readonly config?: GoldenMatchConfig;
548
+ }
549
+ /**
550
+ * Launch the GoldenMatch TUI. Resolves once the user quits.
551
+ */
552
+ declare function startTui(options?: TuiOptions): Promise<void>;
553
+
554
+ export { AGENT_CARD, type AgentSkill, type BaseConnector, type BigQueryConfig, BlockResult, type ConnectorConfig, type ConnectorFactory, type ConnectorQuery, type DatabricksConfig, DedupeResult, type DuckDBConfig, type DuckDBConnector, type FileDedupeOptions, type FileSpec, GoldenMatchConfig, type HubSpotConfig, MatchResult, MatchkeyConfig, PairKey, type ParallelWorkerOptions, type PostgresConfig, type PostgresConnector, type PostgresWriteOptions, type ReadCsvOptions, ReviewQueue, Row, type SalesforceConfig, ScoredPair, type SnowflakeConfig, type StartA2aOptions, type StartApiOptions, type SyncOptions, TOOLS, type TuiOptions, type WatchSyncOptions, type WorkerPoolOptions, type WriteCsvOptions, createBigQueryConnector, createDatabricksConnector, createDuckDBConnector, createHubSpotConnector, createPostgresConnector, createSalesforceConnector, createSnowflakeConnector, dedupeFile, handleTool, listConnectors, loadConfigFile, loadConnector, matchFiles, readCsv, readFile, readJson, registerConnector, scoreBlocksConcurrent, scoreBlocksParallel, startA2aServer, startApiServer, startMcpServer, startTui, syncDedupe, watchSync, writeConfigFile, writeCsv, writeJson };