goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,554 @@
|
|
|
1
|
+
import { R as Row, G as GoldenMatchConfig, D as DedupeResult, a as MatchResult, B as BlockResult, M as MatchkeyConfig, P as PairKey, S as ScoredPair } from '../types-DhUdX5Rc.js';
|
|
2
|
+
export { b as BlockingConfig, c as BlockingKeyConfig, d as BudgetConfig, C as CanopyConfig, e as ClusterInfo, f as ClusterProvenance, g as ColumnValue, h as DedupeStats, i as DomainConfig, E as ExactMatchkey, F as FieldProvenance, j as GoldenFieldRule, k as GoldenRulesConfig, I as InputConfig, l as InputFileConfig, L as LLMScorerConfig, m as LearningConfig, n as MakeMatchkeyConfigInput, o as MatchkeyField, p as MemoryConfig, O as OutputConfig, q as ProbabilisticMatchkey, Q as QualityConfig, r as SortKeyField, s as StandardizationConfig, T as TransformConfig, V as VALID_SCORERS, t as VALID_STANDARDIZERS, u as VALID_STRATEGIES, v as VALID_TRANSFORMS, w as ValidationConfig, x as ValidationRuleConfig, W as WeightedMatchkey, y as getMatchkeys, z as makeBlockingConfig, A as makeConfig, H as makeGoldenRulesConfig, J as makeMatchkeyConfig, K as makeMatchkeyField, N as makeScoredPair } from '../types-DhUdX5Rc.js';
|
|
3
|
+
export { ANNBlocker, ANNBlockerBase, ANNBlockerOptions, AutoFixLog, AutoconfigOptions, BudgetSnapshot, BudgetTracker, BuildANNOptions, CCMSResult, ClusterExplanation, ColumnProfile, Correction, CreateANNBlockerOptions, CrossEncoderHttpError, CrossEncoderModel, CrossEncoderModelOptions, CrossEncoderOptions, CrossEncoderProvider, CrossEncoderReranker, DatasetProfile, DomainProfile, EMResult, Embedder, EmbedderError, EmbedderOptions, EmbedderProvider, EmbeddingResult, EvalResult, GatedResult, GraphERResult, HNSWANNBlocker, HNSWIndexLike, HNSWModule, HNSWOptions, LLMScoreResult, LearnedParams, LearnedPredicate, LearnedRules, LineageBundle, LineageEdge, MemoryLearner, MemoryStore, MemoryStoreConfig, PPRLConfig, PPRLResult, PairExplanation, QualityFinding, Relationship, ReviewItem, SensitivityResult, StreamProcessor, SweepParam, SweepPoint, TableSchema, TabularData, UnionFind, ValidationReport, ValidationRule, _resetCrossEncoderModelCache, addRowIds, addSourceColumn, addToCluster, applyColumnMap, applyCorrections, applyLearnedBlocks, applyStandardization, applyStandardizer, applyTransform, applyTransforms, asString, autoConfigurePPRL, autoConfigureRows, autoFixRows, buildANNBlocks, buildANNPairBlocks, buildAdaptiveBlocks, buildBlocks, buildBlocksAsync, buildClusters, buildComparisonVector, buildGoldenRecord, buildGoldenRecordWithProvenance, buildLineage, buildMst, buildMultiPassBlocks, buildStaticBlocks, compareClusters, computeClusterConfidence, computeMatchkeyValue, computeMatchkeys, concatRows, configToYaml, cosineSim, countTokensApprox, createANNBlocker, dedupe, detectDomain, diceCoefficient, ensembleScore, euclideanDist, evaluateClusters, evaluatePairs, explainCluster, explainPair, extractFeatures, findExactMatches, findExactMatchesOne, findFuzzyMatches, gatePairs, getClusterPairScores, getEmbedder, hashRow, indelDistance, indelSimilarity, isNullish, jaccardSimilarity, jaro, jaroWinkler, learnBlockingRules, levenshteinDistance, levenshteinSimilarity, 
lineageFromJson, lineageToJson, llmClusterPairs, llmScorePairs, loadGroundTruthPairs, match, matchOne, mergeField, metaphone, pairKey, parseConfig, parseConfigYaml, parsePairKey, profileRows, rerankPair, rerankTopPairs, runDedupePipeline, runGraphER, runMatchPipeline, runPPRL, runQualityCheck, runSensitivity, scanQuality, scoreBlocksSequential, scoreField, scoreMatrix, scorePair, scorePairRecord, scoreProbabilistic, scoreStrings, scoreStringsWithLlm, selectBestBlockingKey, soundex, soundexMatch, splitOversizedCluster, stabilityReport, toColumnValue, tokenSortRatio, trainEM, unmergeCluster, unmergeRecord, validateColumns, validateRows } from '../core/index.js';
|
|
4
|
+
import { createServer } from 'node:http';
|
|
5
|
+
|
|
6
|
+
/**
|
|
7
|
+
* file.ts -- CSV/JSON/JSONL file I/O connector.
|
|
8
|
+
*
|
|
9
|
+
* Node-only: uses node:fs, node:path. NOT edge-safe.
|
|
10
|
+
*
|
|
11
|
+
* CSV parser rules (CRITICAL):
|
|
12
|
+
* - Quoted fields preserve embedded commas and newlines
|
|
13
|
+
* - Doubled quotes inside quoted fields unescape to a single quote
|
|
14
|
+
* - Empty unquoted fields become null
|
|
15
|
+
* - Leading-zero strings (zip codes "01234", SSNs, phones) are NEVER
|
|
16
|
+
* coerced to numbers
|
|
17
|
+
* - Booleans "true"/"false" (case-insensitive) coerce to boolean
|
|
18
|
+
* - Numeric strings coerce to number only when fully numeric and not
|
|
19
|
+
* leading-zero
|
|
20
|
+
* - Supports both `\n` and `\r\n` line endings
|
|
21
|
+
*/
|
|
22
|
+
|
|
23
|
+
/** Options bag accepted by `readCsv`; every field is optional. */
interface ReadCsvOptions {
|
|
24
|
+
/** Field delimiter; pass `"\t"` to read TSV. NOTE(review): the default is not visible in this declaration -- presumably `","`. */
readonly delimiter?: string;
|
|
25
|
+
/** When `false`, rows are keyed by synthetic headers `col_0`, `col_1`, ... */
readonly hasHeader?: boolean;
|
|
26
|
+
/** Text encoding of the file. NOTE(review): the default is not visible here -- confirm in file.ts. */
readonly encoding?: BufferEncoding;
|
|
27
|
+
}
|
|
28
|
+
/**
|
|
29
|
+
* Read a CSV (or TSV, via `delimiter: "\t"`) file from disk.
|
|
30
|
+
*
|
|
31
|
+
* Returns an array of `Row` objects (header -> coerced value). If
|
|
32
|
+
* `hasHeader` is false, synthetic headers `col_0`, `col_1`, ... are used.
|
|
33
|
+
*/
|
|
34
|
+
// The return type is a plain `Row[]` (not a Promise), so callers do not await this.
declare function readCsv(path: string, options?: ReadCsvOptions): Row[];
|
|
35
|
+
/**
|
|
36
|
+
* Read a JSON or JSONL file.
|
|
37
|
+
*
|
|
38
|
+
* - `.json`: expects an array of objects at the top level.
|
|
39
|
+
* - `.jsonl` / `.ndjson`: one JSON object per line.
|
|
40
|
+
*
|
|
41
|
+
* Auto-detected based on whether the first non-whitespace character is `[`.
|
|
42
|
+
*/
|
|
43
|
+
// Takes only a path: unlike `readCsv` there is no options argument. Returns `Row[]` directly (no Promise).
declare function readJson(path: string): Row[];
|
|
44
|
+
/**
|
|
45
|
+
* Dispatch to readCsv / readJson based on file extension.
|
|
46
|
+
*
|
|
47
|
+
* Supported: `.csv`, `.tsv`, `.json`, `.jsonl`, `.ndjson`.
|
|
48
|
+
*/
|
|
49
|
+
// NOTE(review): no options parameter, so callers cannot pass `ReadCsvOptions` through this dispatcher;
// presumably `.tsv` selects a tab delimiter internally, and the handling of unsupported extensions is not visible here -- confirm in file.ts.
declare function readFile(path: string): Row[];
|
|
50
|
+
/** Options bag accepted by `writeCsv`; every field is optional. */
interface WriteCsvOptions {
|
|
51
|
+
/** Columns to write. When omitted, the union of keys from all rows is used, ordered by first appearance. */
readonly columns?: readonly string[];
|
|
52
|
+
/** Field delimiter. NOTE(review): the default is not visible in this declaration -- presumably `","`. */
readonly delimiter?: string;
|
|
53
|
+
}
|
|
54
|
+
/**
|
|
55
|
+
* Write rows to a CSV file. Creates parent directories as needed.
|
|
56
|
+
*
|
|
57
|
+
* If `columns` is not supplied, the union of keys from all rows is used,
|
|
58
|
+
* ordered by first appearance.
|
|
59
|
+
*/
|
|
60
|
+
// `rows` is typed `readonly` (the input array is not mutated through this signature); returns `void`, not a Promise.
declare function writeCsv(path: string, rows: readonly Row[], options?: WriteCsvOptions): void;
|
|
61
|
+
/** Write rows to a JSON file as a pretty-printed array. */
|
|
62
|
+
// Takes no options argument, unlike `writeCsv`; returns `void`, not a Promise.
declare function writeJson(path: string, rows: readonly Row[]): void;
|
|
63
|
+
|
|
64
|
+
/**
|
|
65
|
+
* dedupe-file.ts -- File-based convenience wrappers around dedupe() / match().
|
|
66
|
+
*
|
|
67
|
+
* Node-only: reads from disk, tags rows with __source__, delegates to the
|
|
68
|
+
* edge-safe core API.
|
|
69
|
+
*/
|
|
70
|
+
|
|
71
|
+
/**
|
|
72
|
+
* File specification. Either a bare path (source name is derived from the
|
|
73
|
+
* file's basename without extension) or a tuple `[path, sourceName]`.
|
|
74
|
+
*/
|
|
75
|
+
// The tuple form is declared `readonly`, so both mutable and `as const` two-element tuples are assignable.
type FileSpec = string | readonly [string, string];
|
|
76
|
+
/** Options shared by `dedupeFile` and `matchFiles`; every field is optional at the type level. */
interface FileDedupeOptions {
|
|
77
|
+
/** Input files. Required when calling `dedupeFile(opts)`. */
|
|
78
|
+
readonly files?: readonly FileSpec[];
|
|
79
|
+
/** Full config -- takes precedence over shorthand fields below. */
|
|
80
|
+
readonly config?: GoldenMatchConfig;
|
|
81
|
+
/** Shorthand field. NOTE(review): presumably the column names to match exactly -- confirm against the core `dedupe()` options. */
readonly exact?: readonly string[];
|
|
82
|
+
/** Shorthand field. NOTE(review): presumably maps a column name to a per-field fuzzy threshold or weight -- confirm the meaning of the number. */
readonly fuzzy?: Readonly<Record<string, number>>;
|
|
83
|
+
/** Shorthand field. NOTE(review): presumably the column names used as blocking keys -- confirm. */
readonly blocking?: readonly string[];
|
|
84
|
+
/** Shorthand field. NOTE(review): presumably the overall match-score threshold; range and default are not visible here. */
readonly threshold?: number;
|
|
85
|
+
/** Enable LLM scorer for borderline pairs (not yet implemented in JS). */
|
|
86
|
+
readonly llmScorer?: boolean;
|
|
87
|
+
/** Write golden records to this path (.csv or .json). */
|
|
88
|
+
readonly outputPath?: string;
|
|
89
|
+
}
|
|
90
|
+
/**
|
|
91
|
+
* Deduplicate records across one or more files.
|
|
92
|
+
*
|
|
93
|
+
* Each file's rows are tagged with `__source__ = <sourceName>` before being
|
|
94
|
+
* concatenated and passed to `dedupe()`.
|
|
95
|
+
*
|
|
96
|
+
* @throws if no files are provided or any file cannot be read.
|
|
97
|
+
*/
|
|
98
|
+
// `opts.files` is optional in the `FileDedupeOptions` type but documented there as required for this call.
declare function dedupeFile(opts: FileDedupeOptions): DedupeResult;
|
|
99
|
+
/**
|
|
100
|
+
* Match target records against a reference file.
|
|
101
|
+
*
|
|
102
|
+
* Reads both files, tags with `__source__`, and delegates to `match()`.
|
|
103
|
+
* `opts.files` (if provided) is ignored in favor of the explicit paths.
|
|
104
|
+
*/
|
|
105
|
+
// `opts` may be omitted entirely; the result is a `MatchResult` returned directly (no Promise).
declare function matchFiles(targetPath: string, referencePath: string, opts?: FileDedupeOptions): MatchResult;
|
|
106
|
+
|
|
107
|
+
/**
|
|
108
|
+
* config-file.ts -- YAML config loading/saving from disk.
|
|
109
|
+
*
|
|
110
|
+
* Node-only. Uses `createRequire` so the optional `yaml` peer dependency
|
|
111
|
+
* is resolved lazily without breaking edge-safe ESM builds.
|
|
112
|
+
*/
|
|
113
|
+
|
|
114
|
+
/**
|
|
115
|
+
* Load and parse a YAML config file into a typed GoldenMatchConfig.
|
|
116
|
+
*
|
|
117
|
+
* @throws if the file cannot be read, `yaml` is not installed, or the
|
|
118
|
+
* document does not describe a valid config.
|
|
119
|
+
*/
|
|
120
|
+
// Returns the parsed config directly (no Promise).
declare function loadConfigFile(path: string): GoldenMatchConfig;
|
|
121
|
+
/**
|
|
122
|
+
* Serialize a GoldenMatchConfig to YAML and write it to disk.
|
|
123
|
+
* Creates parent directories as needed.
|
|
124
|
+
*/
|
|
125
|
+
// Returns `void`, not a Promise. NOTE(review): presumably throws when `yaml` is not installed, like `loadConfigFile` -- confirm.
declare function writeConfigFile(path: string, config: GoldenMatchConfig): void;
|
|
126
|
+
|
|
127
|
+
/**
|
|
128
|
+
* base.ts -- Base connector interface and registry.
|
|
129
|
+
*
|
|
130
|
+
* Mirrors goldenmatch.connectors.base from the Python package: a small
|
|
131
|
+
* abstraction over external data sources (Snowflake, BigQuery, etc.) that
|
|
132
|
+
* exposes connect/read/close lifecycle and a name-based registry.
|
|
133
|
+
*/
|
|
134
|
+
|
|
135
|
+
/** Base shape for connector configuration: an open bag of read-only values typed `unknown`. */
interface ConnectorConfig {
|
|
136
|
+
/** Any string key is allowed; values must be narrowed before use because they are `unknown`. */
readonly [key: string]: unknown;
|
|
137
|
+
}
|
|
138
|
+
/** Structured alternative to a raw query string, accepted by `BaseConnector.read`. */
interface ConnectorQuery {
|
|
139
|
+
/** Table to read. For the HubSpot connector this selects the object ("contacts", "companies", "deals", ...). */
readonly table: string;
|
|
140
|
+
/** NOTE(review): presumably the columns to select, with all columns returned when omitted -- confirm per connector. */
readonly columns?: readonly string[];
|
|
141
|
+
/** NOTE(review): presumably the maximum number of rows to return -- confirm per connector. */
readonly limit?: number;
|
|
142
|
+
}
|
|
143
|
+
/** Common connector contract: a `name` plus an async connect / read / close lifecycle. */
interface BaseConnector {
|
|
144
|
+
/** Connector name. NOTE(review): presumably matches the name used in the connector registry -- confirm. */
readonly name: string;
|
|
145
|
+
/** Open the connection to the external source. */
connect(): Promise<void>;
|
|
146
|
+
/** Read rows using either a query string or a `ConnectorQuery`. The HubSpot connector does not support SQL strings. */
read(query: string | ConnectorQuery): Promise<Row[]>;
|
|
147
|
+
/** Close the connection. */
close(): Promise<void>;
|
|
148
|
+
}
|
|
149
|
+
/** Callable that builds a `BaseConnector` from a config of type `C`; this is the factory shape `registerConnector` accepts. */
interface ConnectorFactory<C extends ConnectorConfig = ConnectorConfig> {
|
|
150
|
+
/** Returns the connector synchronously (not a Promise). */
(config: C): BaseConnector;
|
|
151
|
+
}
|
|
152
|
+
/** Register `factory` under `name` in the name-based connector registry. NOTE(review): behavior for an already-registered name is not visible here. */
declare function registerConnector(name: string, factory: ConnectorFactory): void;
|
|
153
|
+
/** Build a connector for `name` from `config`. NOTE(review): presumably looks up the factory registered via `registerConnector` and throws for unknown names -- confirm. */
declare function loadConnector<C extends ConnectorConfig = ConnectorConfig>(name: string, config: C): BaseConnector;
|
|
154
|
+
/** Returns a read-only array of strings. NOTE(review): presumably the currently registered connector names; ordering is not visible here. */
declare function listConnectors(): readonly string[];
|
|
155
|
+
|
|
156
|
+
/**
|
|
157
|
+
* snowflake.ts -- Snowflake connector via the optional `snowflake-sdk` peer dep.
|
|
158
|
+
*/
|
|
159
|
+
|
|
160
|
+
/**
 * Configuration for `createSnowflakeConnector`. Only `account` and `username` are required.
 * NOTE(review): `password` and `privateKey` are both optional at the type level -- presumably one of them must be supplied; confirm in snowflake.ts.
 */
interface SnowflakeConfig {
|
|
161
|
+
readonly account: string;
|
|
162
|
+
readonly username: string;
|
|
163
|
+
readonly password?: string;
|
|
164
|
+
readonly privateKey?: string;
|
|
165
|
+
readonly warehouse?: string;
|
|
166
|
+
readonly database?: string;
|
|
167
|
+
readonly schema?: string;
|
|
168
|
+
readonly role?: string;
|
|
169
|
+
}
|
|
170
|
+
/** Create a Snowflake `BaseConnector` (backed by the optional `snowflake-sdk` peer dependency). Returns synchronously; NOTE(review): presumably nothing is connected until `connect()` -- confirm. */
declare function createSnowflakeConnector(config: SnowflakeConfig): BaseConnector;
|
|
171
|
+
|
|
172
|
+
/**
|
|
173
|
+
* bigquery.ts -- Google BigQuery connector via the optional `@google-cloud/bigquery` peer dep.
|
|
174
|
+
*/
|
|
175
|
+
|
|
176
|
+
/**
 * Configuration for `createBigQueryConnector`. Only `projectId` is required.
 * NOTE(review): `keyFilename` and `credentials` are both optional -- presumably alternative ways to authenticate, with ambient credentials used when neither is set; confirm in bigquery.ts.
 */
interface BigQueryConfig {
|
|
177
|
+
readonly projectId: string;
|
|
178
|
+
readonly keyFilename?: string;
|
|
179
|
+
readonly credentials?: Readonly<Record<string, unknown>>;
|
|
180
|
+
readonly dataset?: string;
|
|
181
|
+
readonly location?: string;
|
|
182
|
+
}
|
|
183
|
+
/** Create a BigQuery `BaseConnector` (backed by the optional `@google-cloud/bigquery` peer dependency). Returns synchronously, not as a Promise. */
declare function createBigQueryConnector(config: BigQueryConfig): BaseConnector;
|
|
184
|
+
|
|
185
|
+
/**
|
|
186
|
+
* databricks.ts -- Databricks SQL warehouse connector via the optional `@databricks/sql` peer dep.
|
|
187
|
+
*/
|
|
188
|
+
|
|
189
|
+
/** Configuration for `createDatabricksConnector`. `serverHostname`, `httpPath` and `token` are required; `catalog` and `schema` are optional. */
interface DatabricksConfig {
|
|
190
|
+
readonly serverHostname: string;
|
|
191
|
+
readonly httpPath: string;
|
|
192
|
+
readonly token: string;
|
|
193
|
+
readonly catalog?: string;
|
|
194
|
+
readonly schema?: string;
|
|
195
|
+
}
|
|
196
|
+
/** Create a Databricks SQL warehouse `BaseConnector` (backed by the optional `@databricks/sql` peer dependency). Returns synchronously, not as a Promise. */
declare function createDatabricksConnector(config: DatabricksConfig): BaseConnector;
|
|
197
|
+
|
|
198
|
+
/**
|
|
199
|
+
* salesforce.ts -- Salesforce connector using the REST API via fetch().
|
|
200
|
+
*
|
|
201
|
+
* No SDK dependency: stays edge-adjacent. Supports either a pre-issued
|
|
202
|
+
* accessToken or the OAuth 2.0 password grant flow.
|
|
203
|
+
*/
|
|
204
|
+
|
|
205
|
+
/**
 * Configuration for `createSalesforceConnector`. Only `instanceUrl` is required at the type level.
 * The connector supports either a pre-issued `accessToken` or the OAuth 2.0 password grant flow.
 * NOTE(review): presumably `clientId`, `clientSecret`, `username`, `password` (and `securityToken`) are the
 * password-grant inputs and are needed only when `accessToken` is absent -- confirm in salesforce.ts.
 */
interface SalesforceConfig {
|
|
206
|
+
readonly instanceUrl: string;
|
|
207
|
+
readonly accessToken?: string;
|
|
208
|
+
readonly clientId?: string;
|
|
209
|
+
readonly clientSecret?: string;
|
|
210
|
+
readonly username?: string;
|
|
211
|
+
readonly password?: string;
|
|
212
|
+
readonly securityToken?: string;
|
|
213
|
+
/** NOTE(review): the expected format (e.g. with or without a leading "v") and the default are not visible here. */
readonly apiVersion?: string;
|
|
214
|
+
}
|
|
215
|
+
/** Create a Salesforce `BaseConnector` that talks to the REST API via `fetch()` (no SDK dependency). Returns synchronously, not as a Promise. */
declare function createSalesforceConnector(config: SalesforceConfig): BaseConnector;
|
|
216
|
+
|
|
217
|
+
/**
|
|
218
|
+
* hubspot.ts -- HubSpot CRM connector via REST API + fetch().
|
|
219
|
+
*
|
|
220
|
+
* `query.table` selects the HubSpot object: "contacts", "companies", "deals", etc.
|
|
221
|
+
* SQL strings are not supported -- use object queries.
|
|
222
|
+
*/
|
|
223
|
+
|
|
224
|
+
/** Configuration for `createHubSpotConnector`. */
interface HubSpotConfig {
|
|
225
|
+
/** Required credential. NOTE(review): whether this is a legacy API key or a private-app token is not visible here. */
readonly apiKey: string;
|
|
226
|
+
/** NOTE(review): presumably overrides the HubSpot API base URL; the default is not visible here. */
readonly apiBase?: string;
|
|
227
|
+
}
|
|
228
|
+
/** Create a HubSpot CRM `BaseConnector` (REST API via `fetch()`). Use object queries with `read`: SQL strings are not supported. */
declare function createHubSpotConnector(config: HubSpotConfig): BaseConnector;
|
|
229
|
+
|
|
230
|
+
/**
|
|
231
|
+
* mcp/server.ts -- GoldenMatch MCP server (stdio transport, JSON-RPC).
|
|
232
|
+
*
|
|
233
|
+
* Node-only: uses node:fs, node:path, node:readline. NOT edge-safe.
|
|
234
|
+
*
|
|
235
|
+
* Exposes ~20 tools covering dedupe, match, scoring, explanation,
|
|
236
|
+
* profiling, auto-config (shorthand), evaluation, and listings.
|
|
237
|
+
*
|
|
238
|
+
* Every tool dispatch is wrapped in try/catch so a single failure never
|
|
239
|
+
* crashes the JSON-RPC loop; errors come back as `{ error: "<msg>" }`.
|
|
240
|
+
*
|
|
241
|
+
* Ports ideas from goldenmatch/mcp/server.py.
|
|
242
|
+
*/
|
|
243
|
+
|
|
244
|
+
/** Descriptor for one MCP tool exposed by the server; all fields are read-only. */
interface Tool {
|
|
245
|
+
/** Tool name. NOTE(review): presumably the value passed as `name` to `handleTool` -- confirm. */
readonly name: string;
|
|
246
|
+
readonly description: string;
|
|
247
|
+
/** Untyped schema object for the tool's arguments. NOTE(review): presumably JSON Schema -- confirm. */
readonly inputSchema: Readonly<Record<string, unknown>>;
|
|
248
|
+
}
|
|
249
|
+
/** Read-only list of tool descriptors (the module header mentions roughly 20 tools). */
declare const TOOLS: readonly Tool[];
|
|
250
|
+
/**
 * Dispatch a single tool call by name with untyped arguments.
 * Per the module header, tool failures come back as `{ error: "<msg>" }`.
 * NOTE(review): whether that means this Promise resolves with the error object rather than rejecting is not visible here.
 */
declare function handleTool(name: string, rawArgs: Record<string, unknown>): Promise<unknown>;
|
|
251
|
+
/**
|
|
252
|
+
* Start the MCP server reading JSON-RPC messages one per line from stdin
|
|
253
|
+
* and writing responses to stdout. Intended for Claude Desktop / any MCP
|
|
254
|
+
* client using stdio transport.
|
|
255
|
+
*
|
|
256
|
+
* Unknown methods return a JSON-RPC error. Bad JSON is logged to stderr
|
|
257
|
+
* (via console.warn) but does not crash the loop.
|
|
258
|
+
*/
|
|
259
|
+
// Takes no arguments and returns `void`: no handle is returned for stopping the server.
declare function startMcpServer(): void;
|
|
260
|
+
|
|
261
|
+
/**
|
|
262
|
+
* api/server.ts -- GoldenMatch REST API server (node:http).
|
|
263
|
+
*
|
|
264
|
+
* Node-only: uses node:http, node:path. NOT edge-safe.
|
|
265
|
+
*
|
|
266
|
+
* Endpoints:
|
|
267
|
+
* GET /health - liveness check
|
|
268
|
+
* POST /dedupe - dedupe a batch of rows (JSON body)
|
|
269
|
+
* POST /match - match target vs reference
|
|
270
|
+
* POST /score - score two strings
|
|
271
|
+
* POST /explain - explain a pair
|
|
272
|
+
* POST /profile - profile a batch of rows
|
|
273
|
+
* POST /clusters - return clusters from dedupe
|
|
274
|
+
* GET /reviews - list pending review items
|
|
275
|
+
* POST /reviews/decide - accept/reject a review item
|
|
276
|
+
*
|
|
277
|
+
* Ports ideas from goldenmatch/api/server.py.
|
|
278
|
+
*/
|
|
279
|
+
|
|
280
|
+
/**
 * A scored pair of rows awaiting (or carrying) a review decision in the REST API's `ReviewQueue`.
 * NOTE(review): a `ReviewItem` is also re-exported from '../core/index.js' near the top of this file;
 * this local interface is a separate declaration -- confirm which one consumers see under this name.
 */
interface ReviewItem {
|
|
281
|
+
readonly id: string;
|
|
282
|
+
/** NOTE(review): presumably the row id of `rowA` -- confirm. */
readonly idA: number;
|
|
283
|
+
/** NOTE(review): presumably the row id of `rowB` -- confirm. */
readonly idB: number;
|
|
284
|
+
readonly score: number;
|
|
285
|
+
readonly rowA: Row;
|
|
286
|
+
readonly rowB: Row;
|
|
287
|
+
/** Mutable (not `readonly`), unlike the fields above. NOTE(review): presumably updated by `ReviewQueue.decide`. */
status: "pending" | "accepted" | "rejected";
|
|
288
|
+
/** Mutable and optional. NOTE(review): presumably a timestamp string set when a decision is made; the format is not visible here. */
decidedAt?: string;
|
|
289
|
+
}
|
|
290
|
+
/** Queue of `ReviewItem`s. NOTE(review): presumably in-memory only -- no persistence is visible in this declaration. */
declare class ReviewQueue {
|
|
291
|
+
private items;
|
|
292
|
+
/**
 * Add an item. The caller supplies everything except `status`; `id` is optional.
 * NOTE(review): presumably `status` starts as "pending" and an id is generated when omitted -- confirm.
 */
enqueue(item: Omit<ReviewItem, "status" | "id"> & {
|
|
293
|
+
id?: string;
|
|
294
|
+
}): ReviewItem;
|
|
295
|
+
/** NOTE(review): presumably the items whose `status` is "pending". */
pending(): ReviewItem[];
|
|
296
|
+
/** Record a decision for item `id`. NOTE(review): presumably returns `null` when no item has that id -- confirm. */
decide(id: string, accept: boolean): ReviewItem | null;
|
|
297
|
+
/** NOTE(review): presumably every item regardless of status. */
all(): ReviewItem[];
|
|
298
|
+
}
|
|
299
|
+
/** Options for `startApiServer`. The documented default address is http://127.0.0.1:8000. */
interface StartApiOptions {
|
|
300
|
+
/** Port to listen on (documented default: 8000). */
readonly port?: number;
|
|
301
|
+
/** Host to bind to (documented default: 127.0.0.1). */
readonly host?: string;
|
|
302
|
+
}
|
|
303
|
+
/**
|
|
304
|
+
* Start the REST API server.
|
|
305
|
+
* Default: http://127.0.0.1:8000.
|
|
306
|
+
*
|
|
307
|
+
* Returns the http.Server so tests can close it.
|
|
308
|
+
*/
|
|
309
|
+
// The return type is whatever `createServer` from `node:http` returns (imported at the top of this file).
declare function startApiServer(options?: StartApiOptions): ReturnType<typeof createServer>;
|
|
310
|
+
|
|
311
|
+
/**
|
|
312
|
+
* a2a/server.ts -- GoldenMatch A2A (Agent-to-Agent) protocol server.
|
|
313
|
+
*
|
|
314
|
+
* Node-only: uses node:http, node:crypto. NOT edge-safe.
|
|
315
|
+
*
|
|
316
|
+
* Endpoints:
|
|
317
|
+
* GET /.well-known/agent.json - agent card (10+ skills)
|
|
318
|
+
* POST /tasks - create a task (skill + input)
|
|
319
|
+
* GET /tasks/{id} - fetch task status/result
|
|
320
|
+
*
|
|
321
|
+
* Ports ideas from goldenmatch/a2a/server.py. This is a simpler
|
|
322
|
+
* synchronous variant (no SSE streaming, no persistent store).
|
|
323
|
+
*/
|
|
324
|
+
|
|
325
|
+
/** One skill entry listed in `AGENT_CARD.skills`; all fields are read-only. */
interface AgentSkill {
|
|
326
|
+
readonly name: string;
|
|
327
|
+
readonly description: string;
|
|
328
|
+
/** NOTE(review): presumably the accepted input content types -- the allowed values are not visible here. */
readonly inputModes: readonly string[];
|
|
329
|
+
/** NOTE(review): presumably the produced output content types -- the allowed values are not visible here. */
readonly outputModes: readonly string[];
|
|
330
|
+
}
|
|
331
|
+
/**
 * The A2A agent card object. The module header lists `GET /.well-known/agent.json` as the agent-card endpoint;
 * NOTE(review): presumably this constant is what that endpoint serves -- confirm in a2a/server.ts.
 */
declare const AGENT_CARD: {
|
|
332
|
+
readonly name: string;
|
|
333
|
+
readonly description: string;
|
|
334
|
+
readonly version: string;
|
|
335
|
+
readonly provider: {
|
|
336
|
+
readonly organization: string;
|
|
337
|
+
readonly url: string;
|
|
338
|
+
};
|
|
339
|
+
/** Boolean flags keyed by capability name; the set of keys is not fixed by this type. */
readonly capabilities: Readonly<Record<string, boolean>>;
|
|
340
|
+
readonly skills: readonly AgentSkill[];
|
|
341
|
+
};
|
|
342
|
+
/** Options for `startA2aServer`. NOTE(review): defaults are not visible in this declaration. */
interface StartA2aOptions {
|
|
343
|
+
readonly port?: number;
|
|
344
|
+
readonly host?: string;
|
|
345
|
+
}
|
|
346
|
+
/** Start the A2A protocol server. Returns the `node:http` server instance (same return type as `startApiServer`). */
declare function startA2aServer(options?: StartA2aOptions): ReturnType<typeof createServer>;
|
|
347
|
+
|
|
348
|
+
/**
|
|
349
|
+
* workers.ts -- Concurrent and parallel block scoring for Node.
|
|
350
|
+
*
|
|
351
|
+
* Python uses ThreadPoolExecutor (rapidfuzz releases the GIL). JS is
|
|
352
|
+
* single-threaded by default; true parallelism requires `worker_threads`
|
|
353
|
+
* with serialization overhead that's only worth it for large blocks.
|
|
354
|
+
*
|
|
355
|
+
* This module ships two schedulers:
|
|
356
|
+
* - `scoreBlocksConcurrent` -- Promise.all batching on the main thread.
|
|
357
|
+
* No real parallelism, but zero setup cost and good for small/medium
|
|
358
|
+
* block counts.
|
|
359
|
+
* - `scoreBlocksParallel` -- piscina-backed worker pool for true CPU
|
|
360
|
+
* parallelism. Optional peer dep; falls back to `scoreBlocksConcurrent`
|
|
361
|
+
* when piscina isn't installed.
|
|
362
|
+
*
|
|
363
|
+
* Mirrors the shape of `goldenmatch.backends.ray_backend.score_blocks_ray`
|
|
364
|
+
* from the Python source, but stays inside one Node process.
|
|
365
|
+
*/
|
|
366
|
+
|
|
367
|
+
/** Options for `scoreBlocksConcurrent` (main-thread batching). */
interface WorkerPoolOptions {
|
|
368
|
+
/** Max blocks scored concurrently per batch. Defaults to 4. */
|
|
369
|
+
readonly batchSize?: number;
|
|
370
|
+
}
|
|
371
|
+
/** Options for `scoreBlocksParallel` (piscina worker pool sizing and idle behavior). */
interface ParallelWorkerOptions {
|
|
372
|
+
/** Max worker threads. Defaults to min(8, max(2, blocks.length)). */
|
|
373
|
+
readonly maxThreads?: number;
|
|
374
|
+
/** Min worker threads kept warm. Defaults to 1. */
|
|
375
|
+
readonly minThreads?: number;
|
|
376
|
+
/** Idle timeout in ms before workers exit. Defaults to 1000. */
|
|
377
|
+
readonly idleTimeout?: number;
|
|
378
|
+
}
|
|
379
|
+
/**
|
|
380
|
+
* Score blocks with cooperative concurrency.
|
|
381
|
+
*
|
|
382
|
+
* - For 0 blocks: returns empty.
|
|
383
|
+
* - For <= 2 blocks: skips batching overhead and runs sequentially via
|
|
384
|
+
* `scoreBlocksSequential` (mirrors Python's small-block fast path).
|
|
385
|
+
* - Otherwise: schedules blocks in batches of `batchSize`, awaiting each
|
|
386
|
+
* batch with `Promise.all` so the event loop can interleave I/O.
|
|
387
|
+
*
|
|
388
|
+
* Note: `matchedPairs` is mutated as new pairs are discovered (consistent
|
|
389
|
+
* with `scoreBlocksSequential`). A frozen snapshot is used per block so
|
|
390
|
+
* concurrent batches see a stable exclusion set, matching Python's
|
|
391
|
+
* `score_blocks_parallel` contract.
|
|
392
|
+
*/
|
|
393
|
+
// `matchedPairs` is a mutable `Set` because it is updated in place; the resolved array is `readonly`.
declare function scoreBlocksConcurrent(blocks: readonly BlockResult[], mk: MatchkeyConfig, matchedPairs: Set<PairKey>, options?: WorkerPoolOptions): Promise<readonly ScoredPair[]>;
|
|
394
|
+
/**
|
|
395
|
+
* Score blocks in true parallel via piscina worker_threads.
|
|
396
|
+
*
|
|
397
|
+
* - For 0 blocks: returns empty.
|
|
398
|
+
* - For <= 2 blocks: runs sequentially (spinning up workers isn't worth it).
|
|
399
|
+
* - Otherwise: dispatches each block to a piscina worker that runs
|
|
400
|
+
* `findFuzzyMatches` in its own V8 isolate, giving true CPU parallelism.
|
|
401
|
+
*
|
|
402
|
+
* Falls back to `scoreBlocksConcurrent` with a console warning if piscina
|
|
403
|
+
* isn't installed (it's an optional peer dep).
|
|
404
|
+
*
|
|
405
|
+
* `matchedPairs` is mutated in place with newly discovered pairs, matching
|
|
406
|
+
* the contract of `scoreBlocksSequential` / `scoreBlocksConcurrent`.
|
|
407
|
+
*/
|
|
408
|
+
// Same first three parameters and return type as `scoreBlocksConcurrent`; only the options type differs (`ParallelWorkerOptions`).
declare function scoreBlocksParallel(blocks: readonly BlockResult[], mk: MatchkeyConfig, matchedPairs: Set<PairKey>, options?: ParallelWorkerOptions): Promise<readonly ScoredPair[]>;
|
|
409
|
+
|
|
410
|
+
/**
|
|
411
|
+
* duckdb.ts -- Optional DuckDB connector for Node.
|
|
412
|
+
*
|
|
413
|
+
* Mirrors `goldenmatch.backends.duckdb_backend.DuckDBBackend` from Python.
|
|
414
|
+
*
|
|
415
|
+
* Peer dependency (NOT in package.json -- install on demand):
|
|
416
|
+
* npm install @duckdb/node-api
|
|
417
|
+
*
|
|
418
|
+
* The dep is loaded via `createRequire` so the package stays importable
|
|
419
|
+
* on edge runtimes and in environments without DuckDB.
|
|
420
|
+
*/
|
|
421
|
+
|
|
422
|
+
/** Configuration for `createDuckDBConnector`; may be omitted entirely at the call site. */
interface DuckDBConfig {
|
|
423
|
+
/** Database path. Defaults to `:memory:`. */
|
|
424
|
+
readonly path?: string;
|
|
425
|
+
}
|
|
426
|
+
/**
 * Handle returned by `createDuckDBConnector`. It has no `connect()` method; the factory itself is async.
 * This is a separate shape from `BaseConnector` (different method names, synchronous `close`).
 */
interface DuckDBConnector {
|
|
427
|
+
/** Read rows from `table`. NOTE(review): presumably all rows; how the table name is quoted is not visible here. */
readTable(table: string): Promise<Row[]>;
|
|
428
|
+
/** Run a SQL string and resolve to the resulting rows. */
readQuery(sql: string): Promise<Row[]>;
|
|
429
|
+
/** Write `rows` to `table`. NOTE(review): `schema` presumably maps column name to SQL type, inferred when omitted -- confirm. */
writeTable(table: string, rows: readonly Row[], schema?: Readonly<Record<string, string>>): Promise<void>;
|
|
430
|
+
listTables(): Promise<string[]>;
|
|
431
|
+
/** Synchronous (`void`), unlike the Promise-returning `close()` on `BaseConnector` and `PostgresConnector`. */
close(): void;
|
|
432
|
+
}
|
|
433
|
+
/**
|
|
434
|
+
* Create a DuckDB connector. Throws if `@duckdb/node-api` isn't installed.
|
|
435
|
+
*
|
|
436
|
+
* Async because the underlying DuckDB API is async-only (instance + connection
|
|
437
|
+
* setup both return Promises).
|
|
438
|
+
*/
|
|
439
|
+
// `config` may be omitted; `DuckDBConfig.path` then takes its documented default of `:memory:`.
declare function createDuckDBConnector(config?: DuckDBConfig): Promise<DuckDBConnector>;
|
|
440
|
+
|
|
441
|
+
/**
|
|
442
|
+
* postgres.ts -- Optional Postgres connector for Node.
|
|
443
|
+
*
|
|
444
|
+
* Mirrors `goldenmatch.db.connector.PostgresConnector` from Python.
|
|
445
|
+
*
|
|
446
|
+
* Peer dependency (NOT in package.json -- install on demand):
|
|
447
|
+
* npm install pg
|
|
448
|
+
*
|
|
449
|
+
* The dep is loaded via `createRequire` so the package stays importable
|
|
450
|
+
* on edge runtimes and in environments without Postgres.
|
|
451
|
+
*/
|
|
452
|
+
|
|
453
|
+
/**
 * Configuration for `createPostgresConnector`; every field is optional at the type level.
 * NOTE(review): presumably either `connectionString` or the discrete host/port/database/user/password
 * fields are used -- precedence when both are given is not visible here; confirm in postgres.ts.
 */
interface PostgresConfig {
|
|
454
|
+
readonly connectionString?: string;
|
|
455
|
+
readonly host?: string;
|
|
456
|
+
readonly port?: number;
|
|
457
|
+
readonly database?: string;
|
|
458
|
+
readonly user?: string;
|
|
459
|
+
readonly password?: string;
|
|
460
|
+
/** Boolean only: no certificate or TLS detail can be passed through this type. */
readonly ssl?: boolean;
|
|
461
|
+
}
|
|
462
|
+
/** Options for `PostgresConnector.writeTable`. */
interface PostgresWriteOptions {
|
|
463
|
+
/** When set, the write uses `INSERT ... ON CONFLICT (primaryKey) DO UPDATE`. */
readonly upsert?: boolean;
|
|
464
|
+
/** Conflict-target column for the upsert. NOTE(review): behavior when `upsert` is set without `primaryKey` is not visible here. */
readonly primaryKey?: string;
|
|
465
|
+
}
|
|
466
|
+
/**
 * Connector handle produced by `createPostgresConnector`.
 *
 * `connect()` must be called before any query. Every operation, including
 * `close`, is Promise-based.
 */
interface PostgresConnector {
  /** Opens the connection; required before any other call. */
  connect(): Promise<void>;
  /**
   * Runs `sql` and resolves with the rows typed as `T` (defaults to `Row`).
   *
   * NOTE(review): `params` are presumably bound as positional query
   * parameters -- confirm against the implementation.
   */
  query<T = Row>(sql: string, params?: readonly unknown[]): Promise<T[]>;
  /** Resolves with rows; presumably the full contents of `table` -- confirm. */
  readTable(table: string): Promise<Row[]>;
  /**
   * Writes `rows` to `table`. Inserts are batched in chunks of 1000 rows;
   * with `options.upsert` the write uses
   * `INSERT ... ON CONFLICT (primaryKey) DO UPDATE`.
   */
  writeTable(table: string, rows: readonly Row[], options?: PostgresWriteOptions): Promise<void>;
  /** Resolves with a list of names; `schema` presumably narrows the lookup. */
  listTables(schema?: string): Promise<string[]>;
  /** Closes the connection; await it to know when shutdown has finished. */
  close(): Promise<void>;
}
|
|
474
|
+
/**
 * Create a Postgres connector. Throws if `pg` isn't installed.
 *
 * The returned connector requires `connect()` before any query. Inserts
 * are batched in chunks of 1000 rows. When `options.upsert` is set, the
 * write uses `INSERT ... ON CONFLICT (primaryKey) DO UPDATE`.
 *
 * @param config Connection settings (required, although each field is optional).
 * @returns The connector itself, not a promise; no connection is open until
 *   `connect()` is awaited.
 */
declare function createPostgresConnector(config: PostgresConfig): PostgresConnector;
|
|
482
|
+
|
|
483
|
+
/**
 * sync.ts -- Postgres-backed dedupe sync + watch helpers.
 *
 * Mirrors `goldenmatch.db.sync` from Python: read source table, run
 * dedupe, write golden + cluster tables back to Postgres.
 */
|
|
489
|
+
|
|
490
|
+
/** Inputs for a single `syncDedupe` pass. */
interface SyncOptions {
  /** Postgres connection settings. */
  readonly pg: PostgresConfig;
  /** Table whose rows are read and deduplicated. */
  readonly sourceTable: string;
  /** Table that receives the golden records. */
  readonly goldenTable: string;
  /** Optional table to write cluster summaries (cluster_id, members, size, ...). */
  readonly clustersTable?: string;
  /** Configuration for the dedupe run. */
  readonly config: GoldenMatchConfig;
}
|
|
498
|
+
/**
 * Run a single dedupe pass against Postgres.
 *
 * 1. Read all rows from `sourceTable`.
 * 2. Dedupe via the core pipeline.
 * 3. Write golden records to `goldenTable`.
 * 4. Optionally write cluster summaries to `clustersTable`.
 *
 * Always closes the connection on the way out.
 *
 * @param options Connection settings, table names, and the dedupe config.
 * @returns A promise for the `DedupeResult` of the pass.
 */
declare function syncDedupe(options: SyncOptions): Promise<DedupeResult>;
|
|
509
|
+
/** `SyncOptions` plus the polling cadence used by `watchSync`. */
interface WatchSyncOptions extends SyncOptions {
  /** Polling interval in ms. Defaults to 60_000 (1 minute). */
  readonly intervalMs?: number;
}
|
|
513
|
+
/**
 * Run `syncDedupe` on a recurring interval.
 *
 * Errors in any iteration are logged via `console.warn` so the loop keeps
 * running; callers should monitor `onResult` to confirm forward progress.
 *
 * @param options Sync settings plus the optional polling interval.
 * @param onResult Optional callback that receives a `DedupeResult`
 *   (presumably once per successful pass -- confirm against the implementation).
 * @returns A promise that resolves to a `stop` function. The declared type is
 *   `Promise<() => void>`, so the result must be awaited before `stop` can
 *   be called.
 */
declare function watchSync(options: WatchSyncOptions, onResult?: (result: DedupeResult) => void): Promise<() => void>;
|
|
521
|
+
|
|
522
|
+
/**
 * app.ts -- GoldenMatch interactive TUI built on `ink` (React for CLIs).
 *
 * This module loads `ink` and `react` lazily via `createRequire` so the rest
 * of the package stays usable without those optional peer dependencies.
 *
 * The UI mirrors the Python Textual TUI: 6 tabs (Data, Config, Matches,
 * Golden, Boost, Export) with keyboard navigation [1..6], [Tab] to cycle,
 * [r] to run dedupe, [q] / [Esc] to quit.
 *
 * Richer ink-ecosystem addons (ink-table, ink-select-input, ink-text-input,
 * ink-spinner, ink-gradient) are optional peer deps loaded lazily via
 * ./widgets.js. Each tab degrades gracefully to plain text when an addon is
 * not installed.
 *
 * Implementation notes:
 * - Uses React.createElement directly (no JSX) so we don't need a JSX
 *   transform in the existing tsup build.
 * - The `ink` / `react` modules are typed as `any` at the boundary because
 *   they're optional peer deps; we don't want to require `@types/react`
 *   just to satisfy strict typecheck.
 */
|
|
544
|
+
|
|
545
|
+
/** Optional inputs for `startTui`. */
interface TuiOptions {
  /** NOTE(review): presumably paths of data files to load at startup -- confirm. */
  readonly files?: readonly string[];
  /** NOTE(review): presumably the initial dedupe configuration -- confirm. */
  readonly config?: GoldenMatchConfig;
}
|
|
549
|
+
/**
 * Launch the GoldenMatch TUI. Resolves once the user quits.
 *
 * @param options Optional files and configuration; may be omitted entirely.
 * @returns A promise that settles when the TUI session ends.
 */
declare function startTui(options?: TuiOptions): Promise<void>;
|
|
553
|
+
|
|
554
|
+
// Export list for this declaration file. Entries prefixed with `type` are
// type-only exports; the remaining names are exported without that modifier.
// NOTE(review): several unprefixed names (e.g. Row, DedupeResult,
// GoldenMatchConfig) are not declared in this part of the file -- presumably
// they are imported from a shared types module; confirm before reordering.
export { AGENT_CARD, type AgentSkill, type BaseConnector, type BigQueryConfig, BlockResult, type ConnectorConfig, type ConnectorFactory, type ConnectorQuery, type DatabricksConfig, DedupeResult, type DuckDBConfig, type DuckDBConnector, type FileDedupeOptions, type FileSpec, GoldenMatchConfig, type HubSpotConfig, MatchResult, MatchkeyConfig, PairKey, type ParallelWorkerOptions, type PostgresConfig, type PostgresConnector, type PostgresWriteOptions, type ReadCsvOptions, ReviewQueue, Row, type SalesforceConfig, ScoredPair, type SnowflakeConfig, type StartA2aOptions, type StartApiOptions, type SyncOptions, TOOLS, type TuiOptions, type WatchSyncOptions, type WorkerPoolOptions, type WriteCsvOptions, createBigQueryConnector, createDatabricksConnector, createDuckDBConnector, createHubSpotConnector, createPostgresConnector, createSalesforceConnector, createSnowflakeConnector, dedupeFile, handleTool, listConnectors, loadConfigFile, loadConnector, matchFiles, readCsv, readFile, readJson, registerConnector, scoreBlocksConcurrent, scoreBlocksParallel, startA2aServer, startApiServer, startMcpServer, startTui, syncDedupe, watchSync, writeConfigFile, writeCsv, writeJson };
|