goldenmatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. package/README.md +140 -0
  2. package/dist/cli.cjs +6079 -0
  3. package/dist/cli.cjs.map +1 -0
  4. package/dist/cli.d.cts +1 -0
  5. package/dist/cli.d.ts +1 -0
  6. package/dist/cli.js +6076 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/core/index.cjs +8449 -0
  9. package/dist/core/index.cjs.map +1 -0
  10. package/dist/core/index.d.cts +1972 -0
  11. package/dist/core/index.d.ts +1972 -0
  12. package/dist/core/index.js +8318 -0
  13. package/dist/core/index.js.map +1 -0
  14. package/dist/index.cjs +8449 -0
  15. package/dist/index.cjs.map +1 -0
  16. package/dist/index.d.cts +2 -0
  17. package/dist/index.d.ts +2 -0
  18. package/dist/index.js +8318 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/node/backends/score-worker.cjs +934 -0
  21. package/dist/node/backends/score-worker.cjs.map +1 -0
  22. package/dist/node/backends/score-worker.d.cts +14 -0
  23. package/dist/node/backends/score-worker.d.ts +14 -0
  24. package/dist/node/backends/score-worker.js +932 -0
  25. package/dist/node/backends/score-worker.js.map +1 -0
  26. package/dist/node/index.cjs +11430 -0
  27. package/dist/node/index.cjs.map +1 -0
  28. package/dist/node/index.d.cts +554 -0
  29. package/dist/node/index.d.ts +554 -0
  30. package/dist/node/index.js +11277 -0
  31. package/dist/node/index.js.map +1 -0
  32. package/dist/types-DhUdX5Rc.d.cts +304 -0
  33. package/dist/types-DhUdX5Rc.d.ts +304 -0
  34. package/examples/01-basic-dedupe.ts +60 -0
  35. package/examples/02-match-two-datasets.ts +48 -0
  36. package/examples/03-csv-file-pipeline.ts +62 -0
  37. package/examples/04-string-scoring.ts +63 -0
  38. package/examples/05-custom-config.ts +94 -0
  39. package/examples/06-probabilistic-fs.ts +72 -0
  40. package/examples/07-pprl-privacy.ts +76 -0
  41. package/examples/08-streaming.ts +79 -0
  42. package/examples/09-llm-scorer.ts +79 -0
  43. package/examples/10-explain.ts +60 -0
  44. package/examples/11-evaluate.ts +61 -0
  45. package/examples/README.md +53 -0
  46. package/package.json +66 -0
  47. package/src/cli.ts +372 -0
  48. package/src/core/ann-blocker.ts +593 -0
  49. package/src/core/api.ts +220 -0
  50. package/src/core/autoconfig.ts +363 -0
  51. package/src/core/autofix.ts +102 -0
  52. package/src/core/blocker.ts +655 -0
  53. package/src/core/cluster.ts +699 -0
  54. package/src/core/compare-clusters.ts +176 -0
  55. package/src/core/config/loader.ts +869 -0
  56. package/src/core/cross-encoder.ts +614 -0
  57. package/src/core/data.ts +430 -0
  58. package/src/core/domain.ts +277 -0
  59. package/src/core/embedder.ts +562 -0
  60. package/src/core/evaluate.ts +156 -0
  61. package/src/core/explain.ts +352 -0
  62. package/src/core/golden.ts +524 -0
  63. package/src/core/graph-er.ts +371 -0
  64. package/src/core/index.ts +314 -0
  65. package/src/core/ingest.ts +112 -0
  66. package/src/core/learned-blocking.ts +305 -0
  67. package/src/core/lineage.ts +221 -0
  68. package/src/core/llm/budget.ts +258 -0
  69. package/src/core/llm/cluster.ts +542 -0
  70. package/src/core/llm/scorer.ts +396 -0
  71. package/src/core/match-one.ts +95 -0
  72. package/src/core/matchkey.ts +97 -0
  73. package/src/core/memory/corrections.ts +179 -0
  74. package/src/core/memory/learner.ts +218 -0
  75. package/src/core/memory/store.ts +114 -0
  76. package/src/core/pipeline.ts +366 -0
  77. package/src/core/pprl/protocol.ts +216 -0
  78. package/src/core/probabilistic.ts +511 -0
  79. package/src/core/profiler.ts +212 -0
  80. package/src/core/quality.ts +197 -0
  81. package/src/core/review-queue.ts +177 -0
  82. package/src/core/scorer.ts +855 -0
  83. package/src/core/sensitivity.ts +196 -0
  84. package/src/core/standardize.ts +279 -0
  85. package/src/core/streaming.ts +128 -0
  86. package/src/core/transforms.ts +599 -0
  87. package/src/core/types.ts +570 -0
  88. package/src/core/validate.ts +243 -0
  89. package/src/index.ts +8 -0
  90. package/src/node/a2a/server.ts +470 -0
  91. package/src/node/api/server.ts +412 -0
  92. package/src/node/backends/duckdb.ts +130 -0
  93. package/src/node/backends/score-worker.ts +41 -0
  94. package/src/node/backends/workers.ts +212 -0
  95. package/src/node/config-file.ts +66 -0
  96. package/src/node/connectors/base.ts +57 -0
  97. package/src/node/connectors/bigquery.ts +61 -0
  98. package/src/node/connectors/databricks.ts +69 -0
  99. package/src/node/connectors/file.ts +350 -0
  100. package/src/node/connectors/hubspot.ts +62 -0
  101. package/src/node/connectors/index.ts +43 -0
  102. package/src/node/connectors/salesforce.ts +93 -0
  103. package/src/node/connectors/snowflake.ts +73 -0
  104. package/src/node/db/postgres.ts +173 -0
  105. package/src/node/db/sync.ts +103 -0
  106. package/src/node/dedupe-file.ts +156 -0
  107. package/src/node/index.ts +89 -0
  108. package/src/node/mcp/server.ts +940 -0
  109. package/src/node/tui/app.ts +756 -0
  110. package/src/node/tui/index.ts +6 -0
  111. package/src/node/tui/widgets.ts +128 -0
  112. package/tests/parity/scorer-ground-truth.test.ts +118 -0
  113. package/tests/smoke.test.ts +46 -0
  114. package/tests/unit/a2a-server.test.ts +175 -0
  115. package/tests/unit/ann-blocker.test.ts +117 -0
  116. package/tests/unit/api-server.test.ts +239 -0
  117. package/tests/unit/api.test.ts +77 -0
  118. package/tests/unit/autoconfig.test.ts +103 -0
  119. package/tests/unit/autofix.test.ts +71 -0
  120. package/tests/unit/blocker.test.ts +164 -0
  121. package/tests/unit/buildBlocksAsync.test.ts +63 -0
  122. package/tests/unit/cluster.test.ts +213 -0
  123. package/tests/unit/compare-clusters.test.ts +42 -0
  124. package/tests/unit/config-loader.test.ts +301 -0
  125. package/tests/unit/connectors-base.test.ts +48 -0
  126. package/tests/unit/cross-encoder-model.test.ts +198 -0
  127. package/tests/unit/cross-encoder.test.ts +173 -0
  128. package/tests/unit/db-connectors.test.ts +37 -0
  129. package/tests/unit/domain.test.ts +80 -0
  130. package/tests/unit/embedder.test.ts +151 -0
  131. package/tests/unit/evaluate.test.ts +85 -0
  132. package/tests/unit/explain.test.ts +73 -0
  133. package/tests/unit/golden.test.ts +97 -0
  134. package/tests/unit/graph-er.test.ts +173 -0
  135. package/tests/unit/hnsw-ann.test.ts +283 -0
  136. package/tests/unit/hubspot-connector.test.ts +118 -0
  137. package/tests/unit/ingest.test.ts +97 -0
  138. package/tests/unit/learned-blocking.test.ts +134 -0
  139. package/tests/unit/lineage.test.ts +135 -0
  140. package/tests/unit/match-one.test.ts +129 -0
  141. package/tests/unit/matchkey.test.ts +97 -0
  142. package/tests/unit/mcp-server.test.ts +183 -0
  143. package/tests/unit/memory.test.ts +119 -0
  144. package/tests/unit/pipeline.test.ts +118 -0
  145. package/tests/unit/pprl-protocol.test.ts +381 -0
  146. package/tests/unit/probabilistic.test.ts +494 -0
  147. package/tests/unit/profiler.test.ts +68 -0
  148. package/tests/unit/review-queue.test.ts +68 -0
  149. package/tests/unit/salesforce-connector.test.ts +148 -0
  150. package/tests/unit/scorer.test.ts +301 -0
  151. package/tests/unit/sensitivity.test.ts +154 -0
  152. package/tests/unit/standardize.test.ts +84 -0
  153. package/tests/unit/streaming.test.ts +82 -0
  154. package/tests/unit/transforms.test.ts +208 -0
  155. package/tests/unit/tui-widgets.test.ts +42 -0
  156. package/tests/unit/tui.test.ts +24 -0
  157. package/tests/unit/validate.test.ts +145 -0
  158. package/tests/unit/workers-parallel.test.ts +99 -0
  159. package/tests/unit/workers.test.ts +74 -0
  160. package/tsconfig.json +25 -0
  161. package/tsup.config.ts +37 -0
  162. package/vitest.config.ts +11 -0
@@ -0,0 +1,258 @@
1
+ /**
2
+ * budget.ts — LLM budget tracking: cost accounting, model tiering,
3
+ * and graceful degradation. Ports `goldenmatch/core/llm_budget.py`.
4
+ *
5
+ * Edge-safe: no `node:` imports, no `process`.
6
+ */
7
+
8
+ import type { BudgetConfig } from "../types.js";
9
+
10
+ // ---------------------------------------------------------------------------
11
+ // Model pricing (per 1M tokens, USD)
12
+ //
13
+ // The Python reference uses per-1K tokens; we use per-1M internally because
14
+ // that matches vendor pricing pages. The `estimateCost` math divides by
15
+ // 1_000_000, so the final numbers match either convention.
16
+ // ---------------------------------------------------------------------------
17
+
18
/** USD rates for a single model, quoted per 1M tokens. */
type ModelPricing = { readonly input: number; readonly output: number };

/**
 * Known per-model rates, keyed by the exact model identifier that callers
 * pass to the tracker. Models absent from this table are billed at
 * `DEFAULT_PRICING`.
 */
const PRICING: Record<string, ModelPricing> = {
  "gpt-4o-mini": { input: 0.15, output: 0.6 },
  "gpt-4o": { input: 2.5, output: 10.0 },
  "gpt-4-turbo": { input: 10.0, output: 30.0 },
  "claude-3-5-haiku-latest": { input: 0.8, output: 4.0 },
  "claude-3-5-sonnet-latest": { input: 3.0, output: 15.0 },
  "claude-haiku-4-5-20251001": { input: 0.8, output: 4.0 },
  "claude-sonnet-4-20250514": { input: 3.0, output: 15.0 },
  "claude-opus-4-6": { input: 15.0, output: 75.0 },
};

/** Fallback rates used for any model that has no entry in `PRICING`. */
const DEFAULT_PRICING: ModelPricing = { input: 1.0, output: 4.0 };
31
+
32
+ // ---------------------------------------------------------------------------
33
+ // Snapshot type
34
+ // ---------------------------------------------------------------------------
35
+
36
/** Immutable point-in-time view of a tracker, as returned by `snapshot()`. */
export interface BudgetSnapshot {
  /** Number of calls recorded so far. */
  readonly calls: number;
  /** Total input tokens across all recorded calls. */
  readonly inputTokens: number;
  /** Total output tokens across all recorded calls. */
  readonly outputTokens: number;
  /** Accumulated spend in USD. */
  readonly costUsd: number;
  /** The tracker's default model. */
  readonly model: string;
  /** Count of recorded calls per model name. */
  readonly modelsUsed: Readonly<Record<string, number>>;
  /** Calls left before the call limit is hit; `null` when no call limit is set. */
  readonly remainingCalls: number | null;
  /** USD left before the cost limit is hit; `null` when no cost limit is set. */
  readonly remainingUsd: number | null;
  /** Share of the budget consumed, as a percentage capped at 100. */
  readonly pctUsed: number;
  /** True once a configured call or cost limit has been reached. */
  readonly exhausted: boolean;
}
48
+
49
+ // ---------------------------------------------------------------------------
50
+ // BudgetTracker
51
+ // ---------------------------------------------------------------------------
52
+
53
/**
 * Tracks LLM token usage and spend, and enforces the configured limits.
 *
 * Mirrors `goldenmatch.core.llm_budget.BudgetTracker`. No thread lock
 * is needed here — the edge runtime is single-threaded per request.
 *
 * All limits are optional: a tracker built with an empty config only
 * accumulates usage and never reports itself as exhausted.
 */
export class BudgetTracker {
  private _calls = 0;
  private _inputTokens = 0;
  private _outputTokens = 0;
  private _costUsd = 0;
  private _escalationCost = 0;
  // Null-prototype map: a model named "constructor", "toString" or
  // "__proto__" must be counted like any other key instead of colliding
  // with members inherited from Object.prototype.
  private readonly _modelsUsed: Record<string, number> = Object.create(
    null,
  ) as Record<string, number>;

  /**
   * @param config Budget limits and escalation settings; every field is optional.
   * @param model Default model used when a method is called without one.
   */
  constructor(
    private readonly config: BudgetConfig = {},
    public readonly model: string = "gpt-4o-mini",
  ) {}

  // ──────────────────────────────────────────────────────────
  // Cost estimation
  // ──────────────────────────────────────────────────────────

  /**
   * Estimate the cost of a hypothetical call (USD).
   *
   * @param inputTokens Number of prompt tokens.
   * @param outputTokens Number of completion tokens.
   * @param model Model to price against; defaults to the tracker's model.
   * @returns Cost in USD, using the default rates for unknown models.
   */
  estimateCost(
    inputTokens: number,
    outputTokens: number,
    model?: string,
  ): number {
    const rates = this.pricingFor(model ?? this.model);
    return (
      (inputTokens / 1_000_000) * rates.input +
      (outputTokens / 1_000_000) * rates.output
    );
  }

  /**
   * Resolve the rates for a model, falling back to the default rates.
   *
   * Only own keys of the pricing table count as known models. A bare
   * `PRICING[model]` would return inherited members for names such as
   * "constructor", producing a NaN cost that defeats every limit check.
   */
  private pricingFor(model: string): {
    readonly input: number;
    readonly output: number;
  } {
    const known = Object.prototype.hasOwnProperty.call(PRICING, model)
      ? PRICING[model]
      : undefined;
    return known ?? DEFAULT_PRICING;
  }

  // ──────────────────────────────────────────────────────────
  // Recording usage
  // ──────────────────────────────────────────────────────────

  /**
   * Record usage from a completed API call.
   *
   * Updates the call count, token totals, total cost and per-model call
   * count. Cost of calls made with the configured escalation model is also
   * added to the escalation sub-budget consulted by `selectModel`.
   */
  record(inputTokens: number, outputTokens: number, model?: string): void {
    const usedModel = model ?? this.model;
    const cost = this.estimateCost(inputTokens, outputTokens, usedModel);

    this._calls += 1;
    this._inputTokens += inputTokens;
    this._outputTokens += outputTokens;
    this._costUsd += cost;
    this._modelsUsed[usedModel] = (this._modelsUsed[usedModel] ?? 0) + 1;

    const { escalationModel } = this.config;
    if (escalationModel && usedModel === escalationModel) {
      this._escalationCost += cost;
    }
  }

  // ──────────────────────────────────────────────────────────
  // Budget checks
  // ──────────────────────────────────────────────────────────

  /**
   * Return true if another call can proceed without exceeding the budget.
   * If `estimatedCost` is provided, also checks that the projected total
   * stays at or under `maxCostUsd`.
   */
  canProceed(estimatedCost?: number): boolean {
    if (this.exhausted) return false;

    const maxCost = this.config.maxCostUsd;
    if (
      estimatedCost !== undefined &&
      maxCost !== undefined &&
      this._costUsd + estimatedCost > maxCost
    ) {
      return false;
    }
    return true;
  }

  /**
   * Estimate whether a batch of a given token size can be sent.
   * Mirrors Python's `can_send(estimated_tokens)`.
   *
   * The estimate prices `estimatedTokens` as input tokens of the tracker's
   * default model, with no output tokens.
   */
  canSend(estimatedTokens: number): boolean {
    if (!this.canProceed()) return false;

    const maxCost = this.config.maxCostUsd;
    if (maxCost === undefined) return true;

    const projected = this.estimateCost(estimatedTokens, 0, this.model);
    return this._costUsd + projected <= maxCost;
  }

  /**
   * Pick a model based on a pair score and escalation config.
   *
   * Returns `escalationModel` when the score lies inside the inclusive
   * escalation band and the escalation sub-budget has not been used up;
   * otherwise returns `defaultModel`. The sub-budget is only enforced when
   * both `maxCostUsd` and `escalationBudgetPct` are configured.
   */
  selectModel(pairScore: number, defaultModel: string): string {
    const { escalationModel, escalationBand, escalationBudgetPct, maxCostUsd } =
      this.config;

    if (!escalationModel) return defaultModel;
    if (escalationBand === undefined || escalationBand.length < 2) {
      return defaultModel;
    }

    const [lo, hi] = escalationBand;
    if (lo === undefined || hi === undefined) return defaultModel;
    if (pairScore < lo || pairScore > hi) return defaultModel;

    if (maxCostUsd !== undefined && escalationBudgetPct !== undefined) {
      // escalationBudgetPct is divided by 100 here, i.e. treated as a
      // percentage of the overall cost limit.
      const maxEscalation = maxCostUsd * (escalationBudgetPct / 100);
      if (this._escalationCost >= maxEscalation) return defaultModel;
    }
    return escalationModel;
  }

  // ──────────────────────────────────────────────────────────
  // Accessors
  // ──────────────────────────────────────────────────────────

  /** Accumulated spend in USD, rounded to six decimal places. */
  get costUsd(): number {
    return Math.round(this._costUsd * 1e6) / 1e6;
  }

  /** Number of calls recorded so far. */
  get calls(): number {
    return this._calls;
  }

  /** Total input tokens recorded so far. */
  get inputTokens(): number {
    return this._inputTokens;
  }

  /** Total output tokens recorded so far. */
  get outputTokens(): number {
    return this._outputTokens;
  }

  /** True once the cost limit or the call limit has been reached. */
  get exhausted(): boolean {
    const { maxCostUsd, maxCalls } = this.config;
    if (maxCostUsd !== undefined && this._costUsd >= maxCostUsd) return true;
    if (maxCalls !== undefined && this._calls >= maxCalls) return true;
    return false;
  }

  /**
   * Return a snapshot of the current budget state.
   *
   * `pctUsed` is measured against the cost limit when one is configured,
   * otherwise against the call limit, and is 0 when no limit is set. A
   * limit of zero or less counts as fully used, so `pctUsed` agrees with
   * `exhausted` instead of reporting 0%.
   */
  snapshot(): BudgetSnapshot {
    const { maxCalls, maxCostUsd: maxCost } = this.config;

    const remainingCalls =
      maxCalls !== undefined ? Math.max(0, maxCalls - this._calls) : null;
    const remainingUsd =
      maxCost !== undefined ? Math.max(0, maxCost - this._costUsd) : null;

    let pctUsed = 0;
    if (maxCost !== undefined) {
      pctUsed = maxCost > 0 ? Math.min(100, (this._costUsd / maxCost) * 100) : 100;
    } else if (maxCalls !== undefined) {
      pctUsed = maxCalls > 0 ? Math.min(100, (this._calls / maxCalls) * 100) : 100;
    }

    return {
      calls: this._calls,
      inputTokens: this._inputTokens,
      outputTokens: this._outputTokens,
      costUsd: this.costUsd,
      model: this.model,
      // Spread copies the counts into a fresh ordinary object, so callers
      // never hold a reference to the tracker's internal map.
      modelsUsed: { ...this._modelsUsed },
      remainingCalls,
      remainingUsd:
        remainingUsd === null ? null : Math.round(remainingUsd * 1e6) / 1e6,
      pctUsed: Math.round(pctUsed * 10) / 10,
      exhausted: this.exhausted,
    };
  }
}
239
+
240
+ // ---------------------------------------------------------------------------
241
+ // Token counting
242
+ // ---------------------------------------------------------------------------
243
+
244
/**
 * Cheap token estimate for a piece of text.
 *
 * Uses the rule of thumb of ~4 chars per token for English text, rounding
 * up. Falsy input (such as the empty string) counts as zero tokens.
 */
export function countTokensApprox(text: string): number {
  const charsPerToken = 4;
  return text ? Math.ceil(text.length / charsPerToken) : 0;
}
252
+
253
/**
 * Expose the model pricing table for inspection and tests.
 *
 * The returned value is the module's own table, typed as read-only; it is
 * not a copy.
 */
export function getPricing(): Readonly<
  Record<string, { readonly input: number; readonly output: number }>
> {
  const table = PRICING;
  return table;
}