fullstackgtm 0.22.0 → 0.23.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/enrich.ts ADDED
@@ -0,0 +1,1016 @@
1
+ import { mkdirSync, readFileSync, readdirSync } from "node:fs";
2
+ import { join } from "node:path";
3
+ import { credentialsDir, ensureSecureHomeDir, writeSecureFile } from "./credentials.ts";
4
+ import { HUBSPOT_DEFAULT_FIELD_MAPPINGS } from "./mappings.ts";
5
+ import type {
6
+ CanonicalGtmSnapshot,
7
+ GtmEvidence,
8
+ PatchOperation,
9
+ PatchPlan,
10
+ } from "./types.ts";
11
+
12
+ /**
13
+ * The enrich layer: governed append/refresh of third-party data into the CRM.
14
+ *
15
+ * Every enrichment vendor ships fire-and-forget writeback — data lands without
16
+ * a diff, without approval, over whatever a human typed. This layer inverts
17
+ * that: a source (Apollo pull, Clay ingest) feeds a deterministic matcher,
18
+ * the matcher feeds a fill-blanks-only patch plan, and the plan goes through
19
+ * the existing dry-run → approval → apply contract. Every proposed value is
20
+ * traceable to the source payload that produced it (`GtmEvidence` on the
21
+ * plan), and every write carries a `beforeValue` for apply-time
22
+ * compare-and-set.
23
+ *
24
+ * State lives in a profile-scoped, append-only run store
25
+ * (`~/.fullstackgtm/profiles/<profile>/enrich/runs/`) that is checkpoint,
26
+ * staleness ledger, and observability surface in one. The CLI never writes
27
+ * `fsgtm_enriched_at`-style custom properties into the customer's portal.
28
+ *
29
+ * Recurring execution belongs to the horizontal scheduler (docs/schedule.md);
30
+ * enrich owns no cron logic.
31
+ */
32
+
33
+ // ---------------------------------------------------------------------------
34
+ // Config: enrich.config.json
35
+
36
/** CRM object kinds that enrichment can target. */
export type EnrichObjectType = "company" | "contact";

/** How a source delivers data: pulled over an API, or staged through ingest. */
export type EnrichSourceKind = "api" | "ingest";

/** Declaration of one enrichment source, keyed by source id in `EnrichConfig.sources`. */
export type EnrichSourceConfig = {
  kind: EnrichSourceKind;
  /** Staging format for ingest sources: csv (column headers) or json (dotted paths). */
  format?: "csv" | "json";
};

/** What to do when a match key hits more than one CRM record. */
export type EnrichAmbiguousPolicy = "skip" | "suggest";

/** Matching rules for one object type. */
export type EnrichMatchConfig = {
  /** Match keys in priority order; each is evaluated against the snapshot. */
  keys: string[];
  /** Behavior on a multi-hit; treated as "skip" when omitted. */
  onAmbiguous?: EnrichAmbiguousPolicy;
};

/** Mapping of one CRM field to the source locations that can fill it. */
export type EnrichFieldConfig = {
  /** Target CRM property: a canonical field name or a default HubSpot property name. */
  crm: string;
  /** Per source id: dotted JSON path (api/json) or column header (ingest csv). */
  from: Record<string, string>;
  /** Opt-in for `enrich refresh`; without it the field is set once and never revisited. */
  refresh?: boolean;
  /** Refresh staleness window in days; falls back to policy.defaultStaleDays. */
  staleDays?: number;
  /** Per-field conflict policy override. The MVP accepts only "never". */
  policy?: "never";
};

/** Conflict policy shared by every field that has no override. */
export type EnrichPolicyConfig = {
  /** Rung on the conflict policy ladder. The MVP ships "never" (fill blanks only). */
  overwrite: "never";
  defaultStaleDays?: number;
};

/** Shape of a validated enrich.config.json. */
export type EnrichConfig = {
  sources: Record<string, EnrichSourceConfig>;
  match: Partial<Record<EnrichObjectType, EnrichMatchConfig>>;
  fields: Partial<Record<EnrichObjectType, EnrichFieldConfig[]>>;
  policy: EnrichPolicyConfig;
};
80
+
81
/** Conventional file name of the enrich config (named in the missing-config error). */
export const ENRICH_CONFIG_FILE_NAME = "enrich.config.json";

/** Staleness window, in days, used when neither the field nor the policy sets one. */
export const DEFAULT_STALE_DAYS = 90;

/** Object types accepted as keys of `match` and `fields`. */
const OBJECT_TYPES: EnrichObjectType[] = ["company", "contact"];

/** Per object type, the match keys the matcher can read off canonical snapshot records. */
const MATCH_KEYS: Record<EnrichObjectType, string[]> = {
  company: ["domain", "name"],
  contact: ["email", "name"],
};

/** Source ids the MVP is able to pull from over an API. */
export const SUPPORTED_API_SOURCES = ["apollo"];

/**
 * Canonical snapshot fields that enrich is allowed to target.
 *
 * The fill-blanks check reads the current value from the canonical snapshot,
 * so a field is accepted only when it has a canonical home. Config entries may
 * also use the HubSpot property spelling (`"crm": "numberofemployees"` and
 * `"crm": "employeeCount"` resolve to the same field); anything else is
 * rejected with the accepted names listed in the error.
 */
const CANONICAL_FIELDS: Record<EnrichObjectType, string[]> = {
  company: ["name", "domain", "industry", "employeeCount", "annualRevenue"],
  contact: ["firstName", "lastName", "email", "phone", "title"],
};

/** HubSpot property name → canonical field name, derived from the default mappings. */
const PROVIDER_FIELD_ALIASES: Record<EnrichObjectType, Record<string, string>> = {
  company: invertMapping(HUBSPOT_DEFAULT_FIELD_MAPPINGS.accounts),
  contact: invertMapping(HUBSPOT_DEFAULT_FIELD_MAPPINGS.contacts),
};
113
+
114
/**
 * Flip a canonical → provider mapping into provider → canonical.
 * When two canonical names share a provider name, the later entry wins.
 */
function invertMapping(mapping: Record<string, string>): Record<string, string> {
  const byProvider: Record<string, string> = {};
  Object.keys(mapping).forEach((canonicalName) => {
    const providerName = mapping[canonicalName];
    byProvider[providerName] = canonicalName;
  });
  return byProvider;
}
119
+
120
/**
 * Map a config `crm` name onto the canonical snapshot field it refers to.
 *
 * A canonical name is returned unchanged; a HubSpot property spelling is
 * translated through the alias table.
 *
 * @throws Error listing the accepted names when `name` resolves to nothing.
 */
export function resolveCrmField(objectType: EnrichObjectType, name: string): string {
  const accepted = CANONICAL_FIELDS[objectType];
  if (accepted.includes(name)) return name;

  const aliases = PROVIDER_FIELD_ALIASES[objectType];
  const viaAlias = aliases[name];
  if (viaAlias && accepted.includes(viaAlias)) return viaAlias;

  const message =
    `enrich config: unknown ${objectType} field "${name}". Accepted canonical fields: ` +
    `${accepted.join(", ")} (HubSpot property spellings like ` +
    `${Object.keys(aliases).join(", ")} also resolve).`;
  throw new Error(message);
}
131
+
132
/** Abort config validation by throwing an error with the shared "enrich config: " prefix. */
function fail(message: string): never {
  const prefixed = `enrich config: ${message}`;
  throw new Error(prefixed);
}
135
+
136
/**
 * Parse the text of an enrich config and validate it strictly, up front
 * (the 0.18 lesson: a config crash mid-run is worse than a refused config).
 *
 * Sections are checked in the order sources → policy → match → fields, and
 * the first problem found aborts with an error naming the offending entry
 * and the accepted values.
 *
 * @param raw Text of the config file (expected to be JSON).
 * @returns The parsed object, typed as `EnrichConfig` once every check passed.
 * @throws Error prefixed with "enrich config: " on the first violation.
 */
export function parseEnrichConfig(raw: string): EnrichConfig {
  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch (error) {
    fail(`not valid JSON (${error instanceof Error ? error.message : String(error)})`);
  }
  if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
    fail("expected a JSON object with sources, match, fields, and policy");
  }
  const config = parsed as Partial<EnrichConfig>;

  // sources
  if (!config.sources || typeof config.sources !== "object" || Array.isArray(config.sources)) {
    fail('missing "sources" — declare at least one, e.g. { "apollo": { "kind": "api" } }');
  }
  const declaredSources = config.sources;
  const sourceIds = Object.keys(declaredSources);
  if (sourceIds.length === 0) fail('"sources" is empty — declare at least one source');
  for (const [id, source] of Object.entries(declaredSources)) {
    if (!source || typeof source !== "object") fail(`source "${id}" must be an object`);
    if (source.kind !== "api" && source.kind !== "ingest") {
      fail(`source "${id}": kind must be "api" or "ingest" (got ${JSON.stringify(source.kind)})`);
    }
    if (source.kind === "api" && !SUPPORTED_API_SOURCES.includes(id)) {
      fail(
        `api source "${id}" is not supported yet — MVP pulls from: ${SUPPORTED_API_SOURCES.join(", ")}. ` +
          'Push-style sources stage data via `enrich ingest` with kind "ingest".',
      );
    }
    if (source.format !== undefined && source.format !== "csv" && source.format !== "json") {
      fail(`source "${id}": format must be "csv" or "json" (got ${JSON.stringify(source.format)})`);
    }
  }

  // policy
  if (!config.policy || typeof config.policy !== "object") {
    fail('missing "policy" — e.g. { "overwrite": "never", "defaultStaleDays": 90 }');
  }
  // Read as unknown: the JSON may hold any value here, whatever the static type says.
  const overwrite: unknown = config.policy.overwrite;
  if (overwrite === "system-only" || overwrite === "always") {
    fail(
      `policy.overwrite "${overwrite}" is not yet implemented (phase 2 of the conflict ladder — ` +
        'it needs per-field property history). MVP supports only "never" (fill blanks).',
    );
  }
  if (overwrite !== "never") {
    fail(`policy.overwrite must be "never" (got ${JSON.stringify(overwrite)})`);
  }
  const defaultStaleDays = config.policy.defaultStaleDays;
  if (defaultStaleDays !== undefined && (!Number.isFinite(defaultStaleDays) || defaultStaleDays <= 0)) {
    fail(`policy.defaultStaleDays must be a positive number (got ${JSON.stringify(defaultStaleDays)})`);
  }

  // match
  if (!config.match || typeof config.match !== "object" || Array.isArray(config.match)) {
    fail('missing "match" — e.g. { "company": { "keys": ["domain", "name"] } }');
  }
  for (const [objectType, match] of Object.entries(config.match)) {
    if (!OBJECT_TYPES.includes(objectType as EnrichObjectType)) {
      fail(`match has unknown object type "${objectType}" (use: ${OBJECT_TYPES.join(", ")})`);
    }
    if (!match || !Array.isArray(match.keys) || match.keys.length === 0) {
      fail(`match.${objectType}: "keys" must be a non-empty ordered array`);
    }
    const known = MATCH_KEYS[objectType as EnrichObjectType];
    for (const key of match.keys) {
      if (!known.includes(key)) {
        fail(`match.${objectType}: unknown key "${key}" (supported: ${known.join(", ")})`);
      }
    }
    if (match.onAmbiguous !== undefined && match.onAmbiguous !== "skip" && match.onAmbiguous !== "suggest") {
      fail(`match.${objectType}: onAmbiguous must be "skip" or "suggest" (got ${JSON.stringify(match.onAmbiguous)})`);
    }
  }

  // fields
  if (!config.fields || typeof config.fields !== "object" || Array.isArray(config.fields)) {
    fail('missing "fields" — map CRM properties to source paths per object type');
  }
  let anyField = false;
  for (const [objectType, fields] of Object.entries(config.fields)) {
    if (!OBJECT_TYPES.includes(objectType as EnrichObjectType)) {
      fail(`fields has unknown object type "${objectType}" (use: ${OBJECT_TYPES.join(", ")})`);
    }
    if (!Array.isArray(fields)) fail(`fields.${objectType} must be an array`);
    if (!config.match[objectType as EnrichObjectType]) {
      fail(`fields.${objectType} is configured but match.${objectType} is missing — the matcher needs ordered keys`);
    }
    // Duplicates are detected on the resolved canonical name, so a canonical
    // spelling and its HubSpot alias count as the same mapping.
    const seen = new Set<string>();
    for (const field of fields) {
      anyField = true;
      if (!field || typeof field.crm !== "string" || field.crm.length === 0) {
        fail(`fields.${objectType}: every entry needs a "crm" property name`);
      }
      const canonical = resolveCrmField(objectType as EnrichObjectType, field.crm);
      if (seen.has(canonical)) fail(`fields.${objectType}: duplicate mapping for "${field.crm}"`);
      seen.add(canonical);
      if (!field.from || typeof field.from !== "object" || Object.keys(field.from).length === 0) {
        fail(`fields.${objectType}.${field.crm}: "from" must map at least one source to a path`);
      }
      for (const [sourceId, path] of Object.entries(field.from)) {
        // Own-property check: a plain truthy lookup would accept ids such as
        // "toString" or "constructor" that only exist on Object.prototype.
        if (!Object.prototype.hasOwnProperty.call(declaredSources, sourceId)) {
          fail(
            `fields.${objectType}.${field.crm}: "from" references undeclared source "${sourceId}" ` +
              `(declared: ${sourceIds.join(", ")})`,
          );
        }
        if (typeof path !== "string" || path.length === 0) {
          fail(`fields.${objectType}.${field.crm}: path for source "${sourceId}" must be a non-empty string`);
        }
      }
      if (field.staleDays !== undefined && (!Number.isFinite(field.staleDays) || field.staleDays <= 0)) {
        fail(`fields.${objectType}.${field.crm}: staleDays must be a positive number`);
      }
      if (field.refresh !== undefined && typeof field.refresh !== "boolean") {
        fail(`fields.${objectType}.${field.crm}: refresh must be true or false`);
      }
      if (field.policy !== undefined && field.policy !== "never") {
        fail(
          `fields.${objectType}.${field.crm}: per-field policy "${String(field.policy)}" is not yet ` +
            'implemented (phase 2 of the conflict ladder). MVP supports only "never".',
        );
      }
    }
  }
  if (!anyField) fail('"fields" maps nothing — add at least one field entry');

  return config as EnrichConfig;
}
270
+
271
/**
 * Read the enrich config at `path` and validate it with `parseEnrichConfig`.
 *
 * @param path Location of the config file.
 * @returns The validated config.
 * @throws Error with setup guidance when the file is missing, an error naming
 *   the underlying failure when the file exists but cannot be read, or the
 *   validation error raised by `parseEnrichConfig`.
 */
export function loadEnrichConfig(path: string): EnrichConfig {
  let raw: string;
  try {
    raw = readFileSync(path, "utf8");
  } catch (error) {
    const code = (error as { code?: unknown } | null | undefined)?.code;
    // Only a missing file deserves the "create one" guidance; permission
    // problems, directories, etc. must surface the real cause.
    if (typeof code === "string" && code !== "ENOENT" && code !== "ENOTDIR") {
      const detail = error instanceof Error ? error.message : String(error);
      throw new Error(`Cannot read enrich config at ${path}: ${detail}`);
    }
    throw new Error(
      `No enrich config at ${path}. Create ${ENRICH_CONFIG_FILE_NAME} (sources/match/fields/policy — ` +
        "see docs/enrich.md) or pass --config <path>.",
    );
  }
  return parseEnrichConfig(raw);
}
283
+
284
+ // ---------------------------------------------------------------------------
285
+ // CSV: minimal dependency-free RFC-4180-ish parser (quoted fields, embedded
286
+ // commas/newlines, "" escapes, CRLF). Header row maps columns to names.
287
+
288
/**
 * Parse CSV text into one record per data row, keyed by the header row.
 *
 * Supports quoted cells with embedded commas and newlines, `""` as an escaped
 * quote, and LF, CR or CRLF line endings. Lines with no content are skipped.
 * Header names are trimmed and blank headers are ignored; cell values are kept
 * as-is, and cells missing from a short row become "".
 *
 * @throws Error when a quoted cell is never closed.
 */
export function parseCsv(text: string): Array<Record<string, string>> {
  const table: string[][] = [];
  let cells: string[] = [];
  let buffer = "";
  let quoted = false;
  let sawContent = false;

  const endCell = (): void => {
    cells.push(buffer);
    buffer = "";
  };
  const endRow = (): void => {
    endCell();
    table.push(cells);
    cells = [];
  };
  const rowHasData = (): boolean => buffer.length > 0 || cells.length > 0;

  let index = 0;
  while (index < text.length) {
    const ch = text[index];
    index += 1; // `index` now points at the character after `ch`

    if (quoted) {
      if (ch !== '"') {
        buffer += ch;
      } else if (text[index] === '"') {
        // Doubled quote inside a quoted cell is a literal quote.
        buffer += '"';
        index += 1;
      } else {
        quoted = false;
      }
      continue;
    }

    switch (ch) {
      case '"':
        quoted = true;
        sawContent = true;
        break;
      case ",":
        endCell();
        sawContent = true;
        break;
      case "\r":
      case "\n":
        // Consume the LF of a CRLF pair so it does not count as a second break.
        if (ch === "\r" && text[index] === "\n") index += 1;
        // Empty lines (including the trailing newline) produce no row.
        if (rowHasData()) endRow();
        break;
      default:
        buffer += ch;
        sawContent = true;
    }
  }

  if (quoted) throw new Error("CSV parse error: unterminated quoted field");
  if (rowHasData()) endRow();
  if (!sawContent || table.length === 0) return [];

  const [headerRow, ...dataRows] = table;
  const headers = headerRow.map((name) => name.trim());
  const records: Array<Record<string, string>> = [];
  for (const dataRow of dataRows) {
    const record: Record<string, string> = {};
    for (let column = 0; column < headers.length; column += 1) {
      const header = headers[column];
      if (header) record[header] = dataRow[column] ?? "";
    }
    records.push(record);
  }
  return records;
}
352
+
353
+ // ---------------------------------------------------------------------------
354
+ // Source records
355
+
356
/** One record delivered by an enrichment source, ready for matching. */
export type EnrichSourceRecord = {
  /** Source-scoped id, e.g. "apollo:org_abc" or "clay:row-3"; recorded on stamps as sourceRecordId. */
  id: string;
  objectType: EnrichObjectType;
  /** Raw match-key values by key name, as extracted by the source adapter. */
  keys: Record<string, string | undefined>;
  /** Untouched source payload; field paths and evidence excerpts are read from it. */
  payload: Record<string, unknown>;
};
365
+
366
/**
 * Read a value out of a source payload.
 *
 * The whole `path` is first tried as an exact key (CSV headers may contain
 * dots); otherwise it is walked as a dotted path through nested plain
 * objects. Arrays are not traversed.
 *
 * Only own properties are consulted, so names inherited from
 * `Object.prototype` ("toString", "constructor", …) read as missing instead
 * of yielding a built-in function.
 *
 * @param payload Source record payload.
 * @param path Exact key or dotted path.
 * @returns The value found, or `undefined` when any step is missing.
 */
export function sourceValueAt(payload: Record<string, unknown>, path: string): unknown {
  const hasOwn = (target: object, key: string): boolean =>
    Object.prototype.hasOwnProperty.call(target, key);

  if (hasOwn(payload, path)) return payload[path];

  let current: unknown = payload;
  for (const segment of path.split(".")) {
    if (!current || typeof current !== "object" || Array.isArray(current)) return undefined;
    if (!hasOwn(current, segment)) return undefined;
    current = (current as Record<string, unknown>)[segment];
  }
  return current;
}
376
+
377
/**
 * Look up a match-key value in an ingest row.
 *
 * The first header equal to `key` after trimming and lower-casing wins
 * ("Email" matches key "email"); when no header matches, `key` is read via
 * `sourceValueAt`. Blank or non-scalar values yield `undefined`.
 */
export function ingestKeyValue(row: Record<string, unknown>, key: string): string | undefined {
  const wanted = key.toLowerCase();
  const matchingHeader = Object.keys(row).find(
    (candidate) => candidate.trim().toLowerCase() === wanted,
  );
  const rawValue = matchingHeader !== undefined ? row[matchingHeader] : sourceValueAt(row, key);
  const text = valueToString(rawValue);
  return text === "" ? undefined : text;
}
389
+
390
/**
 * Render a scalar as text: strings are trimmed, numbers and booleans are
 * stringified, and everything else (null, undefined, objects, …) becomes "".
 */
function valueToString(value: unknown): string {
  switch (typeof value) {
    case "string":
      return value.trim();
    case "number":
    case "boolean":
      return String(value);
    default:
      return "";
  }
}
396
+
397
+ // ---------------------------------------------------------------------------
398
+ // Matching: ordered keys, unique-hit-wins, zero-hits-next-key,
399
+ // multi-hit → onAmbiguous. Ambiguity is surfaced, never resolved by coin flip.
400
+
401
/**
 * Result of matching one source record against the snapshot: exactly one CRM
 * record ("matched"), none on any key ("unmatched"), or several on the same
 * key ("ambiguous", with every candidate id listed).
 */
export type MatchOutcome =
  | { status: "matched"; recordId: string; matchedKey: string }
  | { status: "unmatched" }
  | { status: "ambiguous"; key: string; candidateIds: string[] };
405
+
406
/**
 * Normalize a match-key value for comparison.
 *
 * Every key is stringified, trimmed and lower-cased. "domain" values
 * additionally lose a leading http(s) scheme, a leading "www." and everything
 * from the first "/" on; all other keys have whitespace runs collapsed to a
 * single space. Blank input yields "".
 */
function normalizeKeyValue(key: string, value: unknown): string {
  const lowered = valueToString(value).toLowerCase();
  if (lowered === "") return "";
  if (key !== "domain") return lowered.replace(/\s+/g, " ");

  const withoutScheme = lowered.replace(/^https?:\/\//, "");
  const withoutWww = withoutScheme.replace(/^www\./, "");
  return withoutWww.replace(/\/.*$/, "");
}
417
+
418
/**
 * Normalized value of a match key as read from a CRM snapshot record.
 *
 * Companies expose "domain" and "name"; contacts expose "email" and "name",
 * where the name is first and last name joined by a space. Any other key
 * yields "".
 */
function crmKeyValue(
  objectType: EnrichObjectType,
  record: { name?: string; domain?: string; email?: string; firstName?: string; lastName?: string },
  key: string,
): string {
  if (objectType === "company") {
    switch (key) {
      case "domain":
        return normalizeKeyValue("domain", record.domain);
      case "name":
        return normalizeKeyValue("name", record.name);
      default:
        return "";
    }
  }

  switch (key) {
    case "email":
      return normalizeKeyValue("email", record.email);
    case "name": {
      const first = record.firstName ?? "";
      const last = record.lastName ?? "";
      return normalizeKeyValue("name", `${first} ${last}`.trim());
    }
    default:
      return "";
  }
}
434
+
435
/**
 * Match one source record against the snapshot using ordered keys.
 *
 * Keys are tried in order. A key with no source value, or with zero CRM hits,
 * falls through to the next key. The first key with hits decides: one hit is
 * a match, several hits are reported as ambiguous and never resolved by
 * picking one.
 *
 * @param snapshot Canonical snapshot; accounts are searched for companies, contacts otherwise.
 * @param objectType Kind of record being matched.
 * @param keys Match keys in priority order.
 * @param sourceKeys Raw key values from the source record.
 */
export function matchSourceRecord(
  snapshot: CanonicalGtmSnapshot,
  objectType: EnrichObjectType,
  keys: string[],
  sourceKeys: Record<string, string | undefined>,
): MatchOutcome {
  const candidates: Array<Record<string, unknown> & { id: string }> =
    objectType === "company" ? snapshot.accounts : snapshot.contacts;

  for (const key of keys) {
    const target = normalizeKeyValue(key, sourceKeys[key]);
    if (target === "") continue;

    const hitIds: string[] = [];
    for (const candidate of candidates) {
      if (crmKeyValue(objectType, candidate as never, key) === target) hitIds.push(candidate.id);
    }

    if (hitIds.length === 0) continue; // nothing on this key: try the next one
    if (hitIds.length === 1) return { status: "matched", recordId: hitIds[0], matchedKey: key };
    return { status: "ambiguous", key, candidateIds: hitIds };
  }
  return { status: "unmatched" };
}
457
+
458
+ // ---------------------------------------------------------------------------
459
+ // Plan building
460
+
461
// FNV-1a over UTF-16 code units, rendered as 8 lowercase hex digits. Mirrors
// stableHash in rules.ts; duplicated so enrich.ts stays importable without
// pulling the audit engine (the market.ts precedent).
function fnv1a(value: string): string {
  const OFFSET_BASIS = 0x811c9dc5;
  const PRIME = 0x01000193;
  let digest = OFFSET_BASIS;
  for (let index = 0; index < value.length; index += 1) {
    // Math.imul keeps the multiplication in 32-bit integer space.
    digest = Math.imul(digest ^ value.charCodeAt(index), PRIME);
  }
  const unsigned = digest >>> 0;
  return unsigned.toString(16).padStart(8, "0");
}
471
+
472
/** Plan flavor: fill blanks ("append") or re-check stamped fields ("refresh"). */
export type EnrichMode = "append" | "refresh";

/** Tallies reported for one plan build. */
export type EnrichCounts = {
  fetched: number;
  matched: number;
  unmatched: number;
  ambiguous: number;
  opsEmitted: number;
};

/** Ledger entry recording that enrich handled one (record, field) cell. */
export type EnrichStamp = {
  objectType: EnrichObjectType;
  objectId: string;
  /** Canonical field name. */
  field: string;
  enrichedAt: string;
  sourceRecordId: string;
  /** Source value when the stamp was taken (observability for refresh change detection). */
  value?: unknown;
};

/** A source record that hit several CRM records on the same key. */
export type EnrichAmbiguity = {
  sourceRecordId: string;
  key: string;
  candidateIds: string[];
};

/** One (record, field) cell that refresh is allowed to touch. */
export type EnrichWorkItem = {
  objectType: EnrichObjectType;
  objectId: string;
  /** Canonical field name. */
  field: string;
};

/** Inputs of `buildEnrichPlan`. */
export type BuildEnrichPlanOptions = {
  config: EnrichConfig;
  source: string;
  mode: EnrichMode;
  snapshot: CanonicalGtmSnapshot;
  records: EnrichSourceRecord[];
  /**
   * Refresh only: the stale (record, field) cells computed from run-store
   * stamps. Refresh proposes writes for these cells and no others — they are
   * the fields the ledger proves enrich itself stamped — so policy "never"
   * still never overwrites a value enrich did not put there.
   */
  workSet?: EnrichWorkItem[];
  now?: () => Date;
  runLabel: string;
};

/** Everything `buildEnrichPlan` produces for one run. */
export type EnrichPlanResult = {
  plan: PatchPlan;
  counts: EnrichCounts;
  stamps: EnrichStamp[];
  ambiguities: EnrichAmbiguity[];
  unmatchedSourceIds: string[];
};

/** Placeholder afterValue on operations that still need a human to pick the record. */
const PLACEHOLDER_RECORD_SELECTION = "requires_human_record_selection";
532
+
533
/** Translate an enrich object type into the object type used on patch operations. */
function canonicalObjectType(objectType: EnrichObjectType): "account" | "contact" {
  if (objectType === "company") return "account";
  return "contact";
}
536
+
537
/**
 * Current value of a canonical field on a snapshot record.
 *
 * @returns The field value of the first record whose id equals `objectId`,
 *   or `undefined` when no such record exists.
 */
function crmFieldValue(
  snapshot: CanonicalGtmSnapshot,
  objectType: EnrichObjectType,
  objectId: string,
  field: string,
): unknown {
  const pool = objectType === "company" ? snapshot.accounts : snapshot.contacts;
  for (const entry of pool) {
    if (entry.id === objectId) {
      return (entry as unknown as Record<string, unknown>)[field];
    }
  }
  return undefined;
}
547
+
548
/** True for undefined, null, and strings that are blank after trimming. */
function isEmptyValue(value: unknown): boolean {
  return value === undefined || value === null || (typeof value === "string" && value.trim() === "");
}

/**
 * Compare a source value with a CRM value.
 *
 * Two empty values are equal, and an empty value never equals a non-empty
 * one. Otherwise, when either side is a number both are compared numerically;
 * everything else compares as trimmed strings.
 *
 * The empty check runs before the numeric comparison on purpose:
 * `Number(null)` and `Number("")` are both 0, so without it a source value of
 * 0 would be reported as unchanged against a blank CRM field.
 */
function sameValue(a: unknown, b: unknown): boolean {
  const aEmpty = isEmptyValue(a);
  const bEmpty = isEmptyValue(b);
  if (aEmpty || bEmpty) return aEmpty && bEmpty;
  if (typeof a === "number" || typeof b === "number") {
    return Number(a) === Number(b);
  }
  return valueToString(a) === valueToString(b);
}
560
+
561
/** Human-readable label for a source record: its name, domain or email key, else its id. */
function describeSourceRecord(record: EnrichSourceRecord): string {
  const { name, domain, email } = record.keys;
  return String(name ?? domain ?? email ?? record.id);
}
565
+
566
/**
 * Build the evidence entry that ties proposed values back to the source
 * payload they came from.
 *
 * The payload is serialized as JSON and cut to 1200 characters (with a
 * trailing ellipsis) when longer. The id depends only on the source and the
 * source record id, so the same record always yields the same evidence id.
 */
function evidenceFor(
  source: string,
  sourceKind: EnrichSourceKind,
  format: "csv" | "json" | undefined,
  record: EnrichSourceRecord,
  matchedKey: string | undefined,
  capturedAt: string,
): GtmEvidence {
  const MAX_EXCERPT_LENGTH = 1200;
  const serialized = JSON.stringify(record.payload);
  const text =
    serialized.length > MAX_EXCERPT_LENGTH ? `${serialized.slice(0, MAX_EXCERPT_LENGTH)}…` : serialized;

  // API pulls are filed as "web"; ingested data as "csv" or, for any other format, "manual".
  let sourceSystem: "web" | "csv" | "manual";
  if (sourceKind === "api") sourceSystem = "web";
  else if (format === "csv") sourceSystem = "csv";
  else sourceSystem = "manual";

  return {
    id: `ev_enr_${fnv1a(`${source}:${record.id}`)}`,
    sourceSystem,
    sourceObjectType: record.objectType,
    sourceObjectId: record.id,
    title: `${source} payload for ${describeSourceRecord(record)}`,
    text,
    capturedAt,
    metadata: { source, sourceRecordId: record.id, matchedKey: matchedKey ?? null },
  };
}
586
+
587
/**
 * The value enrich may actually write for a source value: numbers pass
 * through, strings and booleans become trimmed text, and anything that has no
 * scalar text form (null, undefined, blank strings, objects, arrays) yields
 * `undefined`, meaning "nothing to write".
 */
function writableSourceValue(sourceValue: unknown): string | number | undefined {
  if (typeof sourceValue === "number") return sourceValue;
  const text = valueToString(sourceValue);
  return text === "" ? undefined : text;
}

/**
 * Match source records against the snapshot and emit a patch plan under the
 * conflict policy.
 *
 * - Append fills blanks only: an operation is proposed when the source has a
 *   writable value and the CRM field is empty.
 * - Refresh looks only at cells in `options.workSet`. Every checked cell is
 *   re-stamped; an operation is proposed only when the source value is
 *   writable and differs from the current CRM value. `beforeValue` carries
 *   the current CRM value so apply-time compare-and-set rejects drifted records.
 * - Ambiguous matches are always recorded. With `onAmbiguous: "suggest"` a
 *   placeholder operation is emitted per blank candidate cell.
 *
 * Source values with no scalar form (objects, arrays) are treated like blank
 * values: they would otherwise be written as "", which for refresh amounts to
 * proposing that a populated field be cleared.
 *
 * NOTE(review): operation ids derive from source, object type, object id and
 * field only, so two source records resolving to the same CRM cell produce
 * operations with the same id — confirm the plan consumers tolerate that.
 *
 * @throws Error when `options.source` is not declared in the config.
 */
export function buildEnrichPlan(options: BuildEnrichPlanOptions): EnrichPlanResult {
  const { config, source, mode, snapshot, records, runLabel } = options;
  const nowIso = (options.now ?? (() => new Date()))().toISOString();
  const sourceConfig = config.sources[source];
  if (!sourceConfig) throw new Error(`enrich: source "${source}" is not declared in the config`);

  const workSet = options.workSet ?? [];
  const workKeys = new Set(workSet.map((item) => `${item.objectType}|${item.objectId}|${item.field}`));

  const operations: PatchOperation[] = [];
  const evidence: GtmEvidence[] = [];
  const stamps: EnrichStamp[] = [];
  const ambiguities: EnrichAmbiguity[] = [];
  const unmatchedSourceIds: string[] = [];
  const counts: EnrichCounts = { fetched: records.length, matched: 0, unmatched: 0, ambiguous: 0, opsEmitted: 0 };

  const operationId = (objectType: EnrichObjectType, objectId: string, field: string): string =>
    `op_enr_${fnv1a(`${source}:${objectType}:${objectId}:${field}`)}`;

  for (const record of records) {
    const match = config.match[record.objectType];
    const fields = (config.fields[record.objectType] ?? []).filter(
      (field) => field.from[source] !== undefined,
    );
    // Nothing configured for this object type/source: counted with the unmatched records.
    if (!match || fields.length === 0) {
      counts.unmatched += 1;
      unmatchedSourceIds.push(record.id);
      continue;
    }
    const outcome = matchSourceRecord(snapshot, record.objectType, match.keys, record.keys);

    if (outcome.status === "unmatched") {
      counts.unmatched += 1;
      unmatchedSourceIds.push(record.id);
      continue;
    }

    if (outcome.status === "ambiguous") {
      counts.ambiguous += 1;
      ambiguities.push({ sourceRecordId: record.id, key: outcome.key, candidateIds: outcome.candidateIds });
      if ((match.onAmbiguous ?? "skip") === "skip") continue;
      // onAmbiguous: suggest — emit placeholder operations (one per candidate
      // per field) so the existing suggest → plans approve --values-from /
      // --value chain resolves the record selection. Apply refuses to write
      // requires_human_* placeholders without an explicit value.
      const recordEvidence = evidenceFor(source, sourceConfig.kind, sourceConfig.format, record, undefined, nowIso);
      let emittedForRecord = false;
      for (const field of fields) {
        const sourceValue = sourceValueAt(record.payload, field.from[source]);
        // Blank or non-scalar: there is no value a human could approve.
        if (writableSourceValue(sourceValue) === undefined) continue;
        const canonicalField = resolveCrmField(record.objectType, field.crm);
        for (const candidateId of outcome.candidateIds) {
          const currentValue = crmFieldValue(snapshot, record.objectType, candidateId, canonicalField);
          if (!isEmptyValue(currentValue)) continue; // policy never: blanks only
          emittedForRecord = true;
          operations.push({
            id: operationId(record.objectType, candidateId, canonicalField),
            objectType: canonicalObjectType(record.objectType),
            objectId: candidateId,
            operation: "set_field",
            field: canonicalField,
            beforeValue: currentValue ?? null,
            afterValue: PLACEHOLDER_RECORD_SELECTION,
            reason:
              `${source} record "${describeSourceRecord(record)}" matched ${outcome.candidateIds.length} CRM ` +
              `records on ${outcome.key} (${outcome.candidateIds.join(", ")}). If ${candidateId} is the right ` +
              `record, approve with --value <opId>=${JSON.stringify(valueToString(sourceValue))}.`,
            sourceRuleOrPolicy: `enrich:${source}:${canonicalField}`,
            riskLevel: "medium",
            approvalRequired: true,
            rollback: "Clear the field (the before value was empty) if the selection was wrong.",
            evidenceIds: [recordEvidence.id],
          });
          counts.opsEmitted += 1;
        }
      }
      if (emittedForRecord) evidence.push(recordEvidence);
      continue;
    }

    // Matched.
    counts.matched += 1;
    const recordEvidence = evidenceFor(
      source,
      sourceConfig.kind,
      sourceConfig.format,
      record,
      outcome.matchedKey,
      nowIso,
    );
    let emittedForRecord = false;

    for (const field of fields) {
      const canonicalField = resolveCrmField(record.objectType, field.crm);
      const sourceValue = sourceValueAt(record.payload, field.from[source]);
      const proposedValue = writableSourceValue(sourceValue);
      const currentValue = crmFieldValue(snapshot, record.objectType, outcome.recordId, canonicalField);
      const cellKey = `${record.objectType}|${outcome.recordId}|${canonicalField}`;

      if (mode === "refresh") {
        // Refresh touches ONLY stamped, stale cells from the work set.
        if (!workKeys.has(cellKey)) continue;
        // Re-stamp every checked cell (changed or not): the staleness clock
        // resets because the source was actually consulted.
        stamps.push({
          objectType: record.objectType,
          objectId: outcome.recordId,
          field: canonicalField,
          enrichedAt: nowIso,
          sourceRecordId: record.id,
          value: sourceValue,
        });
        // Source went blank (or is not a scalar): never propose clearing.
        if (proposedValue === undefined) continue;
        if (sameValue(sourceValue, currentValue)) continue; // unchanged: no op
        emittedForRecord = true;
        operations.push({
          id: operationId(record.objectType, outcome.recordId, canonicalField),
          objectType: canonicalObjectType(record.objectType),
          objectId: outcome.recordId,
          operation: "set_field",
          field: canonicalField,
          beforeValue: currentValue ?? null,
          afterValue: proposedValue,
          reason:
            `${source} ${record.objectType} "${describeSourceRecord(record)}" (matched by ` +
            `${outcome.matchedKey}) reports a changed value for ${canonicalField}.`,
          sourceRuleOrPolicy: `enrich:${source}:${canonicalField}`,
          riskLevel: isEmptyValue(currentValue) ? "low" : "medium",
          approvalRequired: true,
          rollback: "Restore the before value if the refreshed value is wrong.",
          evidenceIds: [recordEvidence.id],
        });
        counts.opsEmitted += 1;
        continue;
      }

      // Append: fill blanks only (policy "never").
      if (proposedValue === undefined) continue;
      if (!isEmptyValue(currentValue)) continue;
      emittedForRecord = true;
      operations.push({
        id: operationId(record.objectType, outcome.recordId, canonicalField),
        objectType: canonicalObjectType(record.objectType),
        objectId: outcome.recordId,
        operation: "set_field",
        field: canonicalField,
        beforeValue: currentValue ?? null,
        afterValue: proposedValue,
        reason:
          `${source} ${record.objectType} "${describeSourceRecord(record)}" (matched by ` +
          `${outcome.matchedKey}) fills the blank ${canonicalField}.`,
        sourceRuleOrPolicy: `enrich:${source}:${canonicalField}`,
        riskLevel: "low",
        approvalRequired: true,
        rollback: "Clear the field (the before value was empty) if the enrichment is wrong.",
        evidenceIds: [recordEvidence.id],
      });
      counts.opsEmitted += 1;
      stamps.push({
        objectType: record.objectType,
        objectId: outcome.recordId,
        field: canonicalField,
        enrichedAt: nowIso,
        sourceRecordId: record.id,
        value: proposedValue,
      });
    }
    if (emittedForRecord) evidence.push(recordEvidence);
  }

  const plan: PatchPlan = {
    id: `patch_plan_enr_${fnv1a(`${source}:${mode}:${runLabel}:${nowIso}`)}`,
    title: `Enrichment ${mode} — ${source}`,
    createdAt: nowIso,
    status: operations.length > 0 ? "needs_approval" : "draft",
    dryRun: true,
    summary:
      `${counts.opsEmitted} proposed operation(s) from ${source} ${mode} (${counts.fetched} source ` +
      `record(s): ${counts.matched} matched, ${counts.unmatched} unmatched, ${counts.ambiguous} ambiguous). ` +
      `Conflict policy: ${config.policy.overwrite}.`,
    findings: [],
    evidence,
    operations,
  };

  return { plan, counts, stamps, ambiguities, unmatchedSourceIds };
}
777
+
778
+ // ---------------------------------------------------------------------------
779
+ // Staleness: compute the refresh work set from run-store stamps.
780
+
781
/**
 * Latest stamp per (objectType, objectId, field) across a source's runs.
 *
 * Runs that belong to a different source are ignored. "Latest" is decided
 * chronologically: both `enrichedAt` values are parsed with `Date.parse` and
 * compared as instants, so ISO timestamps that differ only in formatting
 * (with or without milliseconds, `Z` versus a numeric offset) still order
 * correctly. When either timestamp cannot be parsed the comparison falls back
 * to plain string ordering. On a tie the stamp encountered first is kept.
 *
 * @param runs   Runs to scan.
 * @param source Only runs whose `source` equals this value contribute stamps.
 * @returns Map keyed by `objectType|objectId|field` holding the newest stamp.
 */
export function latestStamps(runs: EnrichRun[], source: string): Map<string, EnrichStamp> {
  // True when `candidate` is strictly later than `current`.
  const isNewer = (candidate: string, current: string): boolean => {
    const candidateMs = Date.parse(candidate);
    const currentMs = Date.parse(current);
    if (Number.isFinite(candidateMs) && Number.isFinite(currentMs)) {
      return candidateMs > currentMs;
    }
    // Unparseable timestamp on either side: keep the lexicographic ordering.
    return candidate > current;
  };

  const latest = new Map<string, EnrichStamp>();
  for (const run of runs) {
    if (run.source !== source) continue;
    for (const stamp of run.stamps) {
      const key = `${stamp.objectType}|${stamp.objectId}|${stamp.field}`;
      const existing = latest.get(key);
      if (!existing || isNewer(stamp.enrichedAt, existing.enrichedAt)) latest.set(key, stamp);
    }
  }
  return latest;
}
794
+
795
/**
 * Staleness window, in days, for one CRM field of an object type.
 *
 * The first configured field entry whose resolved CRM field equals `field`
 * supplies `staleDays`; when there is no such entry, or it sets no
 * `staleDays`, the window falls back to `policy.defaultStaleDays` and then to
 * `DEFAULT_STALE_DAYS`.
 */
export function staleDaysFor(config: EnrichConfig, objectType: EnrichObjectType, field: string): number {
  const configured = config.fields[objectType] ?? [];
  for (const candidate of configured) {
    if (resolveCrmField(objectType, candidate.crm) !== field) continue;
    // Only the first matching entry is consulted.
    return candidate.staleDays ?? config.policy.defaultStaleDays ?? DEFAULT_STALE_DAYS;
  }
  return config.policy.defaultStaleDays ?? DEFAULT_STALE_DAYS;
}
801
+
802
/**
 * Compute the refresh work set: the (record, field) cells whose latest stamp
 * from `source` has aged past its staleness window.
 *
 * A cell qualifies only when the config has a field entry that resolves to
 * the stamped CRM field, maps from this source, and is marked `refresh`.
 * Window precedence: `options.staleDaysOverride` (--stale-days) → the entry's
 * `staleDays` → `policy.defaultStaleDays` → `DEFAULT_STALE_DAYS`.
 *
 * @param options.now               Clock override; defaults to the current time.
 * @param options.staleDaysOverride Window applied to every field when set.
 */
export function selectStaleWork(
  config: EnrichConfig,
  runs: EnrichRun[],
  source: string,
  options: { now?: () => Date; staleDaysOverride?: number } = {},
): EnrichWorkItem[] {
  const msPerDay = 86_400_000;
  const clock = options.now ?? (() => new Date());
  const nowMs = clock().getTime();

  const staleItems: EnrichWorkItem[] = [];
  latestStamps(runs, source).forEach((stamp) => {
    const { objectType, objectId, field, enrichedAt } = stamp;
    const fieldEntries = config.fields[objectType] ?? [];
    const entry = fieldEntries.find(
      (candidate) =>
        resolveCrmField(objectType, candidate.crm) === field && candidate.from[source] !== undefined,
    );
    // Not configured for this source, or configured but not refresh-eligible.
    if (!entry?.refresh) return;

    const windowDays =
      options.staleDaysOverride ?? entry.staleDays ?? config.policy.defaultStaleDays ?? DEFAULT_STALE_DAYS;
    const ageDays = (nowMs - Date.parse(enrichedAt)) / msPerDay;
    if (ageDays > windowDays) {
      staleItems.push({ objectType, objectId, field });
    }
  });
  return staleItems;
}
831
+
832
+ // ---------------------------------------------------------------------------
833
+ // Run store: append-only JSON files under the profile home, one per run label.
834
+ // Checkpoint (cursor), staleness ledger (stamps), and observability surface
835
+ // (enrich status) in one structure — mirrors the market observations store.
836
+
837
/** Mode recorded on a stored run: any enrich mode, or an ingest staging run. */
export type EnrichRunMode = EnrichMode | "ingest";

/** One persisted enrich run: checkpoint, stamp ledger, and status record. */
export type EnrichRun = {
  id: string;
  runLabel: string;
  source: string;
  mode: EnrichRunMode;
  startedAt: string;
  /** Stays null until the run finishes; `status` reports such a run as interrupted. */
  completedAt: string | null;
  /** Last processed pull key — where an interrupted pull resumes from. */
  cursor: string | null;
  counts: EnrichCounts;
  planIds: string[];
  stamps: EnrichStamp[];
  /** Ingest mode only: the staged source rows that append/refresh consume. */
  staged?: Record<string, unknown>[];
  /** Ingest mode only: object type of the staged rows. */
  stagedObjectType?: EnrichObjectType;
  /**
   * Api pulls with --save: the source records fetched so far. Combined with
   * `cursor` this completes the checkpoint, so a resumed run replays payloads
   * that were already paid for rather than fetching them again.
   */
  pulled?: EnrichSourceRecord[];
  /** Api pulls with --save: pull keys for which the source returned no data. */
  missedKeys?: string[];
  /** Match collisions kept for review, candidate ids included. */
  ambiguities?: EnrichAmbiguity[];
};
867
+
868
/** Deterministic run id: `enr_` followed by the fnv1a hash of `source|runLabel`. */
export function enrichRunId(source: string, runLabel: string): string {
  const digest = fnv1a([source, runLabel].join("|"));
  return `enr_${digest}`;
}
871
+
872
/**
 * Directory holding the enrich run files: `<base>/enrich/runs`, where the
 * base is `baseDir` when given and `credentialsDir()` otherwise.
 */
export function enrichRunsDir(baseDir?: string): string {
  const root = baseDir ?? credentialsDir();
  return join(root, "enrich", "runs");
}
875
+
876
/** Persistence contract for enrich runs, addressed by run label. */
export interface EnrichRunStore {
  /** Store a brand-new run. Rejects when the label is already taken — runs are append-only. */
  append(run: EnrichRun): Promise<EnrichRun>;
  /** Rewrite an in-progress run in place (cursor checkpoint, finalization). */
  update(run: EnrichRun): Promise<EnrichRun>;
  /** Look a run up by label; resolves to null when there is none. */
  get(runLabel: string): Promise<EnrichRun | null>;
  /** Every stored run. */
  list(): Promise<EnrichRun[]>;
  /** The most recent run, optionally restricted to a source and/or mode; null when none match. */
  latest(filter?: { source?: string; mode?: EnrichRunMode }): Promise<EnrichRun | null>;
}
885
+
886
/**
 * File-backed run store: one `<runLabel>.json` file per run inside
 * `directory` (default: `enrichRunsDir()`).
 *
 * Reads are best-effort — a missing, unreadable, or unparseable run file is
 * reported as "no such run" by `get`, `list`, and `latest`. `append` is
 * stricter: it checks the file on disk rather than the parsed run, so a label
 * whose file exists but cannot be parsed is still refused instead of being
 * silently overwritten.
 *
 * @param directory Directory for the run files. When omitted, the default
 *   profile location is used and the home directory is secured before writing.
 * @throws (from store methods) `Invalid run label` when a label contains
 *   characters outside `[\w.-]`.
 */
export function createFileEnrichRunStore(directory?: string): EnrichRunStore {
  const dir = directory ?? enrichRunsDir();
  // Labels become file names, so restrict them to a path-safe alphabet.
  const validLabel = /^[\w.-]+$/;

  function fileFor(runLabel: string): string {
    if (!validLabel.test(runLabel)) throw new Error(`Invalid run label: ${runLabel}`);
    return join(dir, `${runLabel}.json`);
  }

  /** Best-effort read: any failure (bad label, missing file, bad JSON) yields null. */
  function read(runLabel: string): EnrichRun | null {
    try {
      return JSON.parse(readFileSync(fileFor(runLabel), "utf8")) as EnrichRun;
    } catch {
      return null;
    }
  }

  /**
   * Whether a run file is present on disk for this label, regardless of
   * whether its content parses. Invalid labels and unexpected I/O errors
   * propagate; only "file not found" counts as absent.
   */
  function occupied(runLabel: string): boolean {
    const file = fileFor(runLabel);
    try {
      readFileSync(file);
      return true;
    } catch (error: unknown) {
      const missing =
        typeof error === "object" &&
        error !== null &&
        (error as { code?: unknown }).code === "ENOENT";
      if (missing) return false;
      throw error;
    }
  }

  function write(run: EnrichRun): EnrichRun {
    // Run files carry CRM record ids and source values; keep them owner-only
    // like plan files (and lock the home down even before any login).
    if (!directory) ensureSecureHomeDir();
    mkdirSync(dir, { recursive: true, mode: 0o700 });
    writeSecureFile(fileFor(run.runLabel), `${JSON.stringify(run, null, 2)}\n`);
    return run;
  }

  /** All parseable runs in the directory, oldest `startedAt` first. */
  function listRuns(): EnrichRun[] {
    let names: string[] = [];
    try {
      names = readdirSync(dir).filter((name) => name.endsWith(".json"));
    } catch {
      // No directory yet means no runs.
      return [];
    }
    return names
      .map((name) => read(name.slice(0, -".json".length)))
      .filter((run): run is EnrichRun => run !== null)
      .sort((a, b) => a.startedAt.localeCompare(b.startedAt));
  }

  return {
    async append(run) {
      if (occupied(run.runLabel)) {
        throw new Error(
          `Run "${run.runLabel}" already exists — enrich runs are append-only; use a new run label`,
        );
      }
      return write(run);
    },
    async update(run) {
      const existing = read(run.runLabel);
      if (!existing) throw new Error(`No enrich run "${run.runLabel}" to update`);
      if (existing.id !== run.id) {
        throw new Error(`Run "${run.runLabel}" belongs to a different run id (${existing.id})`);
      }
      return write(run);
    },
    async get(runLabel) {
      return read(runLabel);
    },
    async list() {
      return listRuns();
    },
    async latest(filter = {}) {
      const runs = listRuns().filter(
        (run) =>
          (filter.source === undefined || run.source === filter.source) &&
          (filter.mode === undefined || run.mode === filter.mode),
      );
      return runs.length ? runs[runs.length - 1] : null;
    },
  };
}
957
+
958
+ // ---------------------------------------------------------------------------
959
+ // Ingest staging helpers
960
+
961
/**
 * Work out which object type a batch of staged rows describes.
 *
 * Only the first row is sampled. An object type is a candidate when it has a
 * match config, at least one of its configured fields maps from `source`,
 * and at least one of its match keys yields a value on the sample row.
 * Exactly one candidate is returned; none, or more than one, is an error
 * that asks the caller for --objects.
 *
 * @throws Error when no object type, or more than one, fits the rows.
 */
export function inferIngestObjectType(
  config: EnrichConfig,
  source: string,
  rows: Array<Record<string, unknown>>,
): EnrichObjectType {
  const firstRow = rows[0] ?? {};

  const candidates = OBJECT_TYPES.filter((objectType) => {
    const matchConfig = config.match[objectType];
    const fieldEntries = config.fields[objectType] ?? [];
    const sourceFeedsType = fieldEntries.some((field) => field.from[source] !== undefined);
    if (matchConfig && sourceFeedsType) {
      return matchConfig.keys.some((key) => ingestKeyValue(firstRow, key) !== undefined);
    }
    return false;
  });

  if (candidates.length === 0) {
    throw new Error(
      `enrich ingest: cannot tell whether these rows are companies or contacts — no configured match key ` +
        `column found. Pass --objects companies|contacts, or add the key column to the export.`,
    );
  }
  if (candidates.length > 1) {
    throw new Error(
      `enrich ingest: rows carry match keys for ${candidates.join(" and ")} — pass --objects companies|contacts to disambiguate.`,
    );
  }
  return candidates[0];
}
989
+
990
/**
 * Convert the rows staged on an ingest run into source records for the
 * matcher.
 *
 * Each row becomes one record: its id is `<source>:<runLabel>:row-<n>` with
 * `n` counting from 1, its `keys` hold the value extracted for every
 * configured match key, and the untouched row is carried as `payload`.
 *
 * @throws Error when the run has no staged object type or no staged rows, or
 *   when the config has no match section for the staged object type.
 */
export function stagedSourceRecords(
  config: EnrichConfig,
  source: string,
  run: EnrichRun,
): EnrichSourceRecord[] {
  const stagedRows = run.staged ?? [];
  const objectType = run.stagedObjectType;
  if (!objectType || stagedRows.length === 0) {
    throw new Error(
      `enrich: run "${run.runLabel}" has no staged data — stage a Clay export first: ` +
        `fullstackgtm enrich ingest <file.csv|payload.json> --source ${source}`,
    );
  }

  const matchConfig = config.match[objectType];
  if (!matchConfig) throw new Error(`enrich: no match config for ${objectType}`);

  const records: EnrichSourceRecord[] = [];
  stagedRows.forEach((row, position) => {
    const keys: Record<string, string | undefined> = {};
    matchConfig.keys.forEach((key) => {
      keys[key] = ingestKeyValue(row, key);
    });
    const rowNumber = position + 1;
    records.push({
      id: `${source}:${run.runLabel}:row-${rowNumber}`,
      objectType,
      keys,
      payload: row,
    });
  });
  return records;
}