fullstackgtm 0.21.2 → 0.23.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
1
+ import type { CanonicalGtmSnapshot, PatchPlan } from "./types.ts";
2
+ export type DedupeOptions = { // options accepted by buildDedupePlan
3
+ objectType: "account" | "contact" | "deal"; // selects the snapshot collection (accounts / contacts / deals) that is deduplicated
4
+ /** identity key records are grouped by (normalized before grouping); valid combinations: account → domain | name, contact → email | name, deal → name */
5
+ key: "domain" | "email" | "name";
6
+ /** survivor selection — deterministic either way (default "richest"): "richest" keeps the record with the most populated data fields, "oldest" keeps the lowest id */
7
+ keep?: "richest" | "oldest";
8
+ reason?: string; // when set, replaces the generated per-operation reason text
9
+ /** refuse to build plans with more duplicate groups than this — one merge operation per group (default 500 operations) */
10
+ maxOperations?: number;
11
+ };
12
+ /** Normalize a record's identity key: "domain" is delegated to normalizeDomain, other keys are trimmed and lower-cased; undefined when the field is empty. */
13
+ export declare function dedupeKey(record: Record<string, unknown>, key: DedupeOptions["key"]): string | undefined;
14
+ export declare function buildDedupePlan(snapshot: CanonicalGtmSnapshot, options: DedupeOptions): PatchPlan; // builds a dry-run plan with one merge_records operation per duplicate group; throws an Error for an invalid objectType/key pair, an invalid keep value, or more groups than maxOperations
package/dist/dedupe.js ADDED
@@ -0,0 +1,140 @@
1
+ /**
2
+ * Governed duplicate cleanup: `dedupe` groups records by a normalized
3
+ * identity key (account domain, contact email, or name) and builds a
4
+ * dry-run PatchPlan of merge_records operations — one per duplicate group,
5
+ * with a DETERMINISTIC survivor. It NEVER writes — the plan flows through
6
+ * the same plans-approve → apply gate as every other plan.
7
+ *
8
+ * The merge contract matches the connectors (see mergeRecords in
9
+ * connectors/hubspot.ts): afterValue = the survivor id, beforeValue = the
10
+ * ids of EVERY record in the group (survivor included). Merges are
11
+ * IRREVERSIBLE on every provider that supports them, so every operation is
12
+ * riskLevel high and approvalRequired.
13
+ *
14
+ * Survivor selection ("--keep"):
15
+ * richest (default) the record with the most non-empty canonical data
16
+ * fields (bookkeeping fields like id/crmId/identities
17
+ * don't count); ties break to the lowest numeric id
18
+ * oldest the lowest numeric id (CRMs assign ids in creation
19
+ * order)
20
+ */
21
+ import { normalizeDomain } from "./merge.js";
22
+ import { stableHash } from "./rules.js";
23
+ const COLLECTIONS = { // object type → name of the snapshot collection holding its records; the plural names are also used in error messages, reasons, and the plan title/summary
24
+ account: "accounts",
25
+ contact: "contacts",
26
+ deal: "deals",
27
+ };
28
+ /** Which identity keys make sense per object type; buildDedupePlan throws for any key not listed for the requested type. */
29
+ const VALID_KEYS = {
30
+ account: ["domain", "name"],
31
+ contact: ["email", "name"],
32
+ deal: ["name"], // deals can only be grouped by name
33
+ };
34
/**
 * Fields that never count toward a record's richness. They reflect how the
 * sync machinery populated the record rather than how much real data it
 * carries, so letting them vote would allow plumbing to pick the survivor of
 * a merge.
 */
const NON_DATA_FIELDS = new Set(["id", "provider", "crmId", "identities", "raw", "provenance"]);
/**
 * Count the data fields of `record` that hold a value.
 *
 * A field counts when it is not a bookkeeping field and its value is
 * anything other than undefined, null, or the empty string (so 0, false,
 * and empty arrays/objects all count).
 */
function populatedDataFields(record) {
    let populated = 0;
    for (const field of Object.keys(record)) {
        if (NON_DATA_FIELDS.has(field)) {
            continue;
        }
        const value = record[field];
        const isBlank = value === undefined || value === null || value === "";
        if (!isBlank) {
            populated += 1;
        }
    }
    return populated;
}
43
/** Ids made only of decimal digits are compared by numeric value. */
const DECIMAL_ID_PATTERN = /^\d+$/;
/**
 * True when id `a` sorts strictly before id `b`.
 *
 * The ordering is a total order, so sorting with it gives the same result
 * regardless of input order:
 *   1. ids made only of decimal digits sort before every other id;
 *   2. two digit-only ids compare by numeric value, exactly (BigInt, so ids
 *      above 2^53 are not rounded); equal values such as "01" and "1" fall
 *      back to plain string order;
 *   3. two non-numeric ids compare by plain string order.
 *
 * Only digit-only strings are treated as numbers: "", " 7", "0x10" and "1e3"
 * are ordinary strings here, whereas Number() coercion would read them as
 * 0, 7, 16 and 1000.
 *
 * @param a first id, as a string
 * @param b second id, as a string
 * @returns true when `a` orders before `b`; false when it orders after or the ids are identical
 */
function idBefore(a, b) {
    const aIsNumeric = DECIMAL_ID_PATTERN.test(a);
    const bIsNumeric = DECIMAL_ID_PATTERN.test(b);
    if (aIsNumeric && bIsNumeric) {
        const left = BigInt(a);
        const right = BigInt(b);
        if (left !== right) {
            return left < right;
        }
        // same numeric value, different spelling (leading zeros): keep a stable tie-break
        return a < b;
    }
    if (aIsNumeric !== bIsNumeric) {
        // mixed pair: the numeric id always comes first, which keeps the order transitive
        return aIsNumeric;
    }
    return a < b;
}
52
/**
 * Normalize a record's identity key for duplicate grouping.
 *
 * - "domain" is delegated to normalizeDomain(record.domain).
 * - Any other key is read from the record, trimmed, and lower-cased.
 *
 * Only strings, finite numbers, and bigints can form a key. Objects, arrays,
 * booleans and the like yield undefined: stringifying them would give every
 * such record the same key (e.g. "[object object]") and so propose merging
 * records that have nothing in common.
 *
 * @param record the record to read the identity field from
 * @param key which identity field to normalize
 * @returns the normalized key, or undefined when the field is missing, blank, or not a scalar
 */
export function dedupeKey(record, key) {
    if (key === "domain")
        return normalizeDomain(record.domain);
    const raw = record[key];
    const isScalarKey = typeof raw === "string" ||
        typeof raw === "bigint" ||
        (typeof raw === "number" && Number.isFinite(raw));
    if (!isScalarKey)
        return undefined;
    const normalized = String(raw).trim().toLowerCase();
    return normalized || undefined;
}
62
/**
 * Group records by their normalized identity key and keep only real
 * duplicate groups (two or more distinct ids), in first-seen order.
 */
function groupDuplicates(records, identityKey) {
    const byKey = new Map();
    for (const record of records) {
        // A record without an id cannot be named in a merge; String(undefined)
        // would otherwise put the literal "undefined" into an irreversible operation.
        if (record.id === undefined || record.id === null || record.id === "")
            continue;
        const key = dedupeKey(record, identityKey);
        if (!key)
            continue; // records without the identity key cannot be duplicates by it
        let members = byKey.get(key);
        if (!members) {
            members = new Map();
            byKey.set(key, members);
        }
        // the same id listed twice is one record, not a duplicate of itself
        const id = String(record.id);
        if (!members.has(id))
            members.set(id, record);
    }
    const duplicates = new Map();
    for (const [key, members] of byKey) {
        if (members.size >= 2)
            duplicates.set(key, Array.from(members.values()));
    }
    return duplicates;
}
/**
 * Pick the surviving record of a group: the richest record (ties to the
 * lowest id) for "richest", or simply the lowest id for "oldest".
 */
function pickSurvivor(members, keep) {
    // richness is computed once per member instead of once per comparison
    const richnessOf = (record) => (keep === "richest" ? populatedDataFields(record) : 0);
    let survivor = members[0];
    let survivorRichness = richnessOf(survivor);
    for (const candidate of members.slice(1)) {
        const candidateRichness = richnessOf(candidate);
        const candidateWins = candidateRichness !== survivorRichness
            ? candidateRichness > survivorRichness
            : idBefore(String(candidate.id), String(survivor.id));
        if (candidateWins) {
            survivor = candidate;
            survivorRichness = candidateRichness;
        }
    }
    return survivor;
}
/**
 * Build a dry-run PatchPlan that merges duplicate records.
 *
 * Records of `options.objectType` are grouped by their normalized
 * `options.key`; each group of two or more distinct records becomes one
 * merge_records operation whose afterValue is the survivor id and whose
 * beforeValue lists every id in the group (survivor included). Nothing is
 * written here — the function only returns the plan.
 *
 * @param snapshot snapshot to scan; a missing collection is treated as empty
 * @param options object type, identity key, survivor rule ("richest" by
 *   default), optional reason override, and the operation cap (500 by default)
 * @returns the plan; status is "needs_approval" when it holds operations, "draft" otherwise
 * @throws Error when the object type is unknown, the key is not valid for the
 *   object type, `keep` is neither "richest" nor "oldest", `maxOperations` is
 *   not a non-negative number, or more duplicate groups exist than `maxOperations`
 */
export function buildDedupePlan(snapshot, options) {
    const objectType = options.objectType;
    // fail with a readable message instead of a TypeError on VALID_KEYS[undefined]
    if (!Object.prototype.hasOwnProperty.call(COLLECTIONS, objectType)) {
        throw new Error(`Cannot dedupe object type "${objectType}". Valid object types: ${Object.keys(COLLECTIONS).join(", ")}.`);
    }
    const collection = COLLECTIONS[objectType];
    const keep = options.keep ?? "richest";
    if (!VALID_KEYS[objectType].includes(options.key)) {
        throw new Error(`Cannot dedupe ${collection} by "${options.key}". Valid keys for ${objectType}s: ${VALID_KEYS[objectType].join(", ")}.`);
    }
    if (keep !== "richest" && keep !== "oldest") {
        throw new Error(`--keep must be richest or oldest, got "${keep}".`);
    }
    // `size > NaN` is always false, so an unparsable cap would silently disable the safety check
    const maxOperations = Number(options.maxOperations ?? 500);
    if (Number.isNaN(maxOperations) || maxOperations < 0) {
        throw new Error(`--max-operations must be a non-negative number, got "${options.maxOperations}".`);
    }
    const groups = groupDuplicates(snapshot[collection] ?? [], options.key);
    if (groups.size > maxOperations) {
        throw new Error(`Found ${groups.size} duplicate groups — above the ${maxOperations}-group safety cap. Raise --max-operations explicitly after reviewing the volume.`);
    }
    const operations = [];
    let duplicateRecordCount = 0;
    for (const [key, members] of groups) {
        duplicateRecordCount += members.length;
        const survivor = pickSurvivor(members, keep);
        const groupIds = members
            .map((member) => String(member.id))
            .sort((a, b) => (a === b ? 0 : idBefore(a, b) ? -1 : 1));
        const survivorName = typeof survivor.name === "string" && survivor.name
            ? survivor.name
            : typeof survivor.email === "string" && survivor.email
                ? survivor.email
                : String(survivor.id);
        const keepDetail = keep === "richest"
            ? `${populatedDataFields(survivor)} populated data fields, the most in the group (ties break to the lowest id)`
            : "the lowest id in the group (oldest record)";
        operations.push({
            // the id depends only on the object type, key, and sorted member ids
            id: `op_${stableHash(`dedupe:${options.objectType}:${options.key}:${groupIds.join(",")}`)}`,
            objectType: options.objectType,
            objectId: String(survivor.id),
            operation: "merge_records",
            field: "merge",
            beforeValue: groupIds,
            afterValue: String(survivor.id),
            reason: options.reason ??
                `${members.length} ${COLLECTIONS[options.objectType]} share ${options.key} "${key}". Merge into "${survivorName}" (${survivor.id}) — survivor has ${keepDetail}.`,
            riskLevel: "high",
            approvalRequired: true,
            sourceRuleOrPolicy: "dedupe",
            groupId: `grp_${options.objectType}_${String(survivor.id)}`,
            rollback: "IRREVERSIBLE: provider merges cannot be unmerged. The pre-apply snapshot retains every record's field values; recreate a record manually from it if a merge was wrong.",
        });
    }
    return {
        id: `patch_plan_${stableHash(`dedupe:${snapshot.provider}:${snapshot.generatedAt}:${options.objectType}:${options.key}:${keep}:${operations.length}`)}`,
        title: `Dedupe: ${COLLECTIONS[options.objectType]} sharing the same ${options.key}`,
        createdAt: snapshot.generatedAt,
        status: operations.length > 0 ? "needs_approval" : "draft",
        dryRun: true,
        summary: `${groups.size} duplicate group(s) across ${duplicateRecordCount} ${COLLECTIONS[options.objectType]} (key: ${options.key}, keep: ${keep}); ${operations.length} proposed dry-run merge_records operation(s). Merges are IRREVERSIBLE — review each survivor before approving.`,
        findings: [],
        operations,
    };
}
@@ -0,0 +1,220 @@
1
+ import type { CanonicalGtmSnapshot, PatchPlan } from "./types.ts";
2
+ /**
3
+ * The enrich layer: governed append/refresh of third-party data into the CRM.
4
+ *
5
+ * Every enrichment vendor ships fire-and-forget writeback — data lands without
6
+ * a diff, without approval, over whatever a human typed. This layer inverts
7
+ * that: a source (Apollo pull, Clay ingest) feeds a deterministic matcher,
8
+ * the matcher feeds a fill-blanks-only patch plan, and the plan goes through
9
+ * the existing dry-run → approval → apply contract. Every proposed value is
10
+ * traceable to the source payload that produced it (`GtmEvidence` on the
11
+ * plan), and every write carries a `beforeValue` for apply-time
12
+ * compare-and-set.
13
+ *
14
+ * State lives in a profile-scoped, append-only run store
15
+ * (`~/.fullstackgtm/profiles/<profile>/enrich/runs/`) that is checkpoint,
16
+ * staleness ledger, and observability surface in one. The CLI never writes
17
+ * `fsgtm_enriched_at`-style custom properties into the customer's portal.
18
+ *
19
+ * Recurring execution belongs to the horizontal scheduler (docs/schedule.md);
20
+ * enrich owns no cron logic.
21
+ */
22
+ export type EnrichObjectType = "company" | "contact"; // keys EnrichConfig.match / EnrichConfig.fields and tags source records, stamps, and work items
23
+ export type EnrichSourceKind = "api" | "ingest"; // value of EnrichSourceConfig.kind
24
+ export type EnrichSourceConfig = {
25
+ kind: EnrichSourceKind;
26
+ /** Ingest staging format: csv (fields addressed by column header) or json (fields addressed by dotted path). */
27
+ format?: "csv" | "json";
28
+ };
29
+ export type EnrichAmbiguousPolicy = "skip" | "suggest"; // value of EnrichMatchConfig.onAmbiguous
30
+ export type EnrichMatchConfig = {
31
+ /** Ordered match keys, evaluated against the snapshot. */
32
+ keys: string[];
33
+ /** Multi-hit behavior (a source record matching more than one candidate); default skip. */
34
+ onAmbiguous?: EnrichAmbiguousPolicy;
35
+ };
36
+ export type EnrichFieldConfig = { // one CRM field mapping; listed per object type under EnrichConfig.fields
37
+ /** CRM property: canonical field name, or a default HubSpot property name. */
38
+ crm: string;
39
+ /** Where each source supplies this field: sourceId → dotted JSON path (api/json) or column header (ingest csv). */
40
+ from: Record<string, string>;
41
+ /** Opt into `enrich refresh`; fields without it are set once, never revisited. */
42
+ refresh?: boolean;
43
+ /** Staleness window for refresh; falls back to policy.defaultStaleDays. */
44
+ staleDays?: number;
45
+ /** Per-field conflict policy override. MVP: only "never". */
46
+ policy?: "never";
47
+ };
48
+ export type EnrichPolicyConfig = {
49
+ /** Conflict policy ladder. MVP ships "never" (fill blanks only). */
50
+ overwrite: "never";
51
+ defaultStaleDays?: number; // fallback staleness window for fields without their own staleDays
52
+ };
53
+ export type EnrichConfig = { // parsed shape returned by parseEnrichConfig / loadEnrichConfig
54
+ sources: Record<string, EnrichSourceConfig>; // keyed by source id
55
+ match: Partial<Record<EnrichObjectType, EnrichMatchConfig>>; // match rules per object type; a type may be absent
56
+ fields: Partial<Record<EnrichObjectType, EnrichFieldConfig[]>>; // field mappings per object type; a type may be absent
57
+ policy: EnrichPolicyConfig;
58
+ };
59
+ export declare const ENRICH_CONFIG_FILE_NAME = "enrich.config.json";
60
+ export declare const DEFAULT_STALE_DAYS = 90; // last fallback of the staleness window, after per-field staleDays and policy.defaultStaleDays
61
+ /** API source ids the MVP can pull from. */
62
+ export declare const SUPPORTED_API_SOURCES: string[];
63
+ /** Resolve a config `crm` field name to the canonical snapshot field. */
64
+ export declare function resolveCrmField(objectType: EnrichObjectType, name: string): string;
65
+ /**
66
+ * Strict, up-front validation (the 0.18 lesson: a config crash mid-run is
67
+ * worse than a refused config). Every problem names the offending entry and
68
+ * the accepted values.
69
+ */
70
+ export declare function parseEnrichConfig(raw: string): EnrichConfig;
71
+ export declare function loadEnrichConfig(path: string): EnrichConfig; // NOTE(review): presumably reads the file at `path` and validates it like parseEnrichConfig — implementation not visible here, confirm
72
+ export declare function parseCsv(text: string): Array<Record<string, string>>; // returns one string-valued record per row; presumably keyed by column header — confirm against the implementation
73
+ export type EnrichSourceRecord = { // one record from an enrichment source, as consumed by buildEnrichPlan
74
+ /** e.g. "apollo:org_abc", "clay:row-3". Lands on stamps as sourceRecordId. */
75
+ id: string;
76
+ objectType: EnrichObjectType;
77
+ /** Match-key values (key name → raw value), extracted by the source adapter. */
78
+ keys: Record<string, string | undefined>;
79
+ /** Raw source payload; field paths and evidence excerpts read from it. */
80
+ payload: Record<string, unknown>;
81
+ };
82
+ /** Read a value from a payload: exact key first (CSV headers), then dotted path. */
83
+ export declare function sourceValueAt(payload: Record<string, unknown>, path: string): unknown;
84
+ /** Case-insensitive header lookup for ingest rows ("Email" matches key "email"). */
85
+ export declare function ingestKeyValue(row: Record<string, unknown>, key: string): string | undefined;
86
+ export type MatchOutcome = { // result of matchSourceRecord, discriminated on `status`
87
+ status: "matched";
88
+ recordId: string;
89
+ matchedKey: string;
90
+ } | {
91
+ status: "unmatched";
92
+ } | {
93
+ status: "ambiguous";
94
+ key: string;
95
+ candidateIds: string[];
96
+ };
97
+ export declare function matchSourceRecord(snapshot: CanonicalGtmSnapshot, objectType: EnrichObjectType, keys: string[], sourceKeys: Record<string, string | undefined>): MatchOutcome; // NOTE(review): presumably tries `keys` in order against the snapshot using the values in `sourceKeys` — implementation not visible here, confirm
98
+ export type EnrichMode = "append" | "refresh"; // plan-building modes; EnrichRunMode adds "ingest" for stored runs
99
+ export type EnrichCounts = { // counters returned with a plan (EnrichPlanResult.counts) and stored on a run (EnrichRun.counts)
100
+ fetched: number;
101
+ matched: number;
102
+ unmatched: number;
103
+ ambiguous: number;
104
+ opsEmitted: number;
105
+ };
106
+ export type EnrichStamp = { // ledger entry for one (objectType, objectId, field) cell; see latestStamps
107
+ objectType: EnrichObjectType;
108
+ objectId: string;
109
+ /** Canonical field name. */
110
+ field: string;
111
+ enrichedAt: string; // NOTE(review): timestamp string, presumably ISO 8601 — confirm
112
+ sourceRecordId: string; // the EnrichSourceRecord.id that supplied the value
113
+ /** Source value at stamp time (refresh change-detection observability). */
114
+ value?: unknown;
115
+ };
116
+ export type EnrichAmbiguity = { // a recorded match collision; mirrors the "ambiguous" MatchOutcome plus the source record id
117
+ sourceRecordId: string;
118
+ key: string;
119
+ candidateIds: string[];
120
+ };
121
+ export type EnrichWorkItem = { // one (record, field) cell of the refresh work set
122
+ objectType: EnrichObjectType;
123
+ objectId: string;
124
+ /** Canonical field name. */
125
+ field: string;
126
+ };
127
+ export type BuildEnrichPlanOptions = {
128
+ config: EnrichConfig;
129
+ source: string; // NOTE(review): presumably a source id, i.e. a key of config.sources — confirm
130
+ mode: EnrichMode;
131
+ snapshot: CanonicalGtmSnapshot;
132
+ records: EnrichSourceRecord[];
133
+ /**
134
+ * Refresh only: the stale (record, field) work set computed from run-store
135
+ * stamps. Refresh proposes writes ONLY for work-set cells — fields the
136
+ * ledger proves enrich itself stamped — so policy "never" still never
137
+ * overwrites a value enrich did not put there.
138
+ */
139
+ workSet?: EnrichWorkItem[];
140
+ now?: () => Date; // clock override; NOTE(review): presumably defaults to the current time when omitted — confirm
141
+ runLabel: string;
142
+ };
143
+ export type EnrichPlanResult = { // everything buildEnrichPlan returns for one call
144
+ plan: PatchPlan;
145
+ counts: EnrichCounts;
146
+ stamps: EnrichStamp[];
147
+ ambiguities: EnrichAmbiguity[];
148
+ unmatchedSourceIds: string[];
149
+ };
150
+ /**
151
+ * Match source records against the snapshot and emit a patch plan under the
152
+ * conflict policy. Append fills blanks only; refresh proposes updates for
153
+ * stale stamped fields whose source value actually changed (beforeValue =
154
+ * current CRM value → apply-time compare-and-set rejects drifted records).
155
+ */
156
+ export declare function buildEnrichPlan(options: BuildEnrichPlanOptions): EnrichPlanResult;
157
+ /** Latest stamp per (objectType, objectId, field) across a source's runs. NOTE(review): the string format of the returned Map's keys is not visible here. */
158
+ export declare function latestStamps(runs: EnrichRun[], source: string): Map<string, EnrichStamp>;
159
+ export declare function staleDaysFor(config: EnrichConfig, objectType: EnrichObjectType, field: string): number; // NOTE(review): presumably resolves per-field staleDays → policy.defaultStaleDays → DEFAULT_STALE_DAYS — confirm
160
+ /**
161
+ * Stale (record, field) cells: stamped by this source, refresh-eligible in
162
+ * the config, and older than the staleness window (per-field staleDays →
163
+ * policy.defaultStaleDays → 90; --stale-days overrides all).
164
+ */
165
+ export declare function selectStaleWork(config: EnrichConfig, runs: EnrichRun[], source: string, options?: {
166
+ now?: () => Date;
167
+ staleDaysOverride?: number; // NOTE(review): presumably carries the --stale-days value — confirm
168
+ }): EnrichWorkItem[];
169
+ export type EnrichRunMode = EnrichMode | "ingest"; // "append" | "refresh" | "ingest"
170
+ export type EnrichRun = { // one record of the run store
171
+ id: string;
172
+ runLabel: string;
173
+ source: string;
174
+ mode: EnrichRunMode;
175
+ startedAt: string;
176
+ /** null while in progress — `status` surfaces it as an interrupted run. */
177
+ completedAt: string | null;
178
+ /** Resume point for an interrupted pull (last processed pull key). */
179
+ cursor: string | null;
180
+ counts: EnrichCounts;
181
+ planIds: string[];
182
+ stamps: EnrichStamp[];
183
+ /** Staged source rows (ingest mode only), consumed by append/refresh. */
184
+ staged?: Array<Record<string, unknown>>;
185
+ /** Object type of the staged rows (ingest mode only). */
186
+ stagedObjectType?: EnrichObjectType;
187
+ /**
188
+ * Source records pulled so far (api pulls with --save). Together with
189
+ * `cursor` this makes the checkpoint complete: a resumed run replays the
190
+ * already-paid-for payloads instead of re-fetching them.
191
+ */
192
+ pulled?: EnrichSourceRecord[];
193
+ /** Pull keys the source returned no data for (api pulls with --save). */
194
+ missedKeys?: string[];
195
+ /** Match collisions recorded for review (candidate ids included). */
196
+ ambiguities?: EnrichAmbiguity[];
197
+ };
198
+ export declare function enrichRunId(source: string, runLabel: string): string; // NOTE(review): presumably produces EnrichRun.id from the source and label — confirm
199
+ export declare function enrichRunsDir(baseDir?: string): string; // NOTE(review): presumably the run-store directory, optionally rooted at `baseDir` — confirm
200
+ export interface EnrichRunStore {
201
+ /** Append a new run; refuses an existing label (runs are append-only). */
202
+ append(run: EnrichRun): Promise<EnrichRun>;
203
+ /** Update an in-progress run (cursor checkpoint, finalization) in place. */
204
+ update(run: EnrichRun): Promise<EnrichRun>;
205
+ get(runLabel: string): Promise<EnrichRun | null>; // looked up by run label; null presumably means no such run — NOTE(review): confirm
206
+ list(): Promise<EnrichRun[]>;
207
+ latest(filter?: { // NOTE(review): presumably the most recent run matching the optional source/mode filter, or null when none matches — confirm
208
+ source?: string;
209
+ mode?: EnrichRunMode;
210
+ }): Promise<EnrichRun | null>;
211
+ }
212
+ export declare function createFileEnrichRunStore(directory?: string): EnrichRunStore; // NOTE(review): presumably a file-backed store rooted at `directory` — default location not visible here, confirm
213
+ /**
214
+ * Infer the object type of staged rows from the configured match keys: the
215
+ * type whose key columns actually appear on the rows. Exactly one hit wins;
216
+ * zero or two is an error asking for --objects (there are only two object
217
+ * types, so "two" means both matched).
218
+ */
219
+ export declare function inferIngestObjectType(config: EnrichConfig, source: string, rows: Array<Record<string, unknown>>): EnrichObjectType;
220
+ /** Turn staged ingest rows into source records for the matcher. */
221
+ export declare function stagedSourceRecords(config: EnrichConfig, source: string, run: EnrichRun): EnrichSourceRecord[];