hubspot-cms-sync 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +52 -0
  3. package/bin/hubspot-cms-sync.mjs +115 -0
  4. package/docs/CONFIGURATION.md +83 -0
  5. package/docs/GITHUB_ACTIONS.md +70 -0
  6. package/docs/MIGRATION_PLAN.md +361 -0
  7. package/docs/PLAN_REVIEW.md +42 -0
  8. package/docs/SKILL_DISTRIBUTION.md +79 -0
  9. package/examples/github-actions/ci.yml +56 -0
  10. package/examples/github-actions/preview.yml +71 -0
  11. package/examples/github-actions/publish.yml +82 -0
  12. package/examples/hubspot-cms-sync.config.mjs +45 -0
  13. package/examples/site.manifest.json +19 -0
  14. package/package.json +41 -0
  15. package/skill/SKILL.md +54 -0
  16. package/skill/references/commands.md +54 -0
  17. package/skill/references/config.md +25 -0
  18. package/skill/references/failures.md +58 -0
  19. package/skill/references/github-actions.md +56 -0
  20. package/skill/references/screenshots-and-fidelity.md +33 -0
  21. package/src/adapters/assets.mjs +576 -0
  22. package/src/adapters/blog.mjs +921 -0
  23. package/src/adapters/content.mjs +213 -0
  24. package/src/adapters/forms.mjs +569 -0
  25. package/src/adapters/pages.mjs +463 -0
  26. package/src/adapters/theme.mjs +503 -0
  27. package/src/config.mjs +113 -0
  28. package/src/corpus-scan.mjs +248 -0
  29. package/src/cta-inventory.mjs +352 -0
  30. package/src/index.mjs +3 -0
  31. package/src/lib/canonical.mjs +234 -0
  32. package/src/lib/hub.mjs +197 -0
  33. package/src/lib/orchestrate.mjs +141 -0
  34. package/src/lib/refs.mjs +398 -0
  35. package/src/lib/sync-state.mjs +86 -0
  36. package/src/manifest.mjs +353 -0
  37. package/src/preflight.mjs +385 -0
  38. package/src/pull.mjs +99 -0
  39. package/src/push.mjs +354 -0
  40. package/src/republish.mjs +102 -0
@@ -0,0 +1,33 @@
1
+ # Screenshots And Fidelity Reference
2
+
3
+ Use the consuming repo's configured verification commands before inventing new
4
+ checks. The config usually exposes them under `verification.commands`.
5
+
6
+ ## Before Capture
7
+
8
+ 1. Confirm the preview or production base URL from the configured env var.
9
+ 2. Confirm the target has been published or republished.
10
+ 3. Run link and form checks when the repo provides them.
11
+
12
+ ## Screenshot Workflow
13
+
14
+ Use Playwright or the repo's chosen browser test runner.
15
+
16
+ ```bash
17
+ # First run the consuming repo's configured verification commands, then:
18
+ npx playwright test verify/fidelity.spec.mjs
19
+ ```
20
+
21
+ Compare screenshots against the repo's accepted baselines. If baselines need to
22
+ change, keep that diff separate from content sync changes when possible.
23
+
24
+ ## Reporting
25
+
26
+ Report:
27
+
28
+ - target name and base URL
29
+ - pages checked
30
+ - failed selectors, links, forms, or screenshot names
31
+ - artifact paths
32
+ - checks skipped because credentials, base URLs, or browser dependencies were
33
+ unavailable
@@ -0,0 +1,576 @@
1
+ // src/adapters/assets.mjs — File Manager image sync for pages + blog.
2
+ //
3
+ // CODEX FINDING #4 (the contract this adapter exists to enforce):
4
+ // Canonical content committed to git stores REPO ASSET PATHS / logical
5
+ // `@asset:<path>` keys, NEVER hosted URLs. The per-account
6
+ // portal -> hostedURL map is volatile state living in
7
+ // `.sync-state/<portalId>.rehosted.json` (gitignored), NOT committed.
8
+ //
9
+ // HOW THE @asset KEY IS DEFINED (must agree with src/lib/refs.mjs):
10
+ // refs.mjs collapses any `…/hubfs/<portal>/<pathTail>` URL into the single
11
+ // token `@asset:<pathTail>` (the portal + host are discarded — they are
12
+ // per-account). That `<pathTail>` (e.g. `Sucess.jpg`,
13
+ // `Stock%20images/Double%20exposure.jpeg`) is at once:
14
+ // • the logical registry key (registry.assets[<pathTail>])
15
+ // • the repo path under content/assets/<pathTail> (bytes committed)
16
+ // We keep the tail BYTE-FOR-BYTE (including any %20) so the on-disk path, the
17
+ // registry key, and the `@asset:` token are the same string and round-trip.
18
+ //
19
+ // PULL (read source acct -> write canonical bytes + register source URLs):
20
+ // 1. scan canonical content (pages/*.json, pages/*.widgets.json, blog/**)
21
+ // for `@asset:<path>` tokens — these were produced by refs.canonicalize.
22
+ // 2. for each path, find a downloadable URL ON THE SOURCE ACCOUNT
23
+ // (File Manager search by name, hubfs reconstruction fallback) and
24
+ // download the bytes to content/assets/<path> (COMMIT these bytes).
25
+ // 3. record source-URL -> @asset in the registry (registry.assets[path] =
26
+ // sourceURL) and mirror it to .sync-state/<portalId>.rehosted.json.
27
+ //
28
+ // PUSH (read committed bytes -> upload to target -> register target URLs):
29
+ // for each content/assets/<path>, upload to the TARGET File Manager with
30
+ // OVERWRITE (codex #4: the legacy overwrite:false made duplicates), then
31
+ // record @asset -> target hosted URL in registry.assets[path] so the
32
+ // content / blog / theme adapters can resolve() their `@asset:` tokens to a
33
+ // concrete URL. dependsOn: [] — assets POPULATE the registry, depend on no
34
+ // other adapter.
35
+ //
36
+ // READ-ONLY PROD (529456): this adapter never hardcodes a portal; the
37
+ // orchestrator passes `acct`. push() writes to whatever `acct` it is given;
38
+ // the orchestrator is responsible for never passing prod to a push.
39
+
40
+ import {
41
+ readFileSync,
42
+ writeFileSync,
43
+ renameSync,
44
+ mkdirSync,
45
+ existsSync,
46
+ readdirSync,
47
+ statSync,
48
+ } from 'node:fs';
49
+ import { join, dirname, resolve as pathResolve } from 'node:path';
50
+ import { homedir } from 'node:os';
51
+ import { fileURLToPath } from 'node:url';
52
+
53
+ import { hub } from '../lib/hub.mjs';
54
+ import { stableStringify } from '../lib/canonical.mjs';
55
+
56
+ const API = 'https://api.hubapi.com';
57
+
58
+ export const name = 'assets';
59
+ // Assets POPULATE the registry (logical -> hosted url) for everyone else.
60
+ // Nothing has to run before assets, so this is empty.
61
+ export const dependsOn = [];
62
+
63
+ // Folder under the target File Manager that re-hosted assets live in. A single
64
+ // flat-ish namespace keeps overwrite-by-path deterministic across runs.
65
+ const TARGET_FOLDER = '/synced-assets';
66
+
67
+ // ───────────────────────────────────────────────────────────────────────────
68
+ // PURE: path <-> logical mapping. `@asset:<path>` <-> content/assets/<path>.
69
+ // Exported for unit testing (no network).
70
+ // ───────────────────────────────────────────────────────────────────────────
71
+
72
+ const ASSET_TOKEN_RE = /@asset:([^\s"'\\)]+)/g;
73
+
74
/**
 * Normalize an asset reference to its bare path tail.
 *
 *   assetTokenToPath('@asset:Sucess.jpg') -> 'Sucess.jpg'
 *   assetTokenToPath('Sucess.jpg')        -> 'Sucess.jpg'   (already bare)
 *
 * @param {unknown} token - an `@asset:<path>` token or a bare path tail.
 * @returns {string|null} the path tail; null for non-strings, the empty
 *   string, malformed `@asset:` tokens, and any other `@`-prefixed value.
 */
export function assetTokenToPath(token) {
  const isUsable = typeof token === 'string' && token.length > 0;
  if (!isUsable) return null;

  // Whole-string match: the tail may not contain whitespace, quotes,
  // a backslash or a closing parenthesis.
  const tokenMatch = /^@asset:([^\s"'\\)]+)$/.exec(token);
  if (tokenMatch !== null) return tokenMatch[1];

  // Anything else starting with '@' is not an asset reference; every other
  // string is treated as an already-bare path tail.
  return token.startsWith('@') ? null : token;
}
87
+
88
/**
 * Wrap a path tail in the `@asset:` token form.
 *
 *   pathToAssetToken('Sucess.jpg') -> '@asset:Sucess.jpg'
 *
 * @param {string} path - asset path tail.
 * @returns {string} the `@asset:<path>` token.
 */
export function pathToAssetToken(path) {
  return '@asset:'.concat(path);
}
92
+
93
/**
 * Location of an asset's bytes inside the assets adapter's tree:
 * `<contentDir>/assets/<pathTail>`.
 *
 * The tail is passed through unchanged, so any slashes in it become nested
 * directories and the result lines up with the `@asset:` token / registry key.
 *
 * @param {string} contentDir - root of the canonical content tree.
 * @param {string} path - asset path tail.
 * @returns {string} the joined file path.
 */
export function assetRepoPath(contentDir, path) {
  const segments = [contentDir, 'assets', path];
  return join(...segments);
}
101
+
102
+ // ───────────────────────────────────────────────────────────────────────────
103
+ // ASSET-SCHEME UNIFICATION (codex #6).
104
+ //
105
+ // Two adapters emit `@asset:<key>` tokens with DIFFERENT committed-bytes trees:
106
+ // • the assets adapter: key = the hubfs path tail (e.g. `Sucess.jpg`),
107
+ // bytes committed at content/assets/<key>.
108
+ // • the blog adapter: key = a sha1-prefixed manifest filename
109
+ // (e.g. `4e7bf9bad5-Inbox.png`), bytes committed at
110
+ // content/blog/assets/<key> (blog.rehostAssets uploads these itself).
111
+ //
112
+ // A blog-manifest `@asset` is therefore a legitimate, satisfiable ref whose
113
+ // bytes live OUTSIDE content/assets/. The single source of truth for "where can
114
+ // an @asset key's committed bytes live" is `assetRepoCandidates` below; both the
115
+ // assets adapter and the push preflight consult it, so the two schemes resolve and
116
+ // preflight identically. We RECOGNIZE both trees rather than migrate blog bytes:
117
+ // migrating would have to rewrite the manifest keys + blog.pull tokenization + move
118
+ // 51 committed files (and re-key registry.assets), all of which the blog adapter
119
+ // owns. Recognition is purely additive and keeps each adapter's bytes where it
120
+ // already commits them. (See docs note in the unification report.)
121
+ //
122
+ // The blog tree's name is centralized here so the preflight need not hard-code it.
123
// Path segments (relative to contentDir) of the blog adapter's byte tree.
export const BLOG_ASSETS_REL = ['blog', 'assets'];

/**
 * Location of an asset's bytes inside the blog adapter's manifest tree:
 * `<contentDir>/blog/assets/<key>`.
 *
 * @param {string} contentDir - root of the canonical content tree.
 * @param {string} path - the @asset key (blog manifest filename).
 * @returns {string} the joined file path.
 */
export function blogAssetRepoPath(contentDir, path) {
  const segments = [contentDir, ...BLOG_ASSETS_REL, path];
  return join(...segments);
}
132
+
133
/**
 * Every place an @asset key's committed bytes may live, in lookup order:
 *   1. <contentDir>/assets/<key>       (assets adapter tree)
 *   2. <contentDir>/blog/assets/<key>  (blog adapter tree)
 *
 * Performs no I/O; callers probe each candidate themselves.
 *
 * @param {string} contentDir - root of the canonical content tree.
 * @param {string} path - the @asset key.
 * @returns {string[]} the two candidate file paths, assets tree first.
 */
export function assetRepoCandidates(contentDir, path) {
  const builders = [assetRepoPath, blogAssetRepoPath];
  return builders.map((build) => build(contentDir, path));
}
143
+
144
/**
 * Find where an @asset key's bytes are actually committed.
 *
 * Probes the candidates from `assetRepoCandidates` in order (assets tree, then
 * blog tree) and returns the first one `existsFn` reports as present.
 *
 * @param {string} contentDir - root of the canonical content tree.
 * @param {string} path - the @asset key.
 * @param {(file: string) => boolean} [existsFn=existsSync] - existence probe;
 *   injectable so callers can substitute a fake filesystem.
 * @returns {string|null} the first existing candidate path, or null when the
 *   bytes are in neither tree.
 */
export function resolveAssetBytesPath(contentDir, path, existsFn = existsSync) {
  const found = assetRepoCandidates(contentDir, path).find((candidate) => existsFn(candidate));
  return found ?? null;
}
158
+
159
/**
 * List the distinct path tails referenced by `@asset:` tokens in a string.
 *
 * Performs no I/O. Tails are returned in order of first appearance.
 *
 * @param {unknown} str - canonical text to scan.
 * @returns {string[]} unique path tails; empty for non-strings and ''.
 */
export function extractAssetPaths(str) {
  if (typeof str !== 'string' || str.length === 0) return [];
  const tails = Array.from(str.matchAll(ASSET_TOKEN_RE), (hit) => hit[1]);
  return Array.from(new Set(tails));
}
169
+
170
+ // ───────────────────────────────────────────────────────────────────────────
171
+ // PURE: File Manager upload options. The codex #4 fix lives here — OVERWRITE.
172
+ // Exported so a unit test can assert overwrite:true without any network.
173
+ // ───────────────────────────────────────────────────────────────────────────
174
+
175
/**
 * Build the `options` object posted alongside a file to /files/v3/files.
 *
 * Takes no arguments and returns a fresh object on every call:
 *   - access: 'PUBLIC_INDEXABLE'
 *   - overwrite: true (per this file's notes, the legacy overwrite:false
 *     produced a new duplicate on every push)
 *   - duplicateValidationStrategy: 'NONE'
 *   - duplicateValidationScope: 'EXACT_FOLDER'
 *
 * @returns {{access: string, overwrite: boolean,
 *   duplicateValidationStrategy: string, duplicateValidationScope: string}}
 */
export function uploadOptions() {
  const entries = [
    ['access', 'PUBLIC_INDEXABLE'],
    ['overwrite', true],
    ['duplicateValidationStrategy', 'NONE'],
    ['duplicateValidationScope', 'EXACT_FOLDER'],
  ];
  return Object.fromEntries(entries);
}
190
+
191
/**
 * Work out which File Manager folder and file name an asset path tail uploads
 * to. The last segment is the file name; any leading segments become
 * sub-folders of TARGET_FOLDER.
 *
 * Each segment is percent-decoded first: the @asset key keeps its URL encoding
 * (`%20`), but per this file's notes File Manager rejects `%` in folder/file
 * names, so `Google%20Drive%20Integration/x.jpg` targets the folder
 * `Google Drive Integration`. A segment that is not valid percent-encoding is
 * used unchanged.
 *
 * @param {string} path - asset path tail (coerced with String()).
 * @returns {{fileName: string, folderPath: string}}
 */
export function uploadTarget(path) {
  const decodeSegment = (segment) => {
    try {
      return decodeURIComponent(segment);
    } catch {
      return segment;
    }
  };

  const decoded = String(path)
    .split('/')
    .map((segment) => decodeSegment(segment));
  const fileName = decoded[decoded.length - 1];
  const parents = decoded.slice(0, -1).join('/');
  const folderPath = parents === '' ? TARGET_FOLDER : `${TARGET_FOLDER}/${parents}`;
  return { fileName, folderPath };
}
215
+
216
+ // ───────────────────────────────────────────────────────────────────────────
217
+ // .sync-state/<portalId>.rehosted.json — per-account, gitignored URL cache.
218
+ // Maps `<pathTail> -> hostedURL` for THIS account (source URLs after pull,
219
+ // target URLs after push). NOT committed.
220
+ // ───────────────────────────────────────────────────────────────────────────
221
+
222
/**
 * Absolute path of the `.sync-state` directory, located two levels above the
 * directory that contains this module.
 *
 * @returns {string}
 */
function syncStateDir() {
  const moduleFile = fileURLToPath(import.meta.url);
  const moduleDir = dirname(moduleFile);
  return pathResolve(moduleDir, '..', '..', '.sync-state');
}
226
+
227
/**
 * Path of one account's URL cache file:
 * `<syncStateDir>/<portalId>.rehosted.json`.
 *
 * @param {string|number} portalId - account portal id.
 * @returns {string}
 */
function rehostedPath(portalId) {
  const fileName = `${portalId}.rehosted.json`;
  return join(syncStateDir(), fileName);
}
230
+
231
/**
 * Read one account's `<pathTail> -> hostedURL` cache.
 *
 * Best-effort by design: a missing, unreadable or unparsable cache file yields
 * an empty map rather than an error. The same applies when the file holds
 * valid JSON that is not an object (`null`, an array, a number, ...): callers
 * index and assign into the result, so `null` would make them throw and an
 * array or primitive would silently misbehave.
 *
 * @param {string|number} portalId - account portal id.
 * @returns {Record<string, string>} the cached map, or `{}`.
 */
export function loadRehosted(portalId) {
  const file = rehostedPath(portalId);
  if (!existsSync(file)) return {};
  try {
    const parsed = JSON.parse(readFileSync(file, 'utf8'));
    const isPlainMap = parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
    return isPlainMap ? parsed : {};
  } catch {
    // Corrupt or unreadable cache: treat as "nothing cached yet".
    return {};
  }
}
240
+
241
/**
 * Persist one account's `<pathTail> -> hostedURL` cache.
 *
 * The map is serialized with `stableStringify` into a temp file named after
 * the current process id, and that file is then renamed over the live cache.
 * Writing in two steps means the live file is replaced in a single rename
 * instead of being rewritten in place, so (per this file's notes) a crash or a
 * concurrent reader should never observe an empty or half-written cache, which
 * the next push would read as "nothing is rehosted" and re-upload everything.
 *
 * @param {string|number} portalId - account portal id.
 * @param {Record<string, string>} map - cache contents to write.
 * @returns {void}
 */
export function saveRehosted(portalId, map) {
  // Make sure the state directory exists before anything is written into it.
  mkdirSync(syncStateDir(), { recursive: true });

  const target = rehostedPath(portalId);
  const staging = `${target}.tmp-${process.pid}`;
  const payload = stableStringify(map);

  writeFileSync(staging, payload);
  renameSync(staging, target);
}
257
+
258
+ // ───────────────────────────────────────────────────────────────────────────
259
+ // Scan the committed canonical tree for `@asset:` references.
260
+ // Sources: content/pages/*.json (+ *.widgets.json), content/landing-pages/** *.json, content/blog/** *.json.
261
+ // theme/templates are ALSO @asset carriers, but the assets they reference are
262
+ // likewise tokenized; reading every *.json under contentDir covers pages+blog,
263
+ // and the optional `extraDirs` lets the orchestrator widen the scan.
264
+ // ───────────────────────────────────────────────────────────────────────────
265
+
266
/**
 * Recursively append the path of every `*.json` file under `dir` to `acc`.
 *
 * A directory that does not exist contributes nothing. Only regular files
 * whose name ends in `.json` are collected; sub-directories are descended.
 *
 * @param {string} dir - directory to scan.
 * @param {string[]} acc - accumulator, mutated in place.
 * @returns {string[]} the same `acc` array.
 */
function walkJson(dir, acc) {
  if (existsSync(dir)) {
    const entries = readdirSync(dir, { withFileTypes: true });
    entries.forEach((entry) => {
      const entryPath = join(dir, entry.name);
      if (entry.isDirectory()) {
        walkJson(entryPath, acc);
        return;
      }
      if (entry.isFile() && entry.name.endsWith('.json')) {
        acc.push(entryPath);
      }
    });
  }
  return acc;
}
275
+
276
/**
 * Gather every distinct `@asset:` path tail referenced by the canonical JSON
 * files under `<contentDir>/pages`, `<contentDir>/landing-pages` and
 * `<contentDir>/blog` (each scanned recursively).
 *
 * A file that cannot be read is skipped silently.
 *
 * @param {string} contentDir - root of the canonical content tree.
 * @returns {string[]} unique path tails, in order of first appearance.
 */
export function collectReferencedAssetPaths(contentDir) {
  const jsonFiles = [];
  for (const subtree of ['pages', 'landing-pages', 'blog']) {
    walkJson(join(contentDir, subtree), jsonFiles);
  }

  const unique = new Set();
  for (const file of jsonFiles) {
    let body;
    try {
      body = readFileSync(file, 'utf8');
    } catch {
      // Unreadable file: ignore it and keep scanning the rest.
      continue;
    }
    extractAssetPaths(body).forEach((tail) => unique.add(tail));
  }
  return Array.from(unique);
}
298
+
299
+ // ───────────────────────────────────────────────────────────────────────────
300
+ // Source-URL resolution (PULL). Given a path tail, find a URL on the SOURCE
301
+ // account we can actually download. Order:
302
+ // 1. an existing .sync-state rehosted entry (already known this account),
303
+ // 2. File Manager search by file name (recovers dead legacy CDN URLs — the
304
+ // blog-sync.mjs fileManagerUrl trick),
305
+ // 3. reconstruct the canonical hubfs URL from the account's portal id.
306
+ // ───────────────────────────────────────────────────────────────────────────
307
+
308
/**
 * Look an asset up in the account's File Manager by file name and return its
 * URL.
 *
 * The search term is the percent-decoded last path segment with its extension
 * removed. Among the (at most 5) results, one whose `<name>.<extension>`
 * equals the decoded file name case-insensitively is preferred; failing that,
 * the first result is used.
 *
 * @param {object} acct - account handle passed through to `hub`.
 * @param {string} path - asset path tail.
 * @returns {Promise<string|null>} the hit's `url`, or null when the request is
 *   not ok, nothing matched, or the hit has no url.
 * @throws {URIError} if the last path segment is not valid percent-encoding.
 */
async function fileManagerUrl(acct, path) {
  const lastSegment = path.split('/').pop();
  const fileName = decodeURIComponent(lastSegment);
  const stem = fileName.replace(/\.[^.]+$/, '');

  const response = await hub(
    acct,
    'GET',
    `/files/v3/files/search?name=${encodeURIComponent(stem)}&limit=5`,
  );
  if (!response.ok) return null;

  const candidates = response.json.results || [];
  const wanted = fileName.toLowerCase();
  const exact = candidates.find(
    (file) => `${file.name}.${file.extension}`.toLowerCase() === wanted,
  );
  const hit = exact || candidates[0];
  return hit?.url || null;
}
324
+
325
/**
 * Rebuild the legacy-CDN hubfs URL for an asset from the portal id and the
 * path tail. The tail is inserted verbatim.
 *
 * @param {string|number} portalId - account portal id.
 * @param {string} path - asset path tail.
 * @returns {string} `https://cdn2.hubspot.net/hubfs/<portalId>/<path>`
 */
function reconstructHubfsUrl(portalId, path) {
  const base = 'https://cdn2.hubspot.net/hubfs';
  return `${base}/${portalId}/${path}`;
}
329
+
330
/**
 * Download a URL and return its body as a Buffer.
 *
 * The URL is normalized so that raw characters (spaces, non-ASCII) get
 * percent-encoded while EXISTING `%XX` escapes are left alone. A plain
 * `encodeURI` also escapes `%`, which turned an already-encoded URL such as
 * `.../Stock%20images/x.jpeg` into `.../Stock%2520images/x.jpeg` and so
 * requested a different path. A stray `%` that is not followed by two hex
 * digits is still escaped to `%25`.
 *
 * @param {string} url - absolute URL, encoded or not.
 * @returns {Promise<Buffer>} the response body.
 * @throws {Error} `HTTP <status>` when the response is not ok; network and
 *   URL errors from `fetch` / `encodeURI` propagate unchanged.
 */
async function downloadBytes(url) {
  // encodeURI turns every '%' into '%25'; undo that only where the '%' was
  // the start of a valid escape in the input.
  const requestUrl = encodeURI(url).replace(/%25([0-9A-Fa-f]{2})/g, '%$1');
  const res = await fetch(requestUrl);
  if (!res.ok) throw new Error(`HTTP ${res.status}`);
  return Buffer.from(await res.arrayBuffer());
}
335
+
336
+ // ───────────────────────────────────────────────────────────────────────────
337
+ // Target upload (PUSH). Uploads bytes with OVERWRITE; returns the hosted URL.
338
+ // Network-injectable `doFetch` for unit testing the option payload.
339
+ // ───────────────────────────────────────────────────────────────────────────
340
+
341
/**
 * Upload one asset's bytes to the account's File Manager and return the
 * hosted URL.
 *
 * Responses with status 429 or >= 500 are retried with exponential backoff
 * (0.6s, 1.2s, 2.4s, 4.8s) for at most 5 attempts; any other non-ok status
 * fails immediately. No delay is taken after the final attempt: there is
 * nothing left to retry, so waiting would only postpone the error.
 *
 * @param {{key: string}} acct - account; `key` is sent as the bearer token.
 * @param {Buffer|Uint8Array} buf - file bytes.
 * @param {string} path - asset path tail (see `uploadTarget`).
 * @param {typeof fetch} [doFetch=fetch] - injectable fetch for tests.
 * @returns {Promise<string|null>} `url` from the response (or
 *   `objects[0].url`), null when the response carries neither.
 * @throws {Error} `upload <fileName> -> <status>: <message>` when no attempt
 *   succeeded.
 */
export async function uploadAsset(acct, buf, path, doFetch = fetch) {
  const { fileName, folderPath } = uploadTarget(path);
  const MAX_ATTEMPTS = 5;
  let res;
  for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
    // FormData is single-use, so it is rebuilt for every attempt.
    const form = new FormData();
    form.append('file', new Blob([buf]), fileName);
    form.append('fileName', fileName);
    form.append('folderPath', folderPath);
    form.append('options', JSON.stringify(uploadOptions()));
    res = await doFetch(`${API}/files/v3/files`, {
      method: 'POST',
      headers: { Authorization: `Bearer ${acct.key}` },
      body: form,
    });
    if (res.ok) {
      const j = await res.json();
      return j.url || j.objects?.[0]?.url || null;
    }
    if (res.status !== 429 && res.status < 500) break; // non-retryable client error
    if (attempt === MAX_ATTEMPTS - 1) break; // out of attempts: fail now, don't sleep
    await new Promise((r) => setTimeout(r, 600 * 2 ** attempt)); // 0.6s,1.2s,2.4s,4.8s
  }
  // The error body is optional; fall back to an empty message if it isn't JSON.
  const j = await res.json().catch(() => ({}));
  throw new Error(`upload ${fileName} -> ${res.status}: ${j.message || ''}`);
}
369
+
370
+ // ───────────────────────────────────────────────────────────────────────────
371
+ // pull(acct, { contentDir, registry }) -> { pulled, notes }
372
+ // ───────────────────────────────────────────────────────────────────────────
373
+
374
/**
 * PULL: make sure every `@asset:` path referenced by the canonical content has
 * its bytes on disk, and record a URL for each one.
 *
 * For each referenced path a URL is chosen (cached entry for this account,
 * else File Manager search, else the reconstructed hubfs URL). If the bytes
 * already exist in either byte tree they are reused; otherwise they are
 * downloaded and written to `<contentDir>/assets/<path>`. Paths whose download
 * fails are reported in `notes` and are NOT registered.
 *
 * Side effects: writes asset files, mutates `registry.assets`, and rewrites
 * this account's rehosted cache.
 *
 * @param {{portalId: string|number}} acct - source account (also passed to the
 *   File Manager lookup).
 * @param {{contentDir: string, registry: {assets: object}}} ctx
 * @returns {Promise<{pulled: number, notes: string[]}>} `pulled` is the number
 *   of files downloaded this run; `notes[0]` is the summary line.
 */
export async function pull(acct, { contentDir, registry }) {
  const notes = [];
  const paths = collectReferencedAssetPaths(contentDir);
  const rehosted = loadRehosted(acct.portalId);
  let downloaded = 0;
  let reused = 0;
  let failed = 0;

  for (const path of paths) {
    // New downloads land in the unified content/assets/<path> tree; but bytes may
    // already be committed in EITHER tree (the blog adapter commits its manifest
    // assets under content/blog/assets/<path>), so an existing blog-manifest asset
    // counts as already-committed and is never re-downloaded. (codex #6.)
    const repoFile = assetRepoPath(contentDir, path);
    const committedFile = resolveAssetBytesPath(contentDir, path);

    // Resolve a downloadable source URL for this account.
    let sourceUrl = rehosted[path] || null;
    if (!sourceUrl) {
      try {
        sourceUrl = await fileManagerUrl(acct, path);
      } catch {
        // Lookup failure is non-fatal: fall back to the reconstructed URL below.
        sourceUrl = null;
      }
    }
    if (!sourceUrl) sourceUrl = reconstructHubfsUrl(acct.portalId, path);

    // Already have the bytes committed (in either tree) -> just (re)register the
    // source URL; never re-download.
    if (committedFile) {
      reused++;
    } else {
      let buf = null;
      try {
        buf = await downloadBytes(sourceUrl);
      } catch {
        // last-ditch File Manager recovery for a dead reconstructed URL
        try {
          const alt = await fileManagerUrl(acct, path);
          // Only retry when the lookup yields a URL we have not already tried.
          if (alt && alt !== sourceUrl) {
            buf = await downloadBytes(alt);
            sourceUrl = alt;
          }
        } catch {
          /* fall through to failure */
        }
      }
      if (!buf) {
        failed++;
        notes.push(`download failed: @asset:${path}`);
        // Skip registration: nothing was written for this path.
        continue;
      }
      mkdirSync(dirname(repoFile), { recursive: true });
      writeFileSync(repoFile, buf);
      downloaded++;
    }

    // Record source URL -> @asset for this account (registry + state cache).
    registry.assets[path] = sourceUrl;
    rehosted[path] = sourceUrl;
  }

  saveRehosted(acct.portalId, rehosted);
  // Summary goes first so it precedes the per-asset failure notes.
  notes.unshift(
    `assets pull: ${paths.length} referenced | downloaded ${downloaded} | reused ${reused} | failed ${failed}`,
  );
  return { pulled: downloaded, notes };
}
442
+
443
+ // ───────────────────────────────────────────────────────────────────────────
444
+ // push(acct, { contentDir, registry }) -> { pushed, notes }
445
+ // ───────────────────────────────────────────────────────────────────────────
446
+
447
/**
 * PUSH: upload committed asset bytes to the given account and record the
 * hosted URL of each one in `registry.assets`.
 *
 * The work list is the union of every referenced `@asset:` path and every
 * file found under `<contentDir>/assets`. A path that already has a cached URL
 * for this account is reused without uploading unless the `ASSET_FORCE`
 * environment variable is set to a non-empty value.
 *
 * Side effects: mutates `registry.assets` and rewrites this account's
 * rehosted cache. The cache is saved BEFORE either abort below, so URLs
 * resolved in a failing run are kept for the next one.
 *
 * @param {{portalId: string|number, key: string}} acct - target account.
 * @param {{contentDir: string, registry: {assets: object}}} ctx
 * @returns {Promise<{pushed: number, notes: string[]}>} `pushed` is the number
 *   of files uploaded this run; `notes[0]` is the summary line.
 * @throws {Error} when a referenced asset has no committed bytes, or when any
 *   read or upload failed.
 */
export async function push(acct, { contentDir, registry }) {
  const notes = [];
  const assetsDir = join(contentDir, 'assets');
  // Union of referenced paths and bytes-on-disk: upload anything we have a file
  // for, so content/blog/theme can resolve every @asset they reference.
  const referenced = new Set(collectReferencedAssetPaths(contentDir));
  const onDisk = new Set(listAssetFiles(assetsDir));
  const paths = [...new Set([...referenced, ...onDisk])];

  // The rehosted cache (.sync-state/<portal>.rehosted.json) is the per-account
  // path -> hosted-URL map. It is the primary reuse source, but it is gitignored
  // volatile state that can be lost, truncated, or never written. The per-account
  // REGISTRY (registry.assets[path]) is the SAME mapping and is persisted by the
  // orchestrator ATOMICALLY after every adapter — so it is the durable backstop.
  // Seed the rehosted map from any target hosted URLs already in the registry so a
  // missing/empty cache still yields REUSE (uploaded 0 | reused N) on a re-push
  // instead of silently re-uploading all 207 assets. We only seed concrete http(s)
  // URLs (a registry entry can also be `true` "known-but-url-built-by-caller",
  // which is not a reusable hosted URL).
  const rehosted = loadRehosted(acct.portalId);
  for (const [k, v] of Object.entries(registry.assets || {})) {
    // An existing cache entry always wins over the registry value.
    if (rehosted[k] == null && typeof v === 'string' && /^https?:\/\//.test(v)) {
      rehosted[k] = v;
    }
  }
  let uploaded = 0;
  let reused = 0;
  let missing = 0;
  let failed = 0;
  // Referenced @asset tokens whose bytes are NOT committed. These are fatal:
  // pushing past them would leave the content/blog/theme resolve() either
  // hard-failing later (confusing) or — if a stale rehosted entry exists from a
  // prior run — silently resolving to a DRIFTED url. We collect every one so the
  // abort error names them all, then throw after the loop (data-loss guard).
  const missingReferenced = [];

  for (const path of paths) {
    // Bytes may live in EITHER scheme's tree: content/assets/<path> (this
    // adapter) or content/blog/assets/<path> (the blog manifest). We upload from
    // wherever they are committed so a blog-manifest @asset referenced by a page
    // (or vice-versa) resolves. (codex #6 unification.) The blog adapter ALSO
    // rehosts its manifest assets, but overwrite-by-path makes a double upload
    // idempotent, and finding bytes here keeps the assets-adapter scan from
    // hard-failing on a blog-only @asset.
    const repoFile = resolveAssetBytesPath(contentDir, path);
    if (!repoFile) {
      // referenced but bytes not committed in either tree — record so push can
      // hard-fail below.
      missing++;
      notes.push(`missing bytes for @asset:${path} (run pull)`);
      if (referenced.has(path)) missingReferenced.push(path);
      continue;
    }
    // Already hosted on THIS account (cached from a prior pull/push) — reuse the
    // URL instead of re-uploading. Re-uploading every referenced asset on each
    // push is wasteful and trips the Files API rate limit on bulk runs; the
    // bytes are byte-stable, so the cached URL is correct. (Set $ASSET_FORCE=1
    // to force a re-upload.)
    if (rehosted[path] && !process.env.ASSET_FORCE) {
      registry.assets[path] = rehosted[path];
      reused++;
      continue;
    }
    let buf;
    try {
      buf = readFileSync(repoFile);
    } catch (e) {
      // Read errors share the `failed` counter with upload errors.
      failed++;
      notes.push(`read failed @asset:${path}: ${e.message}`);
      continue;
    }
    let url;
    try {
      url = await uploadAsset(acct, buf, path);
    } catch (e) {
      failed++;
      notes.push(`upload failed @asset:${path}: ${e.message}`);
      continue;
    }
    // @asset -> target hosted URL, so resolve() in content/blog/theme works.
    registry.assets[path] = url;
    rehosted[path] = url;
    uploaded++;
  }

  // Persist any URLs we DID resolve this run before aborting, so a re-run after
  // the missing bytes are committed reuses them and stays idempotent.
  saveRehosted(acct.portalId, rehosted);
  // Summary goes first so it precedes the per-asset notes.
  notes.unshift(
    `assets push: ${paths.length} asset(s) | uploaded ${uploaded} | reused ${reused} | missing-bytes ${missing} | failed ${failed}`,
  );

  // DATA-LOSS GUARD: a referenced @asset with no committed bytes aborts the
  // whole push (the orchestrator's contract — throw to stop before a consumer
  // resolves a missing/stale ref). Names every offender so the operator can
  // `pull` once and re-push.
  if (missingReferenced.length > 0) {
    throw new Error(
      `assets push: ${missingReferenced.length} referenced @asset(s) missing committed bytes — run \`pull\` first: ` +
        missingReferenced.map((p) => `@asset:${p}`).join(', '),
    );
  }
  // An upload that actually failed (network/API) is likewise fatal — don't let a
  // consumer resolve a token we never uploaded.
  if (failed > 0) {
    throw new Error(`assets push: ${failed} asset upload(s) failed — see notes`);
  }

  return { pushed: uploaded, notes };
}
557
+
558
/**
 * List every committed asset under `assetsDir` as a path tail relative to it,
 * always with '/' separators so each entry matches the `@asset:<tail>` /
 * registry key form.
 *
 * Entries named `manifest.json` are skipped at every level (legacy sidecar,
 * not an asset). Every regular file is listed, including zero-byte ones: the
 * previous `statSync(full).size >= 0` check was always true, so it filtered
 * nothing and only cost one extra stat per file.
 *
 * @param {string} assetsDir - the `<contentDir>/assets` directory.
 * @returns {string[]} relative path tails; empty when the directory is absent.
 */
export function listAssetFiles(assetsDir) {
  if (!existsSync(assetsDir)) return [];
  const out = [];
  const walk = (dir, prefix) => {
    for (const ent of readdirSync(dir, { withFileTypes: true })) {
      if (ent.name === 'manifest.json') continue; // legacy sidecar, not an asset
      const rel = prefix ? `${prefix}/${ent.name}` : ent.name;
      if (ent.isDirectory()) walk(join(dir, ent.name), rel);
      else if (ent.isFile()) out.push(rel);
    }
  };
  walk(assetsDir, '');
  return out;
}
575
+
576
+ export default { name, dependsOn, pull, push };