@artinstack/migrator 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -11,6 +11,7 @@ See [docs/architecture.md](./docs/architecture.md) for the high-level blueprint:
11
11
  ```
12
12
  src/
13
13
  parsers/ WordPress, SmugMug, Squarespace, Wix → normalizer DTOs
14
+ wordpress/ WXR parse, builder flattening (theme registry)
14
15
  normalizer/ Canonical DTOs + portable idempotency types
15
16
  sinks/ filesystem export, MigrationSink interface
16
17
  cli/ artinstack-migrate
@@ -56,6 +57,8 @@ artinstack-migrate validate <platform> <export-file>
56
57
  | `--dry-run` | Parse and analyze only; no export files |
57
58
  | `--report <dir>` | With `--dry-run`, write `conflicts.json` and `migration-report.json` |
58
59
  | `--offline` | Skip network HEAD requests for asset size estimates |
60
+ | `--rewrite-gateway <url>` | WordPress: legacy API-gateway base (use with `--rewrite-public`) |
61
+ | `--rewrite-public <url>` | WordPress: public origin for `/wp-content/` asset paths |
59
62
  | `--sink filesystem` | Run through `MigrationSink` before writing (requires `--out`) |
60
63
  | `--urls <file>` | Wix only: URL list or `sitemap.xml` for static page snapshots |
61
64
 
@@ -68,6 +71,12 @@ artinstack-migrate wordpress export.xml --out ./output
68
71
  # Preview conflicts without writing content
69
72
  artinstack-migrate wordpress export.xml --dry-run --report ./preview/
70
73
 
74
+ # WordPress: rewrite legacy gateway URLs before dry-run / export (e.g. API Gateway → public CDN)
75
+ artinstack-migrate wordpress export.xml \
76
+ --rewrite-gateway "https://gateway.example/prod" \
77
+ --rewrite-public "https://www.example.com" \
78
+ --dry-run --report ./preview/
79
+
71
80
  # Validate export structure (JSON result on stdout, exit 0/1)
72
81
  artinstack-migrate validate wordpress export.xml
73
82
 
@@ -119,8 +128,10 @@ pnpm dev # watch build
119
128
  | Piece | `@artinstack/migrator` | Host application |
120
129
  |-------|------------------------|------------------|
121
130
  | Parsers + normalizer DTOs | Yes | No |
131
+ | WordPress builder flattening + origin URL rewrite (pre-DTO) | Yes | Optional same config on adapter input |
122
132
  | CLI + filesystem JSON export | Yes | No |
123
133
  | `MigrationSink` interface | Yes | Implementation |
134
+ | Dynamic shortcodes (`[portfolio]`, `[recent_posts]`), forms, sanitize | No | Yes |
124
135
  | Jobs, worker, credentials, UI | No | Yes |
125
136
 
126
137
  ## License
@@ -1,20 +1,226 @@
1
1
  import {
2
2
  SquarespaceCollectionClient,
3
+ WORDPRESS_BUILDER_REGISTRY,
3
4
  enumerateSquarespaceEntities,
4
5
  linkToPath,
5
6
  sanitizeSlug,
6
7
  summarizeSquarespaceExport,
7
8
  validateSquarespaceExportFile
8
- } from "./chunk-HH7666MQ.js";
9
+ } from "./chunk-XKWWXKP3.js";
9
10
  import {
10
- discoverContentAssetUrls
11
+ discoverContentAssetUrls,
12
+ normalizeAssetUrl
11
13
  } from "./chunk-2PNSVE5Y.js";
12
14
 
15
+ // src/lib/origin-url-rewrite.ts
16
/**
 * Apply origin-rewrite rules to a piece of text, one rule after another.
 *
 * - String rules replace every literal occurrence of `rule.match`; the
 *   replacement is inserted verbatim (no `$1`/`$&` expansion).
 * - RegExp rules are passed to `String.prototype.replace`, so they replace
 *   every match only when the pattern has the `g` flag, and the replacement
 *   may use `$`-patterns.
 *
 * @param {string} text - Text (HTML or a bare URL) to rewrite.
 * @param {{ rules?: Array<{ match: string | RegExp, replace?: string }> }} [config]
 *   Rewrite configuration; a missing config or rule list means "no rewrite".
 * @returns {string} The rewritten text, or `text` itself when it is empty or
 *   there is nothing to apply.
 */
function rewriteOriginUrlsInText(text, config) {
  const rules = config?.rules ?? [];
  if (!text || rules.length === 0) return text;
  let result = text;
  for (const rule of rules) {
    // A rule without a replacement is malformed. Skip it: join(undefined)
    // would insert commas and replace(re, undefined) would insert "undefined".
    if (rule.replace == null) continue;
    if (typeof rule.match === "string") {
      // An empty needle would split the text into single characters.
      if (!rule.match) continue;
      result = result.split(rule.match).join(rule.replace);
      continue;
    }
    result = result.replace(rule.match, rule.replace);
  }
  return result;
}
29
/**
 * Build a rewrite config that maps `<gatewayBase>/wp-content/` URLs onto
 * `<publicOrigin>/wp-content/`.
 *
 * Both inputs are trimmed and stripped of all trailing slashes, so
 * "https://host/prod", "https://host/prod/" and "https://host/prod//"
 * produce the same rule.
 *
 * @param {string} gatewayBase - Legacy gateway base URL to search for.
 * @param {string} publicOrigin - Public origin that replaces the gateway base.
 * @returns {{ rules: Array<{ match: string, replace: string }> }} A config
 *   with one literal rule, or no rules when the gateway base is empty.
 */
function createWpContentGatewayRewrite(gatewayBase, publicOrigin) {
  const stripTrailingSlashes = (value) => {
    const trimmed = value.trim();
    let end = trimmed.length;
    while (end > 0 && trimmed[end - 1] === "/") end -= 1;
    return trimmed.slice(0, end);
  };
  const normalizedGateway = stripTrailingSlashes(gatewayBase);
  const normalizedPublic = stripTrailingSlashes(publicOrigin);
  // An empty gateway would turn the needle into a bare "/wp-content/", which
  // also matches URLs that already point at the public origin.
  if (!normalizedGateway) return { rules: [] };
  return {
    rules: [
      {
        match: `${normalizedGateway}/wp-content/`,
        replace: `${normalizedPublic}/wp-content/`
      }
    ]
  };
}
41
+
13
42
  // src/parsers/wordpress/parse-wxr.ts
14
43
  import { readFile } from "fs/promises";
15
44
  import { basename } from "path";
16
45
  import { XMLParser } from "fast-xml-parser";
46
+
47
+ // src/parsers/wordpress/builders/flatten.ts
48
/**
 * Escape every regular-expression metacharacter in `value` so the result can
 * be embedded in a `RegExp` source and match the input literally.
 *
 * @param {string} value - Literal text.
 * @returns {string} `value` with each metacharacter prefixed by a backslash.
 */
function escapeRegExp(value) {
  const metaCharacters = /[.*+?^${}()|[\]\\]/g;
  return value.replace(metaCharacters, (character) => `\\${character}`);
}
51
/**
 * Read the quoted value of a shortcode attribute (`name="value"` or
 * `name='value'`) from a shortcode parameter string.
 *
 * The attribute name is matched case-insensitively at a word boundary.
 * Occurrences of `name=` that are not followed by a quote are skipped, so a
 * later quoted occurrence is still found. Inside the value a backslash makes
 * the next character literal; an unterminated value runs to the end of the
 * string.
 *
 * @param {string} params - Raw attribute text of a shortcode.
 * @param {string} name - Attribute name to look up.
 * @returns {string | undefined} The trimmed value of the first quoted
 *   occurrence, or `undefined` when there is none or the value is blank.
 */
function extractQuotedParam(params, name) {
  const pattern = new RegExp(`\\b${escapeRegExp(name)}\\s*=\\s*`, "gi");
  for (let match = pattern.exec(params); match !== null; match = pattern.exec(params)) {
    // The pattern's trailing \s* already consumed whitespace after "=".
    let index = match.index + match[0].length;
    const quote = params[index];
    if (quote !== '"' && quote !== "'") continue;
    index += 1;
    let value = "";
    while (index < params.length) {
      const char = params[index];
      if (char === "\\" && index + 1 < params.length) {
        // Backslash escape: take the following character literally.
        value += params[index + 1];
        index += 2;
        continue;
      }
      if (char === quote) break;
      value += char;
      index += 1;
    }
    const trimmed = value.trim();
    return trimmed || void 0;
  }
  return void 0;
}
75
/**
 * Return the first non-empty quoted attribute value found under any of the
 * candidate attribute names, trying the names in the order given.
 *
 * @param {string} params - Raw attribute text of a shortcode.
 * @param {Iterable<string>} names - Attribute names to try, in priority order.
 * @returns {string | undefined} The first value found, or `undefined`.
 */
function extractShortcodeParam(params, names) {
  let found;
  for (const candidate of names) {
    found = extractQuotedParam(params, candidate);
    if (found) break;
  }
  return found || void 0;
}
82
/**
 * Escape `&`, `<`, `>` and `"` as HTML entities so plain text can be placed
 * inside an element. The ampersand is handled first so entities produced by
 * the later steps are not escaped twice.
 *
 * @param {string} text - Plain text.
 * @returns {string} HTML-escaped text.
 */
function escapeHtmlText(text) {
  const substitutions = [
    [/&/g, "&amp;"],
    [/</g, "&lt;"],
    [/>/g, "&gt;"],
    [/"/g, "&quot;"]
  ];
  let escaped = text;
  for (const [pattern, entity] of substitutions) {
    escaped = escaped.replace(pattern, entity);
  }
  return escaped;
}
85
/**
 * Convert plain text into HTML blocks wrapped in `tag`.
 *
 * Runs of two or more newlines separate blocks; a single newline inside a
 * block becomes `<br />`. CRLF and lone CR line endings are normalised to LF
 * first so text with Windows or old-Mac line endings splits the same way.
 * Block text is HTML-escaped; `tag` is inserted as given.
 *
 * @param {string} text - Plain text to convert.
 * @param {string} tag - Element name used to wrap each block (e.g. "p").
 * @returns {string} The blocks joined by newlines, or "" when `text` has no
 *   non-blank content.
 */
function textToHtml(text, tag) {
  const normalized = text.replace(/\r\n?/g, "\n");
  const paragraphs = normalized.split(/\n{2,}/).map((part) => part.trim()).filter(Boolean);
  if (paragraphs.length === 0) return "";
  return paragraphs.map((paragraph) => {
    const inner = escapeHtmlText(paragraph).replace(/\n/g, "<br />");
    return `<${tag}>${inner}</${tag}>`;
  }).join("\n");
}
93
/**
 * Render a minimal media element for `url`.
 *
 * The URL goes through `normalizeAssetUrl` (falling back to the raw URL when
 * that yields null/undefined), then `&`, `"` and `<` are escaped for use in
 * a double-quoted attribute.
 *
 * @param {string} tag - One of "img", "video" or "iframe".
 * @param {string} url - Asset URL.
 * @returns {string | undefined} The element markup, or `undefined` for any
 *   other tag.
 */
function emitHtmlTag(tag, url) {
  const resolvedUrl = normalizeAssetUrl(url) ?? url;
  const attributeValue = resolvedUrl
    .replace(/&/g, "&amp;")
    .replace(/"/g, "&quot;")
    .replace(/</g, "&lt;");
  if (tag === "img") {
    return `<img src="${attributeValue}" alt="" />`;
  }
  if (tag === "video") {
    return `<video src="${attributeValue}" controls></video>`;
  }
  if (tag === "iframe") {
    return `<iframe src="${attributeValue}" loading="lazy"></iframe>`;
  }
  return void 0;
}
105
/**
 * Replace media shortcodes that carry a URL attribute with the matching HTML
 * element.
 *
 * Matches `[prefix ...]` optionally followed by whitespace and a
 * `[/prefix ...]` closer. A shortcode is left untouched when none of
 * `rule.urlParams` yields a URL, or when `emitHtmlTag` produces no markup for
 * `rule.tag`.
 *
 * @param {string} content - Post content.
 * @param {{ shortcodePrefix: string, urlParams: string[], tag: string }} rule
 * @returns {string} Content with convertible shortcodes replaced.
 */
function convertUrlRule(content, rule) {
  const prefix = escapeRegExp(rule.shortcodePrefix);
  const pattern = new RegExp(
    `\\[${prefix}\\b([^\\]]*)\\]\\s*(?:\\[\\/${prefix}\\b[^\\]]*\\])?`,
    "gi"
  );
  return content.replace(pattern, (block, params) => {
    const url = extractShortcodeParam(params, rule.urlParams);
    if (!url) return block;
    // A replacer that returns undefined makes String.prototype.replace insert
    // the text "undefined"; keep the original shortcode instead.
    return emitHtmlTag(rule.tag, url) ?? block;
  });
}
117
/**
 * Replace text-carrying shortcodes with HTML built from their attributes.
 *
 * For each matched `[prefix ...]` (optionally followed by whitespace and a
 * `[/prefix ...]` closer), every entry of `rule.fields` whose attribute has a
 * value is rendered with `textToHtml` using the field's tag. The rendered
 * pieces are joined by newlines. A shortcode that renders nothing is left as
 * it was.
 *
 * @param {string} content - Post content.
 * @param {{ shortcodePrefix: string, fields: Array<{ param: string, tag: string }> }} rule
 * @returns {string} Content with convertible shortcodes replaced.
 */
function convertTextRule(content, rule) {
  const escapedPrefix = escapeRegExp(rule.shortcodePrefix);
  const shortcodePattern = new RegExp(
    `\\[${escapedPrefix}\\b([^\\]]*)\\]\\s*(?:\\[\\/${escapedPrefix}\\b[^\\]]*\\])?`,
    "gis"
  );
  const renderFields = (params) => {
    const rendered = [];
    for (const { param, tag } of rule.fields) {
      const value = extractQuotedParam(params, param);
      const markup = value ? textToHtml(value, tag) : "";
      if (markup) rendered.push(markup);
    }
    return rendered;
  };
  return content.replace(shortcodePattern, (original, params) => {
    const rendered = renderFields(params);
    return rendered.length === 0 ? original : rendered.join("\n");
  });
}
134
/**
 * Unwrap paired wrapper shortcodes, keeping their inner content.
 *
 * Each `[prefix ...]inner[/prefix ...]` pair (the closer is required; the
 * inner match is the shortest possible) is replaced by the trimmed inner
 * content. When `rule.urlParams` is non-empty and one of those attributes
 * holds a URL, an `<img>` for it is placed on the line before the inner
 * content.
 *
 * @param {string} content - Post content.
 * @param {{ shortcodePrefix: string, urlParams?: string[] }} rule
 * @returns {string} Content with wrapper shortcodes unwrapped.
 */
function convertWrapperRule(content, rule) {
  const escapedPrefix = escapeRegExp(rule.shortcodePrefix);
  const wrapperPattern = new RegExp(
    `\\[${escapedPrefix}\\b([^\\]]*)\\]([\\s\\S]*?)\\[\\/${escapedPrefix}\\b[^\\]]*\\]`,
    "gi"
  );
  return content.replace(wrapperPattern, (_match, params, inner) => {
    const imageUrl = rule.urlParams?.length
      ? extractShortcodeParam(params, rule.urlParams)
      : void 0;
    const imageMarkup = imageUrl ? emitHtmlTag("img", imageUrl) : void 0;
    const body = inner.trim();
    return [imageMarkup, body].filter(Boolean).join("\n");
  });
}
150
/**
 * Replace every `[prefix ...]` shortcode (plus any whitespace and
 * `[/prefix ...]` closer that follows it) with the rule's placeholder markup.
 *
 * `rule.html` is handed to `String.prototype.replace` as the replacement
 * argument, so `$`-sequences in it follow that method's replacement rules.
 *
 * @param {string} content - Post content.
 * @param {{ shortcodePrefix: string, html: string }} rule
 * @returns {string} Content with the shortcodes replaced.
 */
function convertPlaceholderRule(content, rule) {
  const escapedPrefix = escapeRegExp(rule.shortcodePrefix);
  const source = `\\[${escapedPrefix}\\b([^\\]]*)\\]\\s*(?:\\[\\/${escapedPrefix}\\b[^\\]]*\\])?`;
  const shortcodePattern = new RegExp(source, "gi");
  return content.replace(shortcodePattern, rule.html);
}
158
/**
 * Delete every opening and closing shortcode whose name starts with `prefix`,
 * leaving the content between them in place.
 *
 * @param {string} content - Post content.
 * @param {string} prefix - Shortcode name prefix (matched case-insensitively).
 * @returns {string} Content without those shortcode tags.
 */
function stripScaffoldingPrefix(content, prefix) {
  const escapedPrefix = escapeRegExp(prefix);
  // Openers are removed first, then closers.
  const tagPatterns = [
    new RegExp(`\\[${escapedPrefix}[a-z0-9_-]*[^\\]]*\\]`, "gi"),
    new RegExp(`\\[\\/${escapedPrefix}[a-z0-9_-]*[^\\]]*\\]`, "gi")
  ];
  return tagPatterns.reduce((text, tagPattern) => text.replace(tagPattern, ""), content);
}
164
/**
 * Delete the opening and closing shortcode tags for each exact token name,
 * keeping whatever content sits between them.
 *
 * A token only matches when the name ends at a word boundary, so "one_half"
 * does not remove "[one_half_last]".
 *
 * @param {string} content - Post content.
 * @param {Iterable<string>} tokens - Shortcode names to strip.
 * @returns {string} Content without those shortcode tags.
 */
function stripLegacyTokens(content, tokens) {
  const removeToken = (text, token) => {
    const source = escapeRegExp(token);
    const withoutOpeners = text.replace(new RegExp(`\\[${source}\\b[^\\]]*\\]`, "gi"), "");
    return withoutOpeners.replace(new RegExp(`\\[\\/${source}\\b[^\\]]*\\]`, "gi"), "");
  };
  let stripped = content;
  for (const token of tokens) {
    stripped = removeToken(stripped, token);
  }
  return stripped;
}
174
/**
 * Return the registry entries whose `detect` pattern matches the content.
 *
 * `RegExp.prototype.test` is stateful for patterns with the `g` or `y` flag:
 * it starts at `lastIndex` and advances it. Such detectors are reset before
 * and after each test so the result depends only on `content`, not on
 * earlier calls.
 *
 * @param {string} content - Post content to inspect.
 * @param {Array<{ detect: RegExp }>} registry - Theme/builder definitions.
 * @returns {Array<{ detect: RegExp }>} Matching entries, in registry order.
 */
function detectThemes(content, registry) {
  return registry.filter((theme) => {
    const { detect } = theme;
    const isStateful = Boolean(detect.global || detect.sticky);
    if (isStateful) detect.lastIndex = 0;
    const matched = detect.test(content);
    if (isStateful) detect.lastIndex = 0;
    return matched;
  });
}
177
/**
 * Flatten page-builder shortcodes in WordPress content into plain HTML.
 *
 * Themes are detected with each registry entry's `detect` pattern. For every
 * detected theme the rule groups run in a fixed order: wrapper rules, text
 * rules, URL rules, placeholder rules, scaffolding-prefix stripping, then
 * legacy-token stripping. Finally runs of three or more newlines collapse to
 * a blank line and the result is trimmed.
 *
 * Content that is blank, or that matches no theme, is returned unchanged.
 * Non-string content (null, undefined, or another type) is never run through
 * the rules; it yields "" or its string form instead of throwing.
 *
 * @param {string} content - Raw post content.
 * @param {{ registry?: Array<object> }} [options] - `registry` overrides the
 *   default `WORDPRESS_BUILDER_REGISTRY`.
 * @returns {{ html: string, detectedThemes: Array<unknown> }} The flattened
 *   HTML and the `id` of every detected theme.
 */
function flattenWordPressBuilders(content, options = {}) {
  if (typeof content !== "string") {
    return { html: content == null ? "" : String(content), detectedThemes: [] };
  }
  if (!content.trim()) {
    return { html: content, detectedThemes: [] };
  }
  const registry = options?.registry ?? WORDPRESS_BUILDER_REGISTRY;
  const themes = detectThemes(content, registry);
  if (themes.length === 0) {
    return { html: content, detectedThemes: [] };
  }
  let html = content;
  for (const theme of themes) {
    // Order matters: wrappers are unwrapped before their inner shortcodes are
    // converted, and scaffolding is stripped only after every conversion.
    for (const rule of theme.wrapperRules ?? []) {
      html = convertWrapperRule(html, rule);
    }
    for (const rule of theme.textRules ?? []) {
      html = convertTextRule(html, rule);
    }
    for (const rule of theme.urlRules ?? []) {
      html = convertUrlRule(html, rule);
    }
    for (const rule of theme.placeholderRules ?? []) {
      html = convertPlaceholderRule(html, rule);
    }
    for (const prefix of theme.scaffoldingPrefixes ?? []) {
      html = stripScaffoldingPrefix(html, prefix);
    }
    if (theme.legacyScaffoldingTokens?.length) {
      html = stripLegacyTokens(html, theme.legacyScaffoldingTokens);
    }
  }
  html = html.replace(/\n{3,}/g, "\n\n").trim();
  return {
    html,
    detectedThemes: themes.map((theme) => theme.id)
  };
}
213
+
214
+ // src/parsers/wordpress/parse-wxr.ts
17
215
  var PLATFORM = "wordpress";
216
// Page slugs that are always treated as WooCommerce stub pages.
var WOOCOMMERCE_STUB_PAGE_SLUGS = /* @__PURE__ */ new Set(["cart", "checkout", "my-account"]);
// Content consisting solely of one WooCommerce cart/checkout/account shortcode.
var WOOCOMMERCE_STUB_SHORTCODE = /^\[woocommerce_(?:cart|checkout|my_account)\]\s*$/i;
/**
 * Decide whether a page is a WooCommerce stub page.
 *
 * A page qualifies when its slug is one of the known stub slugs, or when its
 * trimmed content is nothing but a single WooCommerce cart, checkout or
 * my-account shortcode.
 *
 * @param {string} slug - Page slug.
 * @param {string} contentHtml - Page content.
 * @returns {boolean} True for a stub page.
 */
function isWooCommerceStubPage(slug, contentHtml) {
  const hasStubSlug = WOOCOMMERCE_STUB_PAGE_SLUGS.has(slug);
  if (hasStubSlug) {
    return true;
  }
  const body = contentHtml.trim();
  return body !== "" && WOOCOMMERCE_STUB_SHORTCODE.test(body);
}
18
224
  function asArray(value) {
19
225
  if (value === void 0) return [];
20
226
  return Array.isArray(value) ? value : [value];
@@ -80,13 +286,16 @@ function parseItems(xml) {
80
286
  const doc = parser.parse(xml);
81
287
  return asArray(doc.rss?.channel?.item);
82
288
  }
83
- function buildAttachmentIndex(items) {
289
+ function buildAttachmentIndex(items, originUrlRewrite) {
84
290
  const index = /* @__PURE__ */ new Map();
85
291
  for (const item of items) {
86
292
  if (textValue(item.post_type) !== "attachment") continue;
87
293
  const id = textValue(item.post_id);
88
- const url = textValue(item.attachment_url) || textValue(item.link);
294
+ let url = textValue(item.attachment_url) || textValue(item.link);
89
295
  if (!id || !url) continue;
296
+ if (originUrlRewrite) {
297
+ url = rewriteOriginUrlsInText(url, originUrlRewrite);
298
+ }
90
299
  const filename = basename(new URL(url, "http://local.invalid").pathname) || `attachment-${id}`;
91
300
  index.set(id, {
92
301
  sourceUrl: url,
@@ -171,10 +380,32 @@ function collectInlineAssets(html, attachmentIndex, seenUrls, exportedAt) {
171
380
  }
172
381
  return assets;
173
382
  }
383
/**
 * Prepare raw post content before entities are built from it.
 *
 * Origin URLs are rewritten first (only when `options.originUrlRewrite` is
 * set). Builder shortcodes are then flattened unless
 * `options.flattenBuilders` is exactly `false`.
 *
 * @param {string} rawHtml - Content as read from the export.
 * @param {{ originUrlRewrite?: object, flattenBuilders?: boolean }} options
 * @returns {string} The preprocessed content.
 */
function preprocessContent(rawHtml, options) {
  const rewritten = options.originUrlRewrite
    ? rewriteOriginUrlsInText(rawHtml, options.originUrlRewrite)
    : rawHtml;
  if (options.flattenBuilders === false) {
    return rewritten;
  }
  return flattenWordPressBuilders(rewritten).html;
}
393
/**
 * Choose the source id of a post's featured asset.
 *
 * The thumbnail id wins when it is present in the attachment index.
 * Otherwise the first asset URL discovered in the content is used, as
 * `url:<url>`.
 *
 * @param {string | undefined} thumbnailId - Thumbnail attachment id, if any.
 * @param {{ has(id: string): boolean }} attachmentIndex - Known attachments.
 * @param {string} contentHtml - Post content to scan for inline assets.
 * @returns {string | undefined} The source id, or `undefined` when neither
 *   source yields one.
 */
function resolveFeaturedAssetSourceId(thumbnailId, attachmentIndex, contentHtml) {
  const hasIndexedThumbnail = Boolean(thumbnailId) && attachmentIndex.has(thumbnailId);
  if (hasIndexedThumbnail) {
    return thumbnailId;
  }
  const inlineUrls = discoverContentAssetUrls(contentHtml);
  const firstInline = inlineUrls[0];
  if (!firstInline) {
    return void 0;
  }
  return `url:${firstInline}`;
}
400
/**
 * Rewrite a single URL when a rewrite config is available.
 *
 * @param {string | undefined} url - URL to rewrite.
 * @param {object | undefined} config - Origin rewrite config, if any.
 * @returns {string | undefined} `undefined` for an empty/missing URL, the URL
 *   unchanged when there is no config, otherwise the rewritten URL.
 */
function maybeRewriteUrl(url, config) {
  if (url) {
    return config ? rewriteOriginUrlsInText(url, config) : url;
  }
  return void 0;
}
174
405
  async function* enumerateWxrEntities(options) {
175
406
  const xml = await readFile(options.filePath, "utf8");
176
407
  const items = parseItems(xml);
177
- const attachmentIndex = buildAttachmentIndex(items);
408
+ const attachmentIndex = buildAttachmentIndex(items, options.originUrlRewrite);
178
409
  const { categories, tags } = collectTaxonomies(items);
179
410
  const seenAssetUrls = /* @__PURE__ */ new Set();
180
411
  const emittedAttachmentIds = /* @__PURE__ */ new Set();
@@ -201,11 +432,14 @@ async function* enumerateWxrEntities(options) {
201
432
  const postType = textValue(item.post_type);
202
433
  if (postType !== "post" && postType !== "page") continue;
203
434
  const id = textValue(item.post_id);
204
- const link = textValue(item.link);
435
+ const link = maybeRewriteUrl(textValue(item.link), options.originUrlRewrite);
205
436
  const slug = sanitizeSlug(textValue(item.post_name) || textValue(item.title) || id);
206
- const rawHtml = getContentEncoded(item);
437
+ const contentHtml = preprocessContent(getContentEncoded(item), options);
438
+ if (postType === "page" && options.skipWooCommerceStubPages !== false && isWooCommerceStubPage(slug, contentHtml)) {
439
+ continue;
440
+ }
207
441
  for (const asset of collectInlineAssets(
208
- rawHtml,
442
+ contentHtml,
209
443
  attachmentIndex,
210
444
  seenAssetUrls,
211
445
  options.exportedAt
@@ -223,10 +457,11 @@ async function* enumerateWxrEntities(options) {
223
457
  }
224
458
  if (postType === "post") {
225
459
  const thumbnailId = getPostMeta(item, "_thumbnail_id");
226
- let featuredAssetSourceId;
227
- if (thumbnailId && attachmentIndex.has(thumbnailId)) {
228
- featuredAssetSourceId = thumbnailId;
229
- }
460
+ const featuredAssetSourceId = resolveFeaturedAssetSourceId(
461
+ thumbnailId,
462
+ attachmentIndex,
463
+ contentHtml
464
+ );
230
465
  const post = {
231
466
  type: "post",
232
467
  source: sourceMeta(id, link, options.exportedAt),
@@ -234,7 +469,7 @@ async function* enumerateWxrEntities(options) {
234
469
  title: textValue(item.title) || slug,
235
470
  slug,
236
471
  excerpt: getExcerpt(item) || void 0,
237
- contentHtml: rawHtml,
472
+ contentHtml,
238
473
  publishedAt: textValue(item.post_date) || void 0,
239
474
  status: mapPublishStatus(textValue(item.status)),
240
475
  categorySlugs: categorySlugs.length ? categorySlugs : void 0,
@@ -251,7 +486,7 @@ async function* enumerateWxrEntities(options) {
251
486
  sourceId: id,
252
487
  title: textValue(item.title) || slug,
253
488
  slug,
254
- contentHtml: rawHtml,
489
+ contentHtml,
255
490
  isHomePage: isHomePage || void 0,
256
491
  status: mapPublishStatus(textValue(item.status))
257
492
  };
@@ -291,18 +526,28 @@ async function validateWxrFile(filePath) {
291
526
  }
292
527
 
293
528
  // src/parsers/wordpress/index.ts
294
- function resolvePath(input) {
295
- if (typeof input === "string") return input;
529
+ function resolveWxrOptions(input) {
530
+ if (typeof input === "string") {
531
+ return { filePath: input };
532
+ }
296
533
  if (input && typeof input === "object" && "path" in input) {
297
- return String(input.path);
534
+ const obj = input;
535
+ return {
536
+ filePath: String(obj.path),
537
+ originUrlRewrite: obj.originUrlRewrite,
538
+ flattenBuilders: obj.flattenBuilders,
539
+ skipWooCommerceStubPages: obj.skipWooCommerceStubPages
540
+ };
298
541
  }
299
- throw new Error("WordPress adapter requires input path (string or { path })");
542
+ throw new Error(
543
+ "WordPress adapter requires input path (string or { path, originUrlRewrite?, flattenBuilders?, skipWooCommerceStubPages? })"
544
+ );
300
545
  }
301
546
  var wordpressAdapter = {
302
547
  platform: "wordpress",
303
548
  async validateInput(input) {
304
- const path = resolvePath(input);
305
- const result = await validateWxrFile(path);
549
+ const { filePath } = resolveWxrOptions(input);
550
+ const result = await validateWxrFile(filePath);
306
551
  return {
307
552
  ok: result.ok,
308
553
  issues: result.issues,
@@ -310,8 +555,7 @@ var wordpressAdapter = {
310
555
  };
311
556
  },
312
557
  enumerateEntities(ctx) {
313
- const path = resolvePath(ctx.input);
314
- return enumerateWxrEntities({ filePath: path });
558
+ return enumerateWxrEntities(resolveWxrOptions(ctx.input));
315
559
  }
316
560
  };
317
561
 
@@ -2272,6 +2516,8 @@ function getAdapter(platform) {
2272
2516
  }
2273
2517
 
2274
2518
  export {
2519
+ rewriteOriginUrlsInText,
2520
+ createWpContentGatewayRewrite,
2275
2521
  wordpressAdapter,
2276
2522
  SMUGMUG_API_BASE,
2277
2523
  SMUGMUG_OAUTH_ENDPOINTS,
@@ -2287,4 +2533,4 @@ export {
2287
2533
  wixAdapter,
2288
2534
  getAdapter
2289
2535
  };
2290
- //# sourceMappingURL=chunk-VXEHAQKK.js.map
2536
+ //# sourceMappingURL=chunk-Q6M5JEL3.js.map