@nexpress/wp-import 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1155 @@
1
+ import { NpAuthUser } from '@nexpress/core';
2
+ import { Buffer as Buffer$1 } from 'node:buffer';
3
+
4
+ /**
5
+ * Phase 21.2 — Intermediate Record (IR) types.
6
+ *
7
+ * The IR is the seam between the WXR parser and the importer. The
8
+ * design doc (`docs/design/wordpress-import-design.md` §4) frames why this
9
+ * lives between parse and apply: keeps the parser free of NexPress
10
+ * concerns and lets future adapters (Ghost, Drupal, generic JSON)
11
+ * plug into the same applier.
12
+ *
13
+ * Field names mirror the WXR XML tag names where it doesn't hurt
14
+ * readability (`wpId` for `<wp:post_id>`, `wpType` for
15
+ * `<wp:post_type>`) so reviewers cross-checking against a real
16
+ * export can find the correspondence quickly.
17
+ */
18
+ type WpPostStatus = "publish" | "draft" | "private" | "pending" | "trash" | "auto-draft";
19
+ /** A category, post tag, or any custom WP taxonomy term attached to a post. */
20
+ interface WpTerm {
21
+ /** "category", "post_tag", or any custom taxonomy slug. */
22
+ taxonomy: string;
23
+ slug: string;
24
+ name: string;
25
+ }
26
+ /**
27
+ * A media reference parsed out of a post. Resolved later by the
28
+ * media pipeline (Phase 21.5) into actual np_media ids.
29
+ */
30
+ interface WpMediaRef {
31
+ /** Source URL on the WP site (e.g. `https://site.com/wp-content/uploads/.../foo.jpg`). */
32
+ sourceUrl: string;
33
+ /** WP attachment id when the reference points at one we know about. */
34
+ wpAttachmentId: number | null;
35
+ /**
36
+ * Where this reference came from in the original document — drives
37
+ * how the applier wires the result. `featured` lands on the post's
38
+ * `coverImage` field; `inline` rewrites the body content.
39
+ */
40
+ kind: "featured" | "inline";
41
+ }
42
+ interface WpComment {
43
+ wpId: number;
44
+ /** Parent comment id when the comment is a reply, else null. */
45
+ parentWpId: number | null;
46
+ authorName: string;
47
+ authorEmail: string | null;
48
+ authorUrl: string | null;
49
+ /** ISO timestamp from <wp:comment_date_gmt>. */
50
+ date: string;
51
+ /** Comment body — usually plain text but can contain HTML. */
52
+ content: string;
53
+ /** Maps from <wp:comment_approved> ("1" → true, anything else → false). */
54
+ approved: boolean;
55
+ }
56
+ /**
57
+ * One post / page / custom post type record in the WXR. The applier
58
+ * walks an array of these to write content into NexPress collections.
59
+ */
60
+ interface WpImportRecord {
61
+ /** Numeric id from <wp:post_id>. Stable across re-exports of the same WP site. */
62
+ wpId: number;
63
+ /** "post" | "page" | custom post type slug. Drives applier collection routing. */
64
+ wpType: string;
65
+ status: WpPostStatus;
66
+ slug: string;
67
+ title: string;
68
+ /** From <excerpt:encoded>. Null when WP didn't write one. */
69
+ excerpt: string | null;
70
+ /**
71
+ * Raw HTML / Gutenberg content from <content:encoded>. Phase 21.4
72
+ * runs this through the HTML→Lexical converter; this PR keeps it
73
+ * as the unmodified string so parser tests stay deterministic.
74
+ */
75
+ rawContent: string;
76
+ /** WP author login from <dc:creator> (resolved against parsed authors). */
77
+ wpAuthorLogin: string;
78
+ /** ISO timestamp from <wp:post_date_gmt>. */
79
+ publishedAt: string;
80
+ /** ISO timestamp from <wp:post_modified_gmt>. */
81
+ updatedAt: string;
82
+ terms: WpTerm[];
83
+ /** Resolved <wp:postmeta> entries. Keys preserved verbatim. */
84
+ meta: Record<string, string>;
85
+ mediaRefs: WpMediaRef[];
86
+ comments: WpComment[];
87
+ }
88
+ interface WpAuthor {
89
+ wpId: number;
90
+ /** WP login slug — e.g. "alice". */
91
+ login: string;
92
+ email: string;
93
+ displayName: string;
94
+ /** Free-form bio / description from <wp:author_description>. */
95
+ description: string | null;
96
+ }
97
+ /** Site-level metadata harvested from the <channel> envelope. */
98
+ interface WpSiteInfo {
99
+ /** <title>. */
100
+ title: string;
101
+ /** <link>. */
102
+ link: string;
103
+ /** <description>. */
104
+ description: string;
105
+ /** <wp:base_site_url>. */
106
+ baseSiteUrl: string;
107
+ /** <wp:base_blog_url>. Often the same as baseSiteUrl on single-site WP. */
108
+ baseBlogUrl: string;
109
+ /** <language>, e.g. "en-US". */
110
+ language: string | null;
111
+ }
112
+ /**
113
+ * The full output of `parseWxr()`. Captures everything an applier
114
+ * needs in one in-memory shape.
115
+ */
116
+ interface WpImportBundle {
117
+ site: WpSiteInfo;
118
+ authors: WpAuthor[];
119
+ /**
120
+ * All records in document order. Includes `attachment` post types
121
+ * (used to resolve media refs) — the applier filters those out
122
+ * after the media pipeline runs.
123
+ */
124
+ records: WpImportRecord[];
125
+ /**
126
+ * Standalone <wp:category> / <wp:tag> / <wp:term> entries from the
127
+ * channel envelope. Most WP exports duplicate these into per-post
128
+ * <category> elements too; the applier de-dupes.
129
+ */
130
+ terms: WpTerm[];
131
+ }
132
+
133
+ /**
134
+ * Phase 21.2 — WXR parser. Reads a WordPress eXtended RSS export
135
+ * (the file produced by Tools → Export in wp-admin) and produces
136
+ * an in-memory bundle of typed Intermediate Records.
137
+ *
138
+ * The parser is deliberately tolerant:
139
+ *
140
+ * - Missing namespace-prefixed tags (`wp:`, `dc:`, `content:`,
141
+ * `excerpt:`) fall back to sensible defaults rather than
142
+ * throwing, because real-world WXR files vary in how strictly
143
+ * they're written by WP plugins.
144
+ * - <category> elements appear in two shapes (channel-level
145
+ * <wp:category> and per-post <category>); we handle both.
146
+ * - The CDATA wrapping around <content:encoded> is unwrapped
147
+ * transparently by `fast-xml-parser`.
148
+ *
149
+ * NOT in this PR: streaming parse for huge exports (Phase 21.16),
150
+ * media downloading (21.5), HTML-to-Lexical conversion (21.4).
151
+ */
152
+ declare function parseWxr(xml: string): WpImportBundle;
153
+
154
+ /**
155
+ * Phase 21.16 — streaming WXR parse for huge exports.
156
+ *
157
+ * The eager `parseWxr(string)` path stays as the canonical small-
158
+ * export tool: it loads the whole file into memory which is fine
159
+ * up to a few hundred megabytes. For 5 GB+ WXR exports (large WP
160
+ * sites with embedded media), eager parsing OOMs the process, so
161
+ * this module provides an async-iterable equivalent that:
162
+ *
163
+ * 1. Reads the file in chunks via `createReadStream`.
164
+ * 2. Loads the channel header (site / authors / channel-level
165
+ * terms) once it's fully buffered. The header is small —
166
+ * WP writes it before the first `<item>` and it's bounded by
167
+ * the author count, not the post count.
168
+ * 3. Scans the rolling buffer for `<item>...</item>` slices and
169
+ * hands each to the existing `parseWxr` machinery wrapped in
170
+ * a tiny `<channel>` envelope so the parser still recognises
171
+ * the structure.
172
+ * 4. Yields one `WpImportRecord` per item; peak memory stays
173
+ * bounded by the largest single item rather than the whole
174
+ * file.
175
+ *
176
+ * Limitations:
177
+ *
178
+ * - Items larger than the buffer threshold (default 64 MB) are
179
+ * surfaced as an explicit error rather than silently growing
180
+ * the buffer; that's the early-warning signal that a record's
181
+ * `<content:encoded>` is degenerate (a 60 MB embedded base64
182
+ * image, for example) and needs the operator to intervene.
183
+ * - This is a structural streamer only — the applier still
184
+ * consumes the records eagerly today. The Phase 21.16 cut
185
+ * gives operators the primitive; a future sub-phase wires it
186
+ * end-to-end through `applyBundle` so peak memory stays low
187
+ * for the full pipeline too.
188
+ */
189
+ interface WpImportStreamHeader {
190
+ site: WpSiteInfo;
191
+ authors: WpAuthor[];
192
+ /** Channel-level <wp:category> / <wp:tag> / <wp:term> entries. */
193
+ terms: WpTerm[];
194
+ }
195
+ interface WpImportStream {
196
+ header: WpImportStreamHeader;
197
+ items: AsyncIterable<WpImportRecord>;
198
+ }
199
+ interface WpImportStreamOptions {
200
+ /** Read-stream chunk size in bytes. Default 64 KB. */
201
+ highWaterMark?: number;
202
+ /**
203
+ * Hard cap on the rolling buffer between item boundaries — when
204
+ * a single `<item>...</item>` slice exceeds this we abort
205
+ * rather than growing memory unbounded. Default 64 MB.
206
+ */
207
+ maxItemBytes?: number;
208
+ }
209
+ declare class WpImportStreamError extends Error {
210
+ constructor(message: string);
211
+ }
212
+ /**
213
+ * Open a streaming reader over a WXR file. Resolves with the
214
+ * channel header + an async iterator of items.
215
+ */
216
+ declare function parseWxrStream(path: string, options?: WpImportStreamOptions): Promise<WpImportStream>;
217
+
218
+ /**
219
+ * Phase 21.4 — HTML → Lexical AST.
220
+ *
221
+ * Resolves design doc §11.1: roll our own minimal converter rather
222
+ * than wrap a heavier ecosystem library. Rationale:
223
+ *
224
+ * - The set of WP HTML constructs we care about is small —
225
+ * paragraphs, headings, lists, blockquote, basic inline
226
+ * formatting, links, images, line breaks, hr, code.
227
+ * - `node-html-parser` gives us a lenient HTML5 parse with no
228
+ * DOM dependency (works in Node and edge runtimes).
229
+ * - The Lexical node shape is well-documented in the framework's
230
+ * own renderer (`packages/editor/src/render-rich-text.tsx`),
231
+ * so we emit exactly what the renderer expects.
232
+ *
233
+ * NOT in this PR:
234
+ * - Gutenberg block-comment syntax (`<!-- wp:paragraph -->`).
235
+ * Treat the comments as text noise; Phase 21.4b can layer
236
+ * block awareness on top once we have a real WP fixture to
237
+ * validate against.
238
+ * - Image media-id resolution. <img> nodes are emitted with
239
+ * the source URL as `src`; Phase 21.5 swaps these to
240
+ * NexPress media ids after the upload pipeline runs.
241
+ * - Custom shortcodes (`[gallery]`, etc.). Out of scope per
242
+ * design doc §2 — handled by the long-tail conversion pass.
243
+ */
244
+ interface LexicalRoot {
245
+ root: {
246
+ type: "root";
247
+ direction: null;
248
+ format: "";
249
+ indent: 0;
250
+ version: 1;
251
+ children: LexicalBlock[];
252
+ };
253
+ }
254
+ interface LexicalBlock {
255
+ type: string;
256
+ version: 1;
257
+ format: "" | number;
258
+ indent: number;
259
+ direction: null;
260
+ children?: LexicalNode[];
261
+ tag?: string;
262
+ listType?: "bullet" | "number";
263
+ url?: string;
264
+ src?: string;
265
+ altText?: string;
266
+ text?: string;
267
+ }
268
+ type LexicalNode = LexicalBlock;
269
+ /**
270
+ * Convert raw WP content HTML into a Lexical root document.
271
+ * Always returns a valid Lexical structure — empty input becomes a
272
+ * single empty paragraph, matching what the editor would produce
273
+ * for a freshly-created field.
274
+ */
275
+ declare function htmlToLexical(html: string): LexicalRoot;
276
+
277
+ /**
278
+ * Phase 21.4 — index attachment records by their WP id so the
279
+ * applier can resolve `_thumbnail_id` references and inline
280
+ * `wp-image-N` ids to source URLs.
281
+ *
282
+ * Phase 21.5 will replace these source URLs with NexPress media
283
+ * ids after the actual download/upload pipeline runs. For now
284
+ * the applier just preserves the URL on the document so the
285
+ * post body and featured-image references aren't lost.
286
+ */
287
+ interface AttachmentEntry {
288
+ wpAttachmentId: number;
289
+ sourceUrl: string;
290
+ /** WP attachment post-meta — `_wp_attached_file`, dimensions, etc. */
291
+ meta: Record<string, string>;
292
+ /** Original attachment record's title (filename in most exports). */
293
+ title: string;
294
+ }
295
+ interface AttachmentIndex {
296
+ /** Look up by numeric WP attachment id. */
297
+ byId: ReadonlyMap<number, AttachmentEntry>;
298
+ /** Look up by source URL — useful for inline img refs that didn't carry an id. */
299
+ byUrl: ReadonlyMap<string, AttachmentEntry>;
300
+ }
301
+ declare function buildAttachmentIndex(bundle: WpImportBundle): AttachmentIndex;
302
+
303
+ /**
304
+ * Phase 21.5 — orchestrates download + upload of every media URL the
305
+ * import touches.
306
+ *
307
+ * Inputs are wired through `MediaPipelineDeps` rather than imported
308
+ * directly so the unit tests don't need a DB or a network. The CLI
309
+ * shim under `apps/web/scripts/wp-import.ts` plugs in the real
310
+ * `uploadMedia` from `@nexpress/core`; tests pass an in-memory stub.
311
+ *
312
+ * Resolution map:
313
+ *
314
+ * - `byUrl` — every successfully uploaded source URL → NexPress
315
+ * media id. Used to rewrite inline `<img>` src in Lexical.
316
+ * - `byAttachmentId` — WP attachment id → media id. Used by the
317
+ * applier for `_thumbnail_id` featured-image lookup.
318
+ *
319
+ * Phase 21.13 layered two prod-grade concerns on top:
320
+ *
321
+ * - **Per-host concurrency cap.** Each unique URL host gets a
322
+ * small fixed-size queue (default 4) so a 50-image post on the
323
+ * same wp-content domain doesn't open 50 sockets in parallel.
324
+ * URLs from different hosts run independently.
325
+ * - **Cross-run hash dedup.** When the caller supplies
326
+ * `findExistingByHash`, the pipeline computes the SHA-256 of
327
+ * the downloaded bytes and looks the row up by hash before
328
+ * uploading. Re-running the importer against the same WXR
329
+ * therefore reuses existing `np_media` rows instead of
330
+ * producing byte-identical duplicates.
331
+ */
332
+ interface MediaUploadInput {
333
+ buffer: Buffer;
334
+ originalFilename: string;
335
+ mimeType: string;
336
+ }
337
+ interface MediaPipelineDeps {
338
+ /** Fetch the bytes for a source URL. Defaults to `downloadMedia`. */
339
+ download?: (url: string) => Promise<{
340
+ buffer: Buffer;
341
+ mimeType: string;
342
+ filename: string;
343
+ }>;
344
+ /**
345
+ * Push a downloaded blob through the framework's media service.
346
+ * Returns the new media row id.
347
+ */
348
+ upload: (input: MediaUploadInput) => Promise<{
349
+ id: string;
350
+ }>;
351
+ /**
352
+ * Phase 21.13 — when supplied, the pipeline asks the caller to
353
+ * look up an existing `np_media` row by SHA-256 hash before
354
+ * uploading. Returning a row reuses it (cross-run idempotency);
355
+ * returning null falls through to upload as normal.
356
+ */
357
+ findExistingByHash?: (sha256: string) => Promise<{
358
+ id: string;
359
+ } | null>;
360
+ }
361
+ interface MediaResolution {
362
+ byUrl: Map<string, string>;
363
+ byAttachmentId: Map<number, string>;
364
+ }
365
+ interface MediaPipelineError {
366
+ url: string;
367
+ reason: string;
368
+ }
369
+ interface MediaPipelineReport {
370
+ resolution: MediaResolution;
371
+ uploaded: number;
372
+ skipped: number;
373
+ /** Phase 21.13 — count of URLs whose bytes matched an existing media row. */
374
+ reused: number;
375
+ errors: MediaPipelineError[];
376
+ }
377
+ interface MediaPipelineOptions {
378
+ /** Set true to walk URLs without downloading or uploading anything. */
379
+ dryRun?: boolean;
380
+ /** Optional progress sink. */
381
+ log?: (line: string) => void;
382
+ /**
383
+ * Phase 21.13 — concurrent-download cap per source-URL host.
384
+ * Default 4 mirrors the design doc §6 recommendation; tests pass
385
+ * 1 to keep ordering deterministic.
386
+ */
387
+ perHostConcurrency?: number;
388
+ }
389
+ /**
390
+ * Walk every record's media refs + the attachment index, fetch each
391
+ * unique URL once, and stamp the new media id back into the
392
+ * resolution map.
393
+ *
394
+ * The pipeline is best-effort: a single 404 doesn't abort the run.
395
+ * The error list surfaces what was missed so the operator can chase
396
+ * it up. Callers (the applier) treat unresolved URLs as "leave the
397
+ * Lexical src as-is and render a broken link" — same as design §6.
398
+ */
399
+ declare function runMediaPipeline(bundle: WpImportBundle, attachments: AttachmentIndex, deps: MediaPipelineDeps, options?: MediaPipelineOptions): Promise<MediaPipelineReport>;
400
+
401
+ /**
402
+ * Phase 21.8 — wire WP authors to NexPress staff users.
403
+ *
404
+ * The applier hands a callback the parsed author triple
405
+ * `(login, email, displayName)` and expects either a NexPress user
406
+ * id or `null` (skip — post lands without an author and is
407
+ * attributed to the import actor by default).
408
+ *
409
+ * Default semantics on the shim side (design §7):
410
+ *
411
+ * - Find a user by `login` (matched against the `name` column or
412
+ * a stored handle); if missing, create one with role `viewer`
413
+ * and an email flagged so it doesn't collide with a real
414
+ * account (`<original>+wp-import@<domain>`).
415
+ * - The opt-out branch (CLI `--no-create-authors`) returns null
416
+ * for every author, so posts get the import actor.
417
+ */
418
+ interface AuthorResolveInput {
419
+ /** From <dc:creator> on the WP post. */
420
+ wpAuthorLogin: string;
421
+ /** The matching WXR <wp:author> entry, when one exists. May be undefined. */
422
+ wpAuthor: WpAuthor | undefined;
423
+ }
424
+ interface AuthorResolver {
425
+ resolveAuthor: (input: AuthorResolveInput) => Promise<{
426
+ id: string;
427
+ } | null>;
428
+ }
429
+ interface AuthorResolution {
430
+ /** WP login → NexPress user id. */
431
+ authorIds: Map<string, string>;
432
+ /** Logins the resolver explicitly skipped (returned null). */
433
+ skipped: string[];
434
+ /** Logins where the resolver threw. */
435
+ errors: Array<{
436
+ login: string;
437
+ reason: string;
438
+ }>;
439
+ }
440
+ /**
441
+ * Resolve every unique WP author login that appears on a non-
442
+ * attachment record once. Returns the lookup the applier uses when
443
+ * stamping `data.author` per record.
444
+ */
445
+ declare function resolveAuthors(bundle: WpImportBundle, resolver: AuthorResolver): Promise<AuthorResolution>;
446
+
447
+ /**
448
+ * Phase 21.14 — resume marker.
449
+ *
450
+ * The applier writes the ids of every successfully imported entity
451
+ * to a sidecar JSON file next to the WXR. On re-run the file is
452
+ * read first, and the applier:
453
+ *
454
+ * - Skips documents whose `(collection, slug)` is already in the
455
+ * marker — the import respects the previous run's mapping
456
+ * instead of re-querying the DB.
457
+ * - Skips comments whose `wpCommentId` is already in the marker
458
+ * — closes the design-§9 idempotency gap (re-runs no longer
459
+ * create duplicate `np_comments` rows).
460
+ * - Reuses media-id mappings so a partial-failure mid-pipeline
461
+ * doesn't have to re-download the bytes that already landed.
462
+ *
463
+ * The marker is opt-in: callers that omit the `resume` deps get the
464
+ * historical "skip on slug" behavior. Operators who want crash-
465
+ * recovery pass `--resume` to the CLI.
466
+ *
467
+ * Schema is versioned so future shape changes can migrate
468
+ * forward; today's shape is `version: 1`.
469
+ */
470
+ interface ResumeState {
471
+ version: 1;
472
+ /** WXR path the marker was first written for — sanity check. */
473
+ source: string;
474
+ startedAt: string;
475
+ updatedAt: string;
476
+ /** `${collection}/${slug}` → NexPress doc id. */
477
+ documents: Record<string, string>;
478
+ /** WP comment id (numeric; becomes a string key once the marker is serialized to JSON) → NexPress np_comments.id. */
479
+ comments: Record<number, string>;
480
+ /** WP author login → NexPress np_users.id. */
481
+ authors: Record<string, string>;
482
+ /** WP attachment URL → NexPress np_media.id. */
483
+ media: Record<string, string>;
484
+ /** `${taxonomy}:${slug}` → NexPress taxonomy term id. */
485
+ taxonomies: Record<string, string>;
486
+ }
487
+ interface ResumeDeps {
488
+ state: ResumeState;
489
+ /**
490
+ * Persist the current state to disk. Called after each record-
491
+ * level success so a crash mid-import doesn't lose work. Errors
492
+ * are surfaced — a marker that can't be written defeats the
493
+ * whole point.
494
+ */
495
+ persist: () => void;
496
+ }
497
+ declare function emptyResumeState(source: string): ResumeState;
498
+ declare class ResumeStateError extends Error {
499
+ constructor(message: string);
500
+ }
501
+ /**
502
+ * Load a resume state file from disk. Missing file → fresh state.
503
+ * Malformed file → throw so the operator can decide whether to
504
+ * delete the marker or fix it (silently deleting it in the importer
505
+ * would erase the prior run's progress, which is the wrong default).
506
+ */
507
+ declare function loadResumeState(path: string, source: string): ResumeState;
508
+ declare function persistResumeState(path: string, state: ResumeState): void;
509
+ declare function documentKey(collection: string, slug: string): string;
510
+
511
+ /**
512
+ * Phase 21.7 — wire WP comments into NexPress's `np_comments` plus
513
+ * per-author imported members.
514
+ *
515
+ * The applier owns sequencing — it inserts each post first, then
516
+ * walks the post's comments and asks the caller's hooks to:
517
+ *
518
+ * 1. Resolve a comment author to a NexPress member id, creating
519
+ * one with `status='imported'` if needed.
520
+ * 2. Insert the comment row directly (no spam/profanity check,
521
+ * no notification fan-out — it's archived content).
522
+ *
523
+ * Comments that aren't approved in the WXR (`<wp:comment_approved>`
524
+ * != "1") are dropped. Reply parents are resolved within the same
525
+ * post: WP comment ids form a per-post tree, so we walk the comment
526
+ * list in id-ascending order and consult an in-memory map.
527
+ */
528
+ interface ImportedMemberInput {
529
+ handle: string;
530
+ email: string | null;
531
+ displayName: string;
532
+ }
533
+ interface CommentInsertInput {
534
+ targetType: string;
535
+ targetId: string;
536
+ parentId: string | null;
537
+ memberId: string;
538
+ bodyMd: string;
539
+ bodyHtml: string;
540
+ createdAt: Date;
541
+ }
542
+ interface CommentDeps {
543
+ ensureImportedMember: (input: ImportedMemberInput) => Promise<{
544
+ id: string;
545
+ }>;
546
+ insertComment: (input: CommentInsertInput) => Promise<{
547
+ id: string;
548
+ }>;
549
+ /**
550
+ * Render a comment body to safe HTML. The shim plugs in the
551
+ * framework's `renderCommentMarkdown`; tests pass a passthrough.
552
+ */
553
+ renderBody: (source: string) => string;
554
+ }
555
+ interface CommentImportPlan {
556
+ applied: number;
557
+ skippedUnapproved: number;
558
+ skippedNoMember: number;
559
+ /** Phase 21.14 — comments the resume marker said were already imported. */
560
+ skippedByResume: number;
561
+ errors: Array<{
562
+ wpCommentId: number;
563
+ reason: string;
564
+ }>;
565
+ }
566
+ /**
567
+ * Walk a record's comments and import them. Mutates `plan` in place.
568
+ * Returns when the record's comments have all been processed.
569
+ */
570
+ declare function importPostComments(args: {
571
+ record: WpImportRecord;
572
+ postId: string;
573
+ collection: string;
574
+ deps: CommentDeps;
575
+ plan: CommentImportPlan;
576
+ log?: (line: string) => void;
577
+ /** Phase 21.14 — when supplied, dedupes by `wpCommentId`. */
578
+ resume?: ResumeDeps;
579
+ }): Promise<void>;
580
+ declare function emptyCommentPlan(): CommentImportPlan;
581
+
582
+ /**
583
+ * Phase 21.6 — wire WP `<category>` and `<post_tag>` term references
584
+ * onto NexPress posts via a caller-supplied resolver.
585
+ *
586
+ * The resolver decides where terms physically live. The reference
587
+ * app's shim points it at the `taxonomies` collection (one row per
588
+ * unique `(taxonomy, slug)` pair); user projects with their own
589
+ * taxonomy storage swap in a different resolver. The wp-import
590
+ * package never reaches into a collection by name.
591
+ *
592
+ * Behavior:
593
+ *
594
+ * - The applier collects every term across the whole bundle, runs
595
+ * each through the resolver once, then maps back per-post.
596
+ * - A resolver that throws or returns `null` for a given term is
597
+ * captured as an error/note in the report; the post still
598
+ * imports without that term wired.
599
+ * - Without a resolver the applier records a single notes line
600
+ * (one per import) — no per-record noise — and posts go in
601
+ * without `categories` / `tags` set.
602
+ */
603
+ interface TaxonomyKey {
604
+ taxonomy: string;
605
+ slug: string;
606
+ name: string;
607
+ }
608
+ interface TaxonomyResolver {
609
+ /**
610
+ * Look up the taxonomy term row by `(taxonomy, slug)`, creating
611
+ * one if it doesn't exist. Returns the row's NexPress id, or
612
+ * `null` if the resolver decided to skip this term (e.g. the
613
+ * project doesn't track that taxonomy).
614
+ */
615
+ findOrCreate: (input: TaxonomyKey) => Promise<{
616
+ id: string;
617
+ } | null>;
618
+ }
619
+ interface TaxonomyResolution {
620
+ /** `taxonomy:slug` → NexPress term id. */
621
+ termIds: Map<string, string>;
622
+ /** Resolver failures. */
623
+ errors: Array<{
624
+ key: TaxonomyKey;
625
+ reason: string;
626
+ }>;
627
+ /** Terms the resolver explicitly skipped (returned null). */
628
+ skipped: TaxonomyKey[];
629
+ }
630
+ declare function termCacheKey(taxonomy: string, slug: string): string;
631
+ /**
632
+ * Walk every record's terms and the channel-level term list,
633
+ * collapse them to a unique set, and resolve each through the
634
+ * caller's resolver. Returns a lookup the applier can use when
635
+ * building per-record `categories` / `tags` field values.
636
+ */
637
+ declare function resolveTaxonomies(records: WpImportRecord[], channelTerms: WpTerm[], resolver: TaxonomyResolver): Promise<TaxonomyResolution>;
638
+ /**
639
+ * Per-record helper — returns the resolved `(categoryIds, tagIds)`
640
+ * pair for a record. Anything outside the two built-in WP taxonomies
641
+ * is dropped; the applier mirrors WP's own admin which only renders
642
+ * `category` and `post_tag` on the post edit screen.
643
+ */
644
+ declare function pickPostTermIds(record: WpImportRecord, resolution: TaxonomyResolution): {
645
+ categoryIds: string[];
646
+ tagIds: string[];
647
+ };
648
+
649
+ /**
650
+ * Phase 21.4 — write posts and pages from a parsed bundle into
651
+ * NexPress collections via the framework's `saveDocument` pipeline.
652
+ *
653
+ * Phase 21.5 layer: when a `media` deps object is supplied, the
654
+ * applier first walks every attachment + inline `<img>` URL through
655
+ * the media download/upload pipeline. The resulting URL → media-id
656
+ * map is then stitched into Lexical body content and the post's
657
+ * `coverImage` upload field. Without `media.deps` the applier still
658
+ * runs end-to-end, but image refs render as raw source-URL `<img>`
659
+ * tags (the pre-21.5 behavior).
660
+ *
661
+ * Phase 21.6 layer: when a `taxonomies` resolver is supplied, the
662
+ * applier resolves every WP `<category>` / `<post_tag>` term once
663
+ * through the resolver and stamps the resulting NexPress term ids
664
+ * onto each post's `categories` / `tags` relationship fields. The
665
+ * reference app's shim points the resolver at a `taxonomies`
666
+ * collection; user projects with their own taxonomy storage swap
667
+ * in their own resolver. Without a resolver the applier records a
668
+ * single notes line and posts go in without their terms wired.
669
+ *
670
+ * Phase 21.7 layer: when a `comments` deps object is supplied, the
671
+ * applier walks each freshly created post's comments, find-or-
672
+ * creates an `imported` member per author, and inserts each comment
673
+ * directly into `np_comments` via the deps. Spam/profanity adapters
674
+ * and notification fan-out are bypassed — this is archived content,
675
+ * not new community activity.
676
+ *
677
+ * Phase 21.8 layer: when an `authors` resolver is supplied, every
678
+ * unique `<dc:creator>` login is resolved once into a NexPress
679
+ * user id and stamped onto the post's `author` relationship field.
680
+ * The shim's default resolver creates a `role: "viewer"` user with
681
+ * a flagged email so the operator can promote them after the
682
+ * import. The `--no-create-authors` opt-out swaps in a resolver
683
+ * that returns `null` for every login — posts then go in without
684
+ * an author and the import actor takes the credit via
685
+ * `createdBy` / `updatedBy`.
686
+ *
687
+ * Custom post types (21.9) are routed only via a `collectionMappings`
688
+ * entry; unmapped types are still skipped with a one-line note in the report.
689
+ */
690
+ interface ApplyOptions {
691
+ /** Staff user that the import is attributed to. Required by `saveDocument`. */
692
+ actor: NpAuthUser;
693
+ /** Set true for a dry run — counts what would happen without writing. */
694
+ dryRun?: boolean;
695
+ /** Optional sink for per-record progress messages. Defaults to no-op. */
696
+ log?: (line: string) => void;
697
+ /**
698
+ * When supplied, runs the Phase 21.5 media pipeline before writing
699
+ * any record. Omit to skip media handling entirely (the applier
700
+ * leaves Lexical image src as the original WP URL).
701
+ */
702
+ media?: MediaPipelineDeps;
703
+ /**
704
+ * Phase 21.6 — when supplied, the applier resolves every WP
705
+ * category/tag through this hook and stamps the resulting ids
706
+ * onto post `categories` / `tags` fields. Omit to skip term
707
+ * wiring (the report surfaces a one-line note about it).
708
+ */
709
+ taxonomies?: TaxonomyResolver;
710
+ /**
711
+ * Phase 21.7 — when supplied, every newly created post's WP
712
+ * comments are imported as `np_comments` rows, with imported
713
+ * members find-or-created via the deps. Omit to skip comment
714
+ * imports entirely.
715
+ */
716
+ comments?: CommentDeps;
717
+ /**
718
+ * Phase 21.8 — when supplied, the applier resolves each WP
719
+ * author once and stamps the resulting NexPress user id onto
720
+ * the post's `author` relationship field. Without it the
721
+ * dropped-author note continues to surface and posts come in
722
+ * without an author wired.
723
+ */
724
+ authors?: AuthorResolver;
725
+ /**
726
+ * Phase 21.9 — operator-supplied mapping table for custom WP
727
+ * post types. Without an entry here the applier skips the
728
+ * record with a warning. Keys are WP `<wp:post_type>` values
729
+ * (e.g. `"product"`); values declare the NexPress collection to
730
+ * route the record into and an optional `fieldOverrides` map
731
+ * that maps WP post-meta keys onto NexPress collection field
732
+ * names. Built-in post / page mappings are always applied first
733
+ * and take precedence — this option only widens the routing
734
+ * table, it doesn't override the defaults.
735
+ */
736
+ collectionMappings?: Record<string, CollectionMapping>;
737
+ /**
738
+ * Phase 21.10 — when supplied, the applier emits an audit event
739
+ * for every document it writes (action `import.wp.applied`),
740
+ * skips for already-imported slugs (`import.wp.skipped`), and
741
+ * record-level errors (`import.wp.error`). The shim wires this
742
+ * to `recordAuditEvent` from `@nexpress/core` so the entries
743
+ * land in `np_audit_events` alongside the rest of the operator
744
+ * trail. Audit failures NEVER abort the import — see the
745
+ * deps's contract.
746
+ */
747
+ audit?: AuditDeps;
748
+ /**
749
+ * Phase 21.11 — when supplied, the applier writes the original
750
+ * WP author display name (or login when no display name is set)
751
+ * to the named field on every imported document, so the byline
752
+ * is preserved even when `--no-create-authors` strips the
753
+ * `np_users` link. Operators add a matching `text` field to
754
+ * their collection and declare the mapping here, e.g.:
755
+ *
756
+ * { posts: "wpOriginalAuthor" }
757
+ *
758
+ * Collections without an entry skip the field write — the
759
+ * applier only touches columns the operator opted into, so
760
+ * existing schemas keep round-tripping unchanged.
761
+ */
762
+ preserveOriginalAuthor?: Record<string, string>;
763
+ /**
764
+ * Phase 21.14 — when supplied, the applier reads + writes a
765
+ * resume marker so re-runs skip work that already landed.
766
+ * Documents are matched by `(collection, slug)`, comments by WP
767
+ * comment id. The marker is persisted after each record-level
768
+ * success; a crash mid-import resumes from the last persisted
769
+ * row instead of starting over.
770
+ */
771
+ resume?: ResumeDeps;
772
+ /**
773
+ * Phase 21.12 — when true, the applier rewrites content for
774
+ * documents whose slug already exists instead of skipping them.
775
+ * The existing `np_c_*` row keeps its id (so revisions and
776
+ * `np_media_refs` pointers stay intact); the `data` payload —
777
+ * title, content, excerpt, coverImage, taxonomies, author,
778
+ * publishedAt — is overwritten. Comments are NOT re-imported on
779
+ * an update pass; existing rows under that document stay put.
780
+ * Without the flag the historical skip-on-collision behavior
781
+ * holds.
782
+ */
783
+ update?: boolean;
784
+ /**
785
+ * Phase 21.12 — when true, downstream warnings escalate to
786
+ * `errors` so the CLI exits non-zero. Specifically: any media
787
+ * pipeline error (4xx, MIME reject, missing attachment) and
788
+ * any taxonomy / author resolver failure becomes a record-level
789
+ * error rather than a soft note. Useful for migration scripts
790
+ * that need a clean import or nothing — the operator wants the
791
+ * pipeline to abort rather than silently skip an asset.
792
+ */
793
+ strict?: boolean;
794
+ /**
795
+ * Phase 21.12 — when supplied, the applier emits a side-by-side
796
+ * conversion sample for every imported record so an operator can
797
+ * spot-check the WP HTML → Lexical roundtrip. The deps object
798
+ * receives the source content + the resulting Lexical AST; the
799
+ * shim writes them out as an HTML diff page next to the WXR.
800
+ */
801
+ reportHtml?: ReportHtmlDeps;
802
+ }
803
+ interface ReportHtmlDeps {
804
+ emit: (sample: {
805
+ wpId: number;
806
+ wpType: string;
807
+ slug: string;
808
+ title: string;
809
+ rawContent: string;
810
+ lexical: LexicalRoot;
811
+ }) => void;
812
+ }
813
+ interface AuditDeps {
814
+ record: (event: {
815
+ action: string;
816
+ targetType?: string;
817
+ targetId?: string;
818
+ payload?: Record<string, unknown>;
819
+ }) => Promise<void>;
820
+ }
821
+ interface CollectionMapping {
822
+ collection: string;
823
+ /**
824
+ * Maps WP `<wp:postmeta>` keys to NexPress collection field
825
+ * names. Each mapped meta value is copied verbatim onto the
826
+ * document data; values are not coerced (the framework's Zod
827
+ * validation will reject mismatched types and surface a
828
+ * per-record error in the report).
829
+ */
830
+ fieldOverrides?: Record<string, string>;
831
+ }
832
+ interface AppliedRow {
833
+ wpId: number;
834
+ wpType: string;
835
+ collection: string;
836
+ slug: string;
837
+ title: string;
838
+ /** Set when the post had a featured image and the applier wired `coverImage`. */
839
+ coverImageId?: string;
840
+ /** Phase 21.6 — taxonomy ids attached to this row's `categories` field. */
841
+ categoryIds?: string[];
842
+ /** Phase 21.6 — taxonomy ids attached to this row's `tags` field. */
843
+ tagIds?: string[];
844
+ /** Phase 21.8 — NexPress user id stamped onto the row's `author` field. */
845
+ authorId?: string;
846
+ }
847
+ interface SkippedRow {
848
+ wpId: number;
849
+ wpType: string;
850
+ slug: string;
851
+ reason: string;
852
+ }
853
+ interface ApplyReport {
854
+ applied: AppliedRow[];
855
+ skipped: SkippedRow[];
856
+ errors: Array<{
857
+ wpId: number;
858
+ slug: string;
859
+ message: string;
860
+ }>;
861
+ /**
862
+ * Attachment index built during apply. Phase 21.5 picks it up
863
+ * to drive the media download/upload pipeline; surfaces here so
864
+ * the CLI can show a meaningful summary even on a dry run.
865
+ */
866
+ attachments: AttachmentIndex;
867
+ /**
868
+ * Phase 21.5 media-pipeline summary. `null` when the caller did
869
+ * not supply a `media` deps object — the report still renders
870
+ * cleanly with a "media pipeline not run" line.
871
+ */
872
+ media: MediaPipelineReport | null;
873
+ /**
874
+ * Phase 21.6 — resolved-taxonomy summary. `null` when the caller
875
+ * didn't supply a `taxonomies` resolver. Useful for audits and
876
+ * for the CLI to render the term-resolution outcome.
877
+ */
878
+ taxonomies: TaxonomyResolution | null;
879
+ /** Phase 21.7 — comment import outcome. `null` when no comments deps. */
880
+ comments: CommentImportPlan | null;
881
+ /**
882
+ * Phase 21.8 — resolved-author summary. `null` when the caller
883
+ * didn't supply an `authors` resolver.
884
+ */
885
+ authors: AuthorResolution | null;
886
+ /**
887
+ * One-time observations the operator should know about — drops
888
+ * we made silently per-record but want surfaced once aggregated.
889
+ * Examples: original authors dropped (21.8), `private` status
890
+ * coerced to draft (design §11.5).
891
+ */
892
+ notes: string[];
893
+ }
894
+ declare function applyBundle(bundle: WpImportBundle, options: ApplyOptions): Promise<ApplyReport>;
895
+
896
+ /**
897
+ * Phase 21.3 — `wp-import` CLI runner (`runCli`).
898
+ *
899
+ * Returns an exit code instead of calling `process.exit` so the
900
+ * shim (or a test harness) can decide whether to terminate. The
901
+ * shim under `apps/web/scripts/wp-import.ts` does the actual exit.
902
+ *
903
+ * NOTE(review): likely stale — written before the applier existed
904
+ * (Phase 21.4 lands content writes; 21.5 lands media); `applyBundle`
905
+ * is now declared in this package. Confirm `--dry-run` still defaults
906
+ * to true and that `--dry-run=false` still prints a "not implemented
907
+ * yet" message rather than silently doing nothing.
908
+ */
909
+ interface CliIo {
910
+ stdout: (line: string) => void;
911
+ stderr: (line: string) => void;
912
+ }
913
+ /**
914
+ * Hooks the shim plumbs in so the CLI can run the applier without
915
+ * directly depending on the framework's bootstrap. When omitted —
916
+ * e.g. CI runs `runCli` from a unit test — the CLI prints the
917
+ * dry-run summary and ignores `--apply`.
918
+ */
919
+ interface CliApplyHooks {
920
+ applyBundle: (bundle: WpImportBundle, ctx: {
921
+ actor: NpAuthUser;
922
+ dryRun: boolean;
923
+ log: (line: string) => void;
924
+ /** Phase 21.8 — author-creation toggle; presumably `false` when the operator passed `--no-create-authors` (TODO confirm polarity). */
925
+ createAuthors: boolean;
926
+ /**
927
+ * Phase 21.9 — operator-supplied custom-post-type mappings.
928
+ * Empty object when no `--config` file was passed.
929
+ */
930
+ collectionMappings: Record<string, CollectionMapping>;
931
+ /** Phase 21.12 — set when the operator passed `--strict`. */
932
+ strict: boolean;
933
+ /** Phase 21.12 — set when the operator passed `--update`. */
934
+ update: boolean;
935
+ /**
936
+ * Phase 21.12 — when set, the operator wants a side-by-side
937
+ * HTML/Lexical diff written somewhere. The CLI passes the
938
+ * resolved file path; the shim is responsible for opening
939
+ * the file and producing the deps. `null` means "don't emit".
940
+ */
941
+ reportHtmlPath: string | null;
942
+ /**
943
+ * Phase 21.14 — when set, the operator wants a resume marker
944
+ * loaded + persisted at the named path. `null` means
945
+ * "don't load or write a marker".
946
+ */
947
+ resumeStatePath: string | null;
948
+ }) => Promise<ApplyReport>;
949
+ resolveActor: () => Promise<NpAuthUser>;
950
+ }
951
+ declare function runCli(argv: string[], io?: CliIo, hooks?: CliApplyHooks): Promise<number>;
952
+
953
+ /**
954
+ * Phase 21.3 — render a parsed `WpImportBundle` as a human-readable
955
+ * summary. Pure function (no IO) so unit tests can pin the exact
956
+ * shape of the operator-facing output.
957
+ *
958
+ * Format priorities, in order:
959
+ * 1. Confirm what the parser found (counts, site identity).
960
+ * 2. Surface anything an operator should act on before applying
961
+ * (CPTs without mappings, attachments that need media work,
962
+ * authors that will get auto-created in 21.8).
963
+ * 3. Stay narrow — no terminal-width assumptions, ASCII only.
964
+ *
965
+ * The output is intentionally line-by-line predictable so this
966
+ * doubles as a CI fixture in future sub-phases.
967
+ */
968
+ declare function formatSummary(args: {
969
+ bundle: WpImportBundle;
970
+ sourcePath: string;
971
+ dryRun: boolean;
972
+ }): string;
973
+ /**
974
+ * Phase 21.4 — render the result of `applyBundle()`. Operator-
975
+ * facing summary printed after the apply pass finishes.
976
+ */
977
+ declare function formatApplyReport(report: ApplyReport, args: {
978
+ dryRun: boolean;
979
+ }): string;
980
+
981
+ /**
982
+ * Phase 21.9 — parse + validate a wp-import config file.
983
+ *
984
+ * The config is a small JSON document. Example:
985
+ *
986
+ * {
987
+ * "mappings": [
988
+ * { "wpType": "product", "collection": "products" },
989
+ * {
990
+ * "wpType": "event",
991
+ * "collection": "events",
992
+ * "fieldOverrides": { "_event_date": "eventDate" }
993
+ * }
994
+ * ]
995
+ * }
996
+ *
997
+ * Unknown keys at the top level are ignored so future sub-phases
998
+ * (resume markers, plugin-supplied overrides) can extend the file
999
+ * without breaking older importers.
+ *
+ * The parsed result (`WpImportConfig`) exposes the mappings as the
+ * `collectionMappings` record rather than the on-disk `mappings`
+ * array — presumably keyed by each entry's `wpType`; TODO confirm.
1000
+ */
1001
+ interface WpImportConfig {
1002
+ collectionMappings: Record<string, CollectionMapping>;
1003
+ }
1004
+ declare class WpImportConfigError extends Error {
1005
+ constructor(message: string);
1006
+ }
1007
+ declare function loadConfigFromPath(path: string): WpImportConfig;
1008
+ declare function parseConfig(source: string, displayPath?: string): WpImportConfig;
1009
+
1010
+ /**
1011
+ * Phase 21.5 — fetch a media file from a WP source URL.
1012
+ *
1013
+ * The function is intentionally narrow:
1014
+ *
1015
+ * - It only HTTP(S) GETs the URL the WXR pointed at; we don't
1016
+ * follow `srcset` or guess `-scaled` variants. WP often has
1017
+ * half a dozen size variants per image and re-deriving them
1018
+ * after upload is cheaper (Sharp pipeline runs server-side
1019
+ * anyway) than mirroring whatever the source happened to
1020
+ * pre-render.
1021
+ * - MIME is sniffed from the response `Content-Type` header,
1022
+ * falling back to `application/octet-stream` if absent.
1023
+ * The pipeline rejects anything not in the allow-list so a
1024
+ * server returning text/html (404 page, redirect intercept,
1025
+ * etc.) doesn't leak through.
1026
+ * - One retry on network/timeout failure. 4xx is terminal
1027
+ * (matches the design doc §6 — 404 is treated as a hard skip).
1028
+ *
1029
+ * SSRF guard (#270, #382):
1030
+ *
1031
+ * - Scheme is restricted to http(s).
1032
+ * - The hostname is resolved via DNS and every returned address
1033
+ * is checked against private / loopback / link-local / CGNAT
1034
+ * / multicast / reserved CIDRs. Any private result rejects
1035
+ * the URL — we don't fall through to the public IPs because
1036
+ * a malicious DNS response can rebind between the check and
1037
+ * the fetch (TOCTOU). For the importer's purposes, hosting
1038
+ * media on a hostname that *also* has a private A record is
1039
+ * vanishingly rare; rejecting it is the safer default.
1040
+ * - The vetted address is then *pinned* on an undici `Agent` so
1041
+ * `fetch` connects to that exact IP instead of re-resolving
1042
+ * the hostname (#382). Without pinning, the preflight DNS
1043
+ * check and the connect-time DNS resolution are independent,
1044
+ * leaving a DNS-rebinding window where a public answer passes
1045
+ * the check and a private answer is what gets connected. SNI /
1046
+ * Host headers stay set to the original hostname so HTTPS cert
1047
+ * validation still works.
1048
+ * - Redirects are followed manually (`redirect: "manual"`),
1049
+ * capped at 3 hops, and each hop re-runs the DNS / private-IP
1050
+ * check AND re-pins the connect address. The platform `fetch`
1051
+ * would otherwise silently follow a public-IP 302 to
1052
+ * `169.254.169.254`.
1053
+ * - `Content-Length` is checked against `maxBytes` *before* the
1054
+ * body is read so a slow-read attacker can't tie the worker up
1055
+ * past the timeout window.
1056
+ *
1057
+ * The `allowPrivateHosts` option exists for tests and for
1058
+ * self-hosted deployments where the source server genuinely lives
1059
+ * on the same private network as the importer.
1060
+ */
1061
+ interface DownloadResult {
1062
+ buffer: Buffer$1;
1063
+ mimeType: string;
1064
+ filename: string;
1065
+ }
1066
+ interface DownloadOptions {
1067
+ /** Override `globalThis.fetch` — used by tests. */
1068
+ fetchImpl?: typeof fetch;
1069
+ /** Override DNS lookup — used by tests to drive private-IP rejection. */
1070
+ dnsLookupImpl?: (hostname: string) => Promise<Array<{
1071
+ address: string;
1072
+ family: number;
1073
+ }>>;
1074
+ /** Per-request timeout in ms. Default 30s. */
1075
+ timeoutMs?: number;
1076
+ /** How many times to retry network/timeout failures before giving up. Presumably defaults to 1 — TODO confirm. */
1077
+ retries?: number;
1078
+ /** Maximum redirect hops. Default 3. Each hop re-validates the host. */
1079
+ maxRedirects?: number;
1080
+ /** Maximum response size in bytes. Default 100 MiB. */
1081
+ maxBytes?: number;
1082
+ /**
1083
+ * Skip the private-IP check. ONLY for tests and self-hosted
1084
+ * deployments where the source server lives on the same
1085
+ * private network as the importer.
1086
+ */
1087
+ allowPrivateHosts?: boolean;
1088
+ }
1089
+ declare class WpMediaDownloadError extends Error {
1090
+ readonly url: string;
1091
+ readonly status: number | null;
1092
+ constructor(url: string, message: string, status?: number | null);
1093
+ }
1094
+ /**
1095
+ * Thrown when the URL or any redirect target resolves to a host we
1096
+ * refuse to talk to (private IP, loopback, non-HTTP scheme, etc).
1097
+ * A separate subclass so `downloadMedia`'s retry loop can recognise
1098
+ * it and refuse to retry — re-resolving DNS won't make a `127.0.0.1`
1099
+ * A record any safer.
+ *
+ * Unlike `WpMediaDownloadError`, the constructor takes no `status`
+ * argument.
1100
+ */
1101
+ declare class WpMediaSsrfError extends WpMediaDownloadError {
1102
+ constructor(url: string, message: string);
1103
+ }
1104
+ declare function downloadMedia(url: string, opts?: DownloadOptions): Promise<DownloadResult>;
1105
+ /**
1106
+ * Resolve the SSRF-related download options from process env.
1107
+ * `runMediaPipeline`'s default download wires this in so a
1108
+ * self-hosted operator can opt into private-network imports
1109
+ * without having to pass `deps.download` themselves.
1110
+ *
1111
+ * - `NP_WP_IMPORT_ALLOW_PRIVATE_HOSTS=1` (or `true`) skips the
1112
+ * DNS / private-IP rejection step. ONLY use this when the
1113
+ * source WXR genuinely lives on the same trusted private
1114
+ * network as the importer.
1115
+ * - `NP_WP_IMPORT_MAX_BYTES=<int>` overrides the 100 MiB
1116
+ * body-size cap. Bumping this is the right knob for sites
1117
+ * with large video assets.
1118
+ *
1119
+ * Invalid values (non-numeric `MAX_BYTES`, etc.) fall back to
1120
+ * the secure defaults silently — the goal is "don't refuse to
1121
+ * boot," not "reward typos with bigger surface area."
1122
+ */
1123
+ declare function resolveEnvDownloadOptions(env?: NodeJS.ProcessEnv): DownloadOptions;
1124
+ /**
1125
+ * The framework's upload routes accept image/*, video/*, and
1126
+ * application/pdf. Mirror that here so the importer doesn't push
1127
+ * anything through that the upload route would reject.
1128
+ */
1129
+ declare function isAllowedMimeType(mimeType: string): boolean;
1130
+
1131
+ /**
1132
+ * Phase 21.5 — replace WP source URLs in a Lexical document with
1133
+ * NexPress media references.
1134
+ *
1135
+ * We patch two things on each `image` node:
1136
+ *
1137
+ * 1. `mediaId` — what `extractMediaIds` in
1138
+ * `packages/core/src/media/refs.ts` reads to wire `np_media_refs`
1139
+ * so the document gets blocked from referenced-media deletion.
1140
+ * 2. `src` — what the SSR renderer at
1141
+ * `packages/editor/src/render-rich-text.tsx` looks at when
1142
+ * drawing the `<img>`. Setting both keeps the rendered HTML
1143
+ * stable while also routing through the framework's media
1144
+ * tracking. The applier sets `src` to the storage URL that the
1145
+ * media adapter eventually resolves; for now we leave it as the
1146
+ * WP source URL so themes that haven't migrated to media-id
1147
+ * resolution still render something.
+ * NOTE(review): item 2 reads both ways — it says `src` is patched
+ * to the storage URL and also that it is left as the WP source URL
+ * "for now". Confirm which one the implementation actually does.
1148
+ *
1149
+ * URLs that aren't in the resolution map (download 404, MIME
1150
+ * rejected, etc.) are left untouched. Themes will render those as
1151
+ * broken images — same outcome as the design doc §6 prescribes.
1152
+ */
1153
+ declare function rewriteLexicalMedia(root: LexicalRoot, resolution: MediaResolution): LexicalRoot;
1154
+
1155
+ export { type AppliedRow, type ApplyOptions, type ApplyReport, type AttachmentEntry, type AttachmentIndex, type AuditDeps, type AuthorResolution, type AuthorResolveInput, type AuthorResolver, type CliApplyHooks, type CliIo, type CollectionMapping, type CommentDeps, type CommentImportPlan, type CommentInsertInput, type DownloadOptions, type DownloadResult, type ImportedMemberInput, type LexicalRoot, type MediaPipelineDeps, type MediaPipelineError, type MediaPipelineOptions, type MediaPipelineReport, type MediaResolution, type MediaUploadInput, type ReportHtmlDeps, type ResumeDeps, type ResumeState, ResumeStateError, type SkippedRow, type TaxonomyKey, type TaxonomyResolution, type TaxonomyResolver, type WpAuthor, type WpComment, type WpImportBundle, type WpImportConfig, WpImportConfigError, type WpImportRecord, type WpImportStream, WpImportStreamError, type WpImportStreamHeader, type WpImportStreamOptions, WpMediaDownloadError, type WpMediaRef, WpMediaSsrfError, type WpPostStatus, type WpSiteInfo, type WpTerm, applyBundle, buildAttachmentIndex, documentKey, downloadMedia, emptyCommentPlan, emptyResumeState, formatApplyReport, formatSummary, htmlToLexical, importPostComments, isAllowedMimeType, loadConfigFromPath, loadResumeState, parseConfig, parseWxr, parseWxrStream, persistResumeState, pickPostTermIds, resolveAuthors, resolveEnvDownloadOptions, resolveTaxonomies, rewriteLexicalMedia, runCli, runMediaPipeline, termCacheKey };