npm - @nexpress/wp-import - Versions diffs - 0.1.0 - Mend

@nexpress/wp-import 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/dist/index.d.ts ADDED Viewed

@@ -0,0 +1,1155 @@
+import { NpAuthUser } from '@nexpress/core';
+import { Buffer as Buffer$1 } from 'node:buffer';
+/**
+ * Phase 21.2 — Intermediate Record (IR) types.
+ *
+ * The IR is the seam between the WXR parser and the importer. The
+ * design doc (`docs/design/wordpress-import-design.md` §4) frames why this
+ * lives between parse and apply: keeps the parser free of NexPress
+ * concerns and lets future adapters (Ghost, Drupal, generic JSON)
+ * plug into the same applier.
+ *
+ * Field names mirror the WXR XML tag names where it doesn't hurt
+ * readability (`wpId` for `<wp:post_id>`, `wpType` for
+ * `<wp:post_type>`) so reviewers cross-checking against a real
+ * export can find the correspondence quickly.
+ *
+ * `WpPostStatus` (declared directly below) is the closed set of post
+ * statuses a record can carry.
+ * NOTE(review): `WpImportBundle.records` also holds `attachment`
+ * records, and WordPress has further statuses (e.g. `inherit`,
+ * `future`) that are not listed here — confirm how the parser maps
+ * a status outside this union.
+ */
+type WpPostStatus = "publish" | "draft" | "private" | "pending" | "trash" | "auto-draft";
+/**
+ * A category, post tag, or any custom WP taxonomy term attached to a post.
+ * The same shape is used for per-record `WpImportRecord.terms` and for
+ * the channel-level `WpImportBundle.terms` list.
+ */
+interface WpTerm {
+    /** "category", "post_tag", or any custom taxonomy slug. */
+    taxonomy: string;
+    slug: string;
+    name: string;
+}
+/**
+ * A media reference parsed out of a post. Resolved later by the
+ * media pipeline (Phase 21.5) into actual np_media ids.
+ */
+interface WpMediaRef {
+    /** Source URL on the WP site (e.g. `https://site.com/wp-content/uploads/.../foo.jpg`). */
+    sourceUrl: string;
+    /** WP attachment id when the reference points at one we know about, else null. */
+    wpAttachmentId: number | null;
+    /**
+     * Where this reference came from in the original document — drives
+     * how the applier wires the result. `featured` lands on the post's
+     * `coverImage` field; `inline` rewrites the body content.
+     */
+    kind: "featured" | "inline";
+}
+interface WpComment { // One comment parsed from a record; carried on `WpImportRecord.comments`.
+    wpId: number;
+    /** `wpId` of the parent comment when the comment is a reply, else null. */
+    parentWpId: number | null;
+    authorName: string;
+    authorEmail: string | null;
+    authorUrl: string | null;
+    /** ISO timestamp from <wp:comment_date_gmt>. */
+    date: string;
+    /** Comment body — usually plain text but can contain HTML. */
+    content: string;
+    /** Derived from <wp:comment_approved> ("1" → true, anything else → false). */
+    approved: boolean;
+}
+/**
+ * One post / page / custom post type record in the WXR. The applier
+ * walks an array of these to write content into NexPress collections.
+ */
+interface WpImportRecord {
+    /** Numeric id from <wp:post_id>. Stable across re-exports of the same WP site. */
+    wpId: number;
+    /**
+     * "post" | "page" | "attachment" | custom post type slug. Drives
+     * applier collection routing.
+     */
+    wpType: string;
+    status: WpPostStatus;
+    slug: string;
+    title: string;
+    /** From <excerpt:encoded>. Null when WP didn't write one. */
+    excerpt: string | null;
+    /**
+     * Raw HTML / Gutenberg content from <content:encoded>. Phase 21.4
+     * runs this through the HTML→Lexical converter; the record keeps it
+     * as the unmodified string so parser tests stay deterministic.
+     */
+    rawContent: string;
+    /**
+     * WP author login from <dc:creator> — a login string, not a numeric
+     * id (resolved against parsed authors).
+     */
+    wpAuthorLogin: string;
+    /** ISO timestamp from <wp:post_date_gmt>. */
+    publishedAt: string;
+    /** ISO timestamp from <wp:post_modified_gmt>. */
+    updatedAt: string;
+    terms: WpTerm[];
+    /** Resolved <wp:postmeta> entries. Keys preserved verbatim. */
+    meta: Record<string, string>;
+    mediaRefs: WpMediaRef[];
+    comments: WpComment[];
+}
+interface WpAuthor { // One parsed WXR <wp:author> entry; listed on `WpImportBundle.authors`.
+    wpId: number;
+    /** WP login slug — e.g. "alice". */
+    login: string;
+    email: string;
+    displayName: string;
+    /** Free-form bio / description from <wp:author_description>. */
+    description: string | null;
+}
+/** Site-level metadata harvested from the <channel> envelope. */
+interface WpSiteInfo {
+    /** <title>. */
+    title: string;
+    /** <link>. */
+    link: string;
+    /** <description>. */
+    description: string;
+    /** <wp:base_site_url>. */
+    baseSiteUrl: string;
+    /** <wp:base_blog_url>. Often the same as baseSiteUrl on single-site WP. */
+    baseBlogUrl: string;
+    /** <language>, e.g. "en-US". Nullable — presumably null when the tag is absent (TODO confirm). */
+    language: string | null;
+}
+/**
+ * The full output of `parseWxr()`. Captures everything an applier
+ * needs in one in-memory shape: the channel-level `site` metadata,
+ * the parsed `authors`, every record, and the channel-level terms.
+ */
+interface WpImportBundle {
+    site: WpSiteInfo;
+    authors: WpAuthor[];
+    /**
+     * All records in document order. Includes `attachment` post types
+     * (used to resolve media refs) — the applier filters those out
+     * after the media pipeline runs.
+     */
+    records: WpImportRecord[];
+    /**
+     * Standalone <wp:category> / <wp:tag> / <wp:term> entries from the
+     * channel envelope. Most WP exports duplicate these into per-post
+     * <category> elements too; the applier de-dupes.
+     */
+    terms: WpTerm[];
+}
+/**
+ * Phase 21.2 — WXR parser. Reads a WordPress eXtended RSS export
+ * (the file produced by Tools → Export in wp-admin) and produces
+ * an in-memory bundle of typed Intermediate Records.
+ *
+ * The parser is deliberately tolerant:
+ *
+ *   - Missing namespace-prefixed tags (`wp:`, `dc:`, `content:`,
+ *     `excerpt:`) fall back to sensible defaults rather than
+ *     throwing, because real-world WXR files vary in how strictly
+ *     they're written by WP plugins.
+ *   - <category> elements appear in two shapes (channel-level
+ *     <wp:category> and per-post <category>); we handle both.
+ *   - The CDATA wrapping around <content:encoded> is unwrapped
+ *     transparently by `fast-xml-parser`.
+ *
+ * Handled elsewhere, not by this function: streaming parse for huge
+ * exports (Phase 21.16, `parseWxrStream`), media downloading (21.5),
+ * HTML-to-Lexical conversion (21.4).
+ *
+ * @param xml - Full text of the WXR export, already loaded into memory.
+ * @returns The parsed bundle (site info, authors, records, channel-level terms).
+ */
+declare function parseWxr(xml: string): WpImportBundle;
+/**
+ * Phase 21.16 — streaming WXR parse for huge exports.
+ *
+ * The eager `parseWxr(string)` path stays as the canonical small-
+ * export tool: it loads the whole file into memory which is fine
+ * up to a few hundred megabytes. For 5 GB+ WXR exports (large WP
+ * sites with embedded media), eager parsing OOMs the process, so
+ * this module provides an async-iterable equivalent that:
+ *
+ *   1. Reads the file in chunks via `createReadStream`.
+ *   2. Loads the channel header (site / authors / channel-level
+ *      terms) once it's fully buffered. The header is small —
+ *      WP writes it before the first `<item>` and it's bounded by
+ *      the author count, not the post count.
+ *   3. Scans the rolling buffer for `<item>...</item>` slices and
+ *      hands each to the existing `parseWxr` machinery wrapped in
+ *      a tiny `<channel>` envelope so the parser still recognises
+ *      the structure.
+ *   4. Yields one `WpImportRecord` per item; peak memory stays
+ *      bounded by the largest single item rather than the whole
+ *      file.
+ *
+ * Limitations:
+ *
+ *   - Items larger than the buffer threshold (default 64 MB) are
+ *     surfaced as an explicit error rather than silently growing
+ *     the buffer; that's the early-warning signal that a record's
+ *     `<content:encoded>` is degenerate (an embedded base64 image
+ *     bigger than the cap, for example) and needs the operator to
+ *     intervene.
+ *   - This is a structural streamer only — the applier still
+ *     consumes the records eagerly today. The Phase 21.16 cut
+ *     gives operators the primitive; a future sub-phase wires it
+ *     end-to-end through `applyBundle` so peak memory stays low
+ *     for the full pipeline too.
+ */
+interface WpImportStreamHeader {
+    site: WpSiteInfo;
+    authors: WpAuthor[];
+    /** Channel-level <wp:category> / <wp:tag> / <wp:term> entries. */
+    terms: WpTerm[];
+}
+interface WpImportStream { // What `parseWxrStream` resolves with.
+    header: WpImportStreamHeader; // channel header: site, authors, channel-level terms
+    items: AsyncIterable<WpImportRecord>; // yields one record per <item>
+}
+interface WpImportStreamOptions {
+    /** Read-stream chunk size in bytes. Default 64 KB. */
+    highWaterMark?: number;
+    /**
+     * Hard cap on the rolling buffer between item boundaries — when
+     * a single `<item>...</item>` slice exceeds this we abort with an
+     * explicit error rather than growing memory unbounded. Default
+     * 64 MB.
+     */
+    maxItemBytes?: number;
+}
+declare class WpImportStreamError extends Error { // NOTE(review): presumably the explicit error raised when an item exceeds `maxItemBytes` — confirm.
+    constructor(message: string);
+}
+/**
+ * Open a streaming reader over a WXR file. Resolves with the
+ * channel header + an async iterator of items.
+ *
+ * @param path - Filesystem path of the WXR export to read.
+ * @param options - Optional chunk-size and per-item buffer cap; see
+ *   `WpImportStreamOptions` for the defaults.
+ * @returns The channel header plus an `AsyncIterable` of records.
+ */
+declare function parseWxrStream(path: string, options?: WpImportStreamOptions): Promise<WpImportStream>;
+/**
+ * Phase 21.4 — HTML → Lexical AST.
+ *
+ * Resolves design doc §11.1: roll our own minimal converter rather
+ * than wrap a heavier ecosystem library. Rationale:
+ *
+ *   - The set of WP HTML constructs we care about is small —
+ *     paragraphs, headings, lists, blockquote, basic inline
+ *     formatting, links, images, line breaks, hr, code.
+ *   - `node-html-parser` gives us a lenient HTML5 parse with no
+ *     DOM dependency (works in Node and edge runtimes).
+ *   - The Lexical node shape is well-documented in the framework's
+ *     own renderer (`packages/editor/src/render-rich-text.tsx`),
+ *     so we emit exactly what the renderer expects.
+ *
+ * NOT in this PR:
+ *   - Gutenberg block-comment syntax (`<!-- wp:paragraph -->`).
+ *     Treat the comments as text noise; Phase 21.4b can layer
+ *     block awareness on top once we have a real WP fixture to
+ *     validate against.
+ *   - Image media-id resolution. <img> nodes are emitted with
+ *     the source URL as `src`; Phase 21.5 swaps these to
+ *     NexPress media ids after the upload pipeline runs.
+ *   - Custom shortcodes (`[gallery]`, etc.). Out of scope per
+ *     design doc §2 — handled by the long-tail conversion pass.
+ *
+ * `LexicalRoot` is the document wrapper: one `root` node whose
+ * bookkeeping fields are fixed literals and whose `children` hold
+ * the top-level blocks.
+ */
+interface LexicalRoot {
+    root: {
+        type: "root";
+        direction: null;
+        format: "";
+        indent: 0;
+        version: 1;
+        children: LexicalBlock[];
+    };
+}
+interface LexicalBlock { // One node of the emitted tree. NOTE(review): which optional fields go with which `type` isn't visible in this declaration.
+    type: string;
+    version: 1;
+    format: "" | number;
+    indent: number;
+    direction: null;
+    children?: LexicalNode[];
+    tag?: string;
+    listType?: "bullet" | "number";
+    url?: string;
+    src?: string;
+    altText?: string;
+    text?: string;
+}
+type LexicalNode = LexicalBlock; // Alias: child nodes share the same shape as blocks.
+/**
+ * Convert raw WP content HTML into a Lexical root document.
+ * Always returns a valid Lexical structure — empty input becomes a
+ * single empty paragraph, matching what the editor would produce
+ * for a freshly-created field.
+ *
+ * @param html - Raw post body HTML (e.g. `WpImportRecord.rawContent`).
+ * @returns A `LexicalRoot`; never empty of children for empty input.
+ */
+declare function htmlToLexical(html: string): LexicalRoot;
+/**
+ * Phase 21.4 — index attachment records by their WP id so the
+ * applier can resolve `_thumbnail_id` references and inline
+ * `wp-image-N` ids to source URLs.
+ *
+ * Phase 21.5 (the media pipeline) replaces these source URLs with
+ * NexPress media ids once the download/upload pipeline has run.
+ * When no media deps are supplied the applier just preserves the
+ * URL on the document so the post body and featured-image
+ * references aren't lost.
+ */
+interface AttachmentEntry {
+    wpAttachmentId: number;
+    sourceUrl: string;
+    /** WP attachment post-meta — `_wp_attached_file`, dimensions, etc. */
+    meta: Record<string, string>;
+    /** Original attachment record's title (filename in most exports). */
+    title: string;
+}
+interface AttachmentIndex { // Read-only lookup tables returned by `buildAttachmentIndex`.
+    /** Look up by numeric WP attachment id. */
+    byId: ReadonlyMap<number, AttachmentEntry>;
+    /** Look up by source URL — useful for inline img refs that didn't carry an id. */
+    byUrl: ReadonlyMap<string, AttachmentEntry>;
+}
+declare function buildAttachmentIndex(bundle: WpImportBundle): AttachmentIndex; // Indexes the bundle's attachment records by WP id and by source URL.
+/**
+ * Phase 21.5 — orchestrates download + upload of every media URL the
+ * import touches.
+ *
+ * Inputs are wired through `MediaPipelineDeps` rather than imported
+ * directly so the unit tests don't need a DB or a network. The CLI
+ * shim under `apps/web/scripts/wp-import.ts` plugs in the real
+ * `uploadMedia` from `@nexpress/core`; tests pass an in-memory stub.
+ *
+ * Resolution map:
+ *
+ *   - `byUrl` — every successfully uploaded source URL → NexPress
+ *     media id. Used to rewrite inline `<img>` src in Lexical.
+ *   - `byAttachmentId` — WP attachment id → media id. Used by the
+ *     applier for `_thumbnail_id` featured-image lookup.
+ *
+ * Phase 21.13 layered two prod-grade concerns on top:
+ *
+ *   - **Per-host concurrency cap.** Each unique URL host gets a
+ *     small fixed-size queue (default 4) so a 50-image post on the
+ *     same wp-content domain doesn't open 50 sockets in parallel.
+ *     URLs from different hosts run independently.
+ *   - **Cross-run hash dedup.** When the caller supplies
+ *     `findExistingByHash`, the pipeline computes the SHA-256 of
+ *     the downloaded bytes and looks the row up by hash before
+ *     uploading. Re-running the importer against the same WXR
+ *     therefore reuses existing `np_media` rows instead of
+ *     producing byte-identical duplicates.
+ *
+ * `MediaUploadInput` is the payload handed to
+ * `MediaPipelineDeps.upload` for each downloaded blob.
+ */
+interface MediaUploadInput {
+    buffer: Buffer;
+    originalFilename: string;
+    mimeType: string;
+}
+interface MediaPipelineDeps {
+    /** Fetch the bytes for a source URL. Optional; defaults to `downloadMedia`. */
+    download?: (url: string) => Promise<{
+        buffer: Buffer;
+        mimeType: string;
+        filename: string;
+    }>;
+    /**
+     * Push a downloaded blob through the framework's media service.
+     * Required. Receives a `MediaUploadInput` and returns the new
+     * media row id.
+     */
+    upload: (input: MediaUploadInput) => Promise<{
+        id: string;
+    }>;
+    /**
+     * Phase 21.13 — when supplied, the pipeline asks the caller to
+     * look up an existing `np_media` row by SHA-256 hash before
+     * uploading. Returning a row reuses it (cross-run idempotency);
+     * returning null falls through to upload as normal.
+     */
+    findExistingByHash?: (sha256: string) => Promise<{
+        id: string;
+    } | null>;
+}
+interface MediaResolution {
+    byUrl: Map<string, string>; // source URL → NexPress media id
+    byAttachmentId: Map<number, string>; // WP attachment id → NexPress media id
+}
+interface MediaPipelineError { // One entry of `MediaPipelineReport.errors`.
+    url: string; // the media URL the error is reported for
+    reason: string;
+}
+interface MediaPipelineReport { // Result of `runMediaPipeline`.
+    resolution: MediaResolution;
+    uploaded: number;
+    skipped: number; // NOTE(review): what counts as "skipped" isn't stated in this declaration — confirm.
+    /** Phase 21.13 — count of URLs whose bytes matched an existing media row. */
+    reused: number;
+    errors: MediaPipelineError[];
+}
+interface MediaPipelineOptions {
+    /** Set true to walk URLs without downloading or uploading anything. */
+    dryRun?: boolean;
+    /** Optional progress sink; receives one line of text per message. */
+    log?: (line: string) => void;
+    /**
+     * Phase 21.13 — concurrent-download cap per source-URL host.
+     * Default 4 mirrors the design doc §6 recommendation; tests pass
+     * 1 to keep ordering deterministic.
+     */
+    perHostConcurrency?: number;
+}
+/**
+ * Walk every record's media refs + the attachment index, fetch each
+ * unique URL once, and stamp the new media id back into the
+ * resolution map.
+ *
+ * The pipeline is best-effort: a single 404 doesn't abort the run.
+ * The error list surfaces what was missed so the operator can chase
+ * it up. Callers (the applier) treat unresolved URLs as "leave the
+ * Lexical src as-is and render a broken link" — same as design §6.
+ *
+ * @param bundle - Parsed bundle whose records supply the media refs.
+ * @param attachments - Index built by `buildAttachmentIndex`.
+ * @param deps - Download / upload / hash-lookup hooks.
+ * @param options - Dry-run flag, progress sink, per-host concurrency.
+ * @returns Counts, the URL / attachment-id resolution maps, and the
+ *   list of per-URL errors.
+ */
+declare function runMediaPipeline(bundle: WpImportBundle, attachments: AttachmentIndex, deps: MediaPipelineDeps, options?: MediaPipelineOptions): Promise<MediaPipelineReport>;
+/**
+ * Phase 21.8 — wire WP authors to NexPress staff users.
+ *
+ * The applier hands the resolver callback an `AuthorResolveInput` —
+ * the `<dc:creator>` login plus the matching parsed `<wp:author>`
+ * entry when one exists — and expects either a NexPress user
+ * id or `null` (skip — post lands without an author and is
+ * attributed to the import actor by default).
+ *
+ * Default semantics on the shim side (design §7):
+ *
+ *   - Find a user by `login` (matched against the `name` column or
+ *     a stored handle); if missing, create one with role `viewer`
+ *     and an email flagged so it doesn't collide with a real
+ *     account (`<original>+wp-import@<domain>`).
+ *   - The opt-out branch (CLI `--no-create-authors`) returns null
+ *     for every author, so posts get the import actor.
+ */
+interface AuthorResolveInput {
+    /** From <dc:creator> on the WP post. */
+    wpAuthorLogin: string;
+    /** The matching WXR <wp:author> entry, when one exists. May be undefined. */
+    wpAuthor: WpAuthor | undefined;
+}
+interface AuthorResolver { // Caller-supplied hook: resolve `{ id }` for the NexPress user, or null to skip the author.
+    resolveAuthor: (input: AuthorResolveInput) => Promise<{
+        id: string;
+    } | null>;
+}
+interface AuthorResolution {
+    /** WP login → NexPress user id. */
+    authorIds: Map<string, string>;
+    /** Logins the resolver explicitly skipped (returned null). */
+    skipped: string[];
+    /** Logins where the resolver threw, each with the recorded reason. */
+    errors: Array<{
+        login: string;
+        reason: string;
+    }>;
+}
+/**
+ * Resolve every unique WP author login that appears on a non-
+ * attachment record once. Returns the lookup the applier uses when
+ * stamping `data.author` per record.
+ *
+ * @param bundle - Parsed bundle supplying the records and authors.
+ * @param resolver - Caller-supplied hook invoked once per unique login.
+ * @returns Resolved ids plus the logins that were skipped or errored.
+ */
+declare function resolveAuthors(bundle: WpImportBundle, resolver: AuthorResolver): Promise<AuthorResolution>;
+/**
+ * Phase 21.14 — resume marker.
+ *
+ * The applier writes the ids of every successfully imported entity
+ * to a sidecar JSON file next to the WXR. On re-run the file is
+ * read first, and the applier:
+ *
+ *   - Skips documents whose `(collection, slug)` is already in the
+ *     marker — the import respects the previous run's mapping
+ *     instead of re-querying the DB.
+ *   - Skips comments whose `wpCommentId` is already in the marker
+ *     — closes the design-§9 idempotency gap (re-runs no longer
+ *     create duplicate `np_comments` rows).
+ *   - Reuses media-id mappings so a partial-failure mid-pipeline
+ *     doesn't have to re-download the bytes that already landed.
+ *
+ * The marker is opt-in: callers that don't pass `resume` deps get
+ * the historical "skip on slug" behavior. Operators who want crash-
+ * recovery pass `--resume` to the CLI.
+ *
+ * Schema is versioned so future shape changes can migrate
+ * forward; today's shape is `version: 1`.
+ */
+interface ResumeState {
+    version: 1;
+    /** WXR path the marker was first written for — sanity check. */
+    source: string;
+    startedAt: string;
+    updatedAt: string;
+    /** `${collection}/${slug}` → NexPress doc id. */
+    documents: Record<string, string>;
+    /**
+     * WP comment id (numeric) → NexPress np_comments.id.
+     * NOTE(review): the marker is persisted as JSON, where object keys
+     * are strings — confirm lookups don't depend on numeric key identity.
+     */
+    comments: Record<number, string>;
+    /** WP author login → NexPress np_users.id. */
+    authors: Record<string, string>;
+    /** WP attachment URL → NexPress np_media.id. */
+    media: Record<string, string>;
+    /** `${taxonomy}:${slug}` → NexPress taxonomy term id. */
+    taxonomies: Record<string, string>;
+}
+interface ResumeDeps {
+    state: ResumeState; // the marker contents that `persist` writes out
+    /**
+     * Persist the current state to disk. Called after each record-
+     * level success so a crash mid-import doesn't lose work. Errors
+     * are surfaced — a marker that can't be written defeats the
+     * whole point.
+     */
+    persist: () => void;
+}
+declare function emptyResumeState(source: string): ResumeState; // NOTE(review): presumably a fresh marker with empty maps, tagged with `source` — confirm.
+declare class ResumeStateError extends Error { // NOTE(review): presumably what `loadResumeState` throws for a malformed marker — confirm.
+    constructor(message: string);
+}
+/**
+ * Load a resume state file from disk. Missing file → fresh state.
+ * Malformed file → throw so the operator can decide whether to
+ * delete the marker or fix it (silently deleting it in the importer
+ * would erase the prior run's progress, which is the wrong default).
+ *
+ * @param path - Location of the marker file.
+ * @param source - WXR path the marker belongs to (see
+ *   `ResumeState.source`). NOTE(review): presumably seeds a fresh
+ *   state and sanity-checks a loaded one — confirm.
+ * @throws When the file exists but is malformed (presumably a
+ *   `ResumeStateError` — confirm).
+ */
+declare function loadResumeState(path: string, source: string): ResumeState;
+declare function persistResumeState(path: string, state: ResumeState): void; // Presumably writes `state` to the marker file at `path` — confirm against the implementation.
+declare function documentKey(collection: string, slug: string): string; // Presumably builds the `${collection}/${slug}` key used by `ResumeState.documents` — confirm.
+/**
+ * Phase 21.7 — wire WP comments into NexPress's `np_comments` plus
+ * per-author imported members.
+ *
+ * The applier owns sequencing — it inserts each post first, then
+ * walks the post's comments and asks the caller's hooks to:
+ *
+ *   1. Resolve a comment author to a NexPress member id, creating
+ *      one with `status='imported'` if needed.
+ *   2. Insert the comment row directly (no spam/profanity check,
+ *      no notification fan-out — it's archived content).
+ *
+ * Comments that aren't approved in the WXR (`<wp:comment_approved>`
+ * != "1") are dropped. Reply parents are resolved within the same
+ * post: WP comment ids form a per-post tree, so we walk the comment
+ * list in id-ascending order and consult an in-memory map.
+ *
+ * `ImportedMemberInput` is what `CommentDeps.ensureImportedMember`
+ * receives for a comment author.
+ */
+interface ImportedMemberInput {
+    handle: string;
+    email: string | null;
+    displayName: string;
+}
+interface CommentInsertInput { // Row payload handed to `CommentDeps.insertComment`.
+    targetType: string;
+    targetId: string;
+    parentId: string | null;
+    memberId: string;
+    bodyMd: string;
+    bodyHtml: string; // NOTE(review): presumably the output of `CommentDeps.renderBody` — confirm.
+    createdAt: Date;
+}
+interface CommentDeps { // Caller-supplied hooks for the comment import.
+    ensureImportedMember: (input: ImportedMemberInput) => Promise<{
+        id: string;
+    }>;
+    insertComment: (input: CommentInsertInput) => Promise<{
+        id: string;
+    }>;
+    /**
+     * Render a comment body to safe HTML. The shim plugs in the
+     * framework's `renderCommentMarkdown`; tests pass a passthrough.
+     * Synchronous, unlike the two hooks above.
+     */
+    renderBody: (source: string) => string;
+}
+interface CommentImportPlan { // Running counters; `importPostComments` mutates this in place.
+    applied: number;
+    skippedUnapproved: number; // comments dropped because <wp:comment_approved> != "1"
+    skippedNoMember: number;
+    /** Phase 21.14 — comments the resume marker said were already imported. */
+    skippedByResume: number;
+    errors: Array<{
+        wpCommentId: number;
+        reason: string;
+    }>;
+}
+/**
+ * Walk a record's comments and import them. Mutates `plan` in place.
+ * Returns when the record's comments have all been processed.
+ *
+ * `postId` / `collection` identify the already-inserted NexPress
+ * document the comments attach to (NOTE(review): presumably passed
+ * through as `CommentInsertInput.targetId` / `targetType` — confirm).
+ */
+declare function importPostComments(args: {
+    record: WpImportRecord;
+    postId: string;
+    collection: string;
+    deps: CommentDeps;
+    plan: CommentImportPlan;
+    log?: (line: string) => void;
+    /** Phase 21.14 — when supplied, dedupes by `wpCommentId`. */
+    resume?: ResumeDeps;
+}): Promise<void>;
+declare function emptyCommentPlan(): CommentImportPlan; // NOTE(review): presumably a plan with zeroed counters and no errors — confirm.
+/**
+ * Phase 21.6 — wire WP `category` and `post_tag` term references
+ * onto NexPress posts via a caller-supplied resolver.
+ *
+ * The resolver decides where terms physically live. The reference
+ * app's shim points it at the `taxonomies` collection (one row per
+ * unique `(taxonomy, slug)` pair); user projects with their own
+ * taxonomy storage swap in a different resolver. The wp-import
+ * package never reaches into a collection by name.
+ *
+ * Behavior:
+ *
+ *   - The applier collects every term across the whole bundle, runs
+ *     each through the resolver once, then maps back per-post.
+ *   - A resolver that throws or returns `null` for a given term is
+ *     captured as an error/note in the report; the post still
+ *     imports without that term wired.
+ *   - Without a resolver the applier records a single notes line
+ *     (one per import) — no per-record noise — and posts go in
+ *     without `categories` / `tags` set.
+ *
+ * `TaxonomyKey` is the term description handed to the resolver; it
+ * declares the same three fields as `WpTerm`.
+ */
+interface TaxonomyKey {
+    taxonomy: string;
+    slug: string;
+    name: string;
+}
+interface TaxonomyResolver {
+    /**
+     * Look up the taxonomy term row by `(taxonomy, slug)`, creating
+     * one if it doesn't exist. Returns the row's NexPress id, or
+     * `null` if the resolver decided to skip this term (e.g. the
+     * project doesn't track that taxonomy). The applier runs each
+     * unique term through this hook once.
+     */
+    findOrCreate: (input: TaxonomyKey) => Promise<{
+        id: string;
+    } | null>;
+}
+interface TaxonomyResolution {
+    /** `taxonomy:slug` → NexPress term id (key format presumably from `termCacheKey` — confirm). */
+    termIds: Map<string, string>;
+    /** Resolver failures, each with the term and the recorded reason. */
+    errors: Array<{
+        key: TaxonomyKey;
+        reason: string;
+    }>;
+    /** Terms the resolver explicitly skipped (returned null). */
+    skipped: TaxonomyKey[];
+}
+declare function termCacheKey(taxonomy: string, slug: string): string; // Presumably builds the `taxonomy:slug` key used by `TaxonomyResolution.termIds` — confirm.
+/**
+ * Walk every record's terms and the channel-level term list,
+ * collapse them to a unique set, and resolve each through the
+ * caller's resolver. Returns a lookup the applier can use when
+ * building per-record `categories` / `tags` field values.
+ *
+ * @param records - Records whose `terms` are collected.
+ * @param channelTerms - Channel-level terms (e.g. `WpImportBundle.terms`).
+ * @param resolver - Caller-supplied find-or-create hook.
+ * @returns Resolved term ids plus the terms that errored or were skipped.
+ */
+declare function resolveTaxonomies(records: WpImportRecord[], channelTerms: WpTerm[], resolver: TaxonomyResolver): Promise<TaxonomyResolution>;
+/**
+ * Per-record helper — returns the resolved `(categoryIds, tagIds)`
+ * pair for a record. Anything outside the two built-in WP taxonomies
+ * is dropped; the applier mirrors WP's own admin which only renders
+ * `category` and `post_tag` on the post edit screen.
+ *
+ * @param record - Record whose `terms` are mapped.
+ * @param resolution - Lookup produced by `resolveTaxonomies`.
+ * @returns Term ids for the record's `category` terms and its
+ *   `post_tag` terms, as two separate arrays.
+ */
+declare function pickPostTermIds(record: WpImportRecord, resolution: TaxonomyResolution): {
+    categoryIds: string[];
+    tagIds: string[];
+};
+/**
+ * Phase 21.4 — write posts and pages from a parsed bundle into
+ * NexPress collections via the framework's `saveDocument` pipeline.
+ *
+ * Phase 21.5 layer: when a `media` deps object is supplied, the
+ * applier first walks every attachment + inline `<img>` URL through
+ * the media download/upload pipeline. The resulting URL → media-id
+ * map is then stitched into Lexical body content and the post's
+ * `coverImage` upload field. Without `media` the applier still
+ * runs end-to-end, but image refs render as raw source-URL `<img>`
+ * tags (the pre-21.5 behavior).
+ *
+ * Phase 21.6 layer: when a `taxonomies` resolver is supplied, the
+ * applier resolves every WP `category` / `post_tag` term once
+ * through the resolver and stamps the resulting NexPress term ids
+ * onto each post's `categories` / `tags` relationship fields. The
+ * reference app's shim points the resolver at a `taxonomies`
+ * collection; user projects with their own taxonomy storage swap
+ * in their own resolver. Without a resolver the applier records a
+ * single notes line and posts go in without their terms wired.
+ *
+ * Phase 21.7 layer: when a `comments` deps object is supplied, the
+ * applier walks each freshly created post's comments, find-or-
+ * creates an `imported` member per author, and inserts each comment
+ * directly into `np_comments` via the deps. Spam/profanity adapters
+ * and notification fan-out are bypassed — this is archived content,
+ * not new community activity.
+ *
+ * Phase 21.8 layer: when an `authors` resolver is supplied, every
+ * unique `<dc:creator>` login is resolved once into a NexPress
+ * user id and stamped onto the post's `author` relationship field.
+ * The shim's default resolver creates a `role: "viewer"` user with
+ * a flagged email so the operator can promote them after the
+ * import. The `--no-create-authors` opt-out swaps in a resolver
+ * that returns `null` for every login — posts then go in without
+ * an author and the import actor takes the credit via
+ * `createdBy` / `updatedBy`.
+ *
+ * Phase 21.9 layer: custom post types are routed through the
+ * operator-supplied `collectionMappings` table. Records whose type
+ * has no mapping are still skipped with a warning in the report.
+ */
+interface ApplyOptions {
+    /** Staff user that the import is attributed to. Required by `saveDocument`. */
+    actor: NpAuthUser;
+    /** Set true for a dry run — counts what would happen without writing. */
+    dryRun?: boolean;
+    /** Optional sink for per-record progress messages. Defaults to no-op. */
+    log?: (line: string) => void;
+    /**
+     * When supplied, runs the Phase 21.5 media pipeline before writing
+     * any record. Omit to skip media handling entirely (the applier
+     * leaves Lexical image src as the original WP URL).
+     */
+    media?: MediaPipelineDeps;
+    /**
+     * Phase 21.6 — when supplied, the applier resolves every WP
+     * category/tag through this hook and stamps the resulting ids
+     * onto post `categories` / `tags` fields. Omit to skip term
+     * wiring (the report surfaces a one-line note about it).
+     */
+    taxonomies?: TaxonomyResolver;
+    /**
+     * Phase 21.7 — when supplied, every newly created post's WP
+     * comments are imported as `np_comments` rows, with imported
+     * members find-or-created via the deps. Omit to skip comment
+     * imports entirely.
+     */
+    comments?: CommentDeps;
+    /**
+     * Phase 21.8 — when supplied, the applier resolves each WP
+     * author once and stamps the resulting NexPress user id onto
+     * the post's `author` relationship field. Without it the
+     * dropped-author note continues to surface and posts come in
+     * without an author wired.
+     */
+    authors?: AuthorResolver;
+    /**
+     * Phase 21.9 — operator-supplied mapping table for custom WP
+     * post types. Without an entry here the applier skips the
+     * record with a warning. Keys are WP `<wp:post_type>` values
+     * (e.g. `"product"`); values declare the NexPress collection to
+     * route the record into and an optional `fieldOverrides` map
+     * that maps WP post-meta keys onto NexPress collection field
+     * names. Built-in post / page mappings are always applied first
+     * and take precedence — this option only widens the routing
+     * table, it doesn't override the defaults.
+     */
+    collectionMappings?: Record<string, CollectionMapping>;
+    /**
+     * Phase 21.10 — when supplied, the applier emits an audit event
+     * for every document it writes (action `import.wp.applied`),
+     * skips for already-imported slugs (`import.wp.skipped`), and
+     * record-level errors (`import.wp.error`). The shim wires this
+     * to `recordAuditEvent` from `@nexpress/core` so the entries
+     * land in `np_audit_events` alongside the rest of the operator
+     * trail. Audit failures NEVER abort the import — that is part
+     * of the `AuditDeps` contract.
+     */
+    audit?: AuditDeps;
+    /**
+     * Phase 21.11 — when supplied, the applier writes the original
+     * WP author display name (or login when no display name is set)
+     * to the named field on every imported document, so the byline
+     * is preserved even when `--no-create-authors` strips the
+     * `np_users` link. Operators add a matching `text` field to
+     * their collection and declare the mapping here, e.g.:
+     *
+     *   { posts: "wpOriginalAuthor" }
+     *
+     * Collections without an entry skip the field write — the
+     * applier only touches columns the operator opted into, so
+     * existing schemas keep round-tripping unchanged.
+     */
+    preserveOriginalAuthor?: Record<string, string>;
+    /**
+     * Phase 21.14 — when supplied, the applier reads + writes a
+     * resume marker so re-runs skip work that already landed.
+     * Documents are matched by `(collection, slug)`, comments by WP
+     * comment id. The marker is persisted after each record-level
+     * success; a crash mid-import resumes from the last persisted
+     * row instead of starting over.
+     */
+    resume?: ResumeDeps;
+    /**
+     * Phase 21.12 — when true, the applier rewrites content for
+     * documents whose slug already exists instead of skipping them.
+     * The existing `np_c_*` row keeps its id (so revisions and
+     * `np_media_refs` pointers stay intact); the `data` payload —
+     * title, content, excerpt, coverImage, taxonomies, author,
+     * publishedAt — is overwritten. Comments are NOT re-imported on
+     * an update pass; existing rows under that document stay put.
+     * Without the flag the historical skip-on-collision behavior
+     * holds.
+     */
+    update?: boolean;
+    /**
+     * Phase 21.12 — when true, downstream warnings escalate to
+     * `errors` so the CLI exits non-zero. Specifically: any media
+     * pipeline error (4xx, MIME reject, missing attachment) and
+     * any taxonomy / author resolver failure becomes a record-level
+     * error rather than a soft note. Useful for migration scripts
+     * that need a clean import or nothing — the operator wants the
+     * pipeline to abort rather than silently skip an asset.
+     */
+    strict?: boolean;
+    /**
+     * Phase 21.12 — when supplied, the applier emits a side-by-side
+     * conversion sample for every imported record so an operator can
+     * spot-check the WP HTML → Lexical roundtrip. The deps object
+     * receives the source content + the resulting Lexical AST; the
+     * shim writes them out as an HTML diff page next to the WXR.
+     */
+    reportHtml?: ReportHtmlDeps;
+}
+interface ReportHtmlDeps { // Sink for the per-record WP HTML -> Lexical conversion samples (Phase 21.12).
+    emit: (sample: { // Called with one conversion sample; returns nothing.
+        wpId: number; // WP post id (`<wp:post_id>`).
+        wpType: string; // WP post type (`<wp:post_type>`).
+        slug: string;
+        title: string;
+        rawContent: string; // Source content as it came from WP.
+        lexical: LexicalRoot; // Resulting Lexical AST for that content.
+    }) => void;
+}
+interface AuditDeps { // Audit hook supplied as `ApplyOptions.audit`; that option's docs state audit failures never abort the import.
+    record: (event: { // Records one audit event; asynchronous (returns a Promise).
+        action: string; // Event name. NOTE(review): the set of valid action strings is not declared here.
+        targetType?: string; // Optional — presumably the kind of entity acted on; verify against callers.
+        targetId?: string; // Optional — presumably the id of that entity; verify against callers.
+        payload?: Record<string, unknown>; // Optional free-form extra data attached to the event.
+    }) => Promise<void>;
+}
+interface CollectionMapping { // Mapping target for one WP post type (the Phase 21.9 custom-post-type mappings).
+    collection: string; // NexPress collection name, e.g. "products" or "events" in the config-file example.
+    /**
+     * Maps WP `<wp:postmeta>` keys to NexPress collection field
+     * names. Each mapped meta value is copied verbatim onto the
+     * document data; values are not coerced (the framework's Zod
+     * validation will reject mismatched types and surface a
+     * per-record error in the report).
+     */
+    fieldOverrides?: Record<string, string>;
+}
+interface AppliedRow { // One entry of `ApplyReport.applied`.
+    wpId: number; // WP post id (`<wp:post_id>`).
+    wpType: string; // WP post type (`<wp:post_type>`).
+    collection: string; // NexPress collection for this row — presumably where the document landed; verify.
+    slug: string;
+    title: string;
+    /** Set when the post had a featured image and the applier wired `coverImage`. */
+    coverImageId?: string;
+    /** Phase 21.6 — taxonomy ids attached to this row's `categories` field. */
+    categoryIds?: string[];
+    /** Phase 21.6 — taxonomy ids attached to this row's `tags` field. */
+    tagIds?: string[];
+    /** Phase 21.8 — NexPress user id stamped onto the row's `author` field. */
+    authorId?: string;
+}
+interface SkippedRow { // One entry of `ApplyReport.skipped`.
+    wpId: number; // WP post id (`<wp:post_id>`).
+    wpType: string; // WP post type (`<wp:post_type>`).
+    slug: string;
+    reason: string; // Why the record was skipped — presumably human-readable text (e.g. the slug-collision skip described on `ApplyOptions.update`); verify.
+}
+interface ApplyReport { // Outcome of one `applyBundle()` call.
+    applied: AppliedRow[]; // Rows for records that were applied.
+    skipped: SkippedRow[]; // Rows for records that were skipped, each carrying a `reason`.
+    errors: Array<{ // Record-level failures; `ApplyOptions.strict` documents that escalated warnings land here too.
+        wpId: number;
+        slug: string;
+        message: string;
+    }>;
+    /**
+     * Attachment index built during apply. Phase 21.5 picks it up
+     * to drive the media download/upload pipeline; surfaces here so
+     * the CLI can show a meaningful summary even on a dry run.
+     */
+    attachments: AttachmentIndex;
+    /**
+     * Phase 21.5 media-pipeline summary. `null` when the caller did
+     * not supply a `media` deps object — the report still renders
+     * cleanly with a "media pipeline not run" line.
+     */
+    media: MediaPipelineReport | null;
+    /**
+     * Phase 21.6 — resolved-taxonomy summary. `null` when the caller
+     * didn't supply a `taxonomies` resolver. Useful for audits and
+     * for the CLI to render the term-resolution outcome.
+     */
+    taxonomies: TaxonomyResolution | null;
+    /** Phase 21.7 — comment import outcome. `null` when no comments deps. */
+    comments: CommentImportPlan | null;
+    /**
+     * Phase 21.8 — resolved-author summary. `null` when the caller
+     * didn't supply an `authors` resolver.
+     */
+    authors: AuthorResolution | null;
+    /**
+     * One-time observations the operator should know about — drops
+     * we made silently per-record but want surfaced once aggregated.
+     * Examples: original authors dropped (21.8), `private` status
+     * coerced to draft (design §11.5).
+     */
+    notes: string[];
+}
+declare function applyBundle(bundle: WpImportBundle, options: ApplyOptions): Promise<ApplyReport>; // Applies a parsed bundle according to `options`; resolves to the report for that run.
+/**
+ * Phase 21.3 — `wp-import` CLI runner.
+ *
+ * Returns an exit code instead of calling `process.exit` so the
+ * shim (or a test harness) can decide whether to terminate. The
+ * shim under `apps/web/scripts/wp-import.ts` does the actual exit.
+ *
+ * Today every invocation is a dry run because the applier doesn't
+ * exist yet (Phase 21.4 lands content writes; 21.5 lands media).
+ * `--dry-run` defaults to true; passing `--dry-run=false` prints a
+ * useful "not implemented yet" message rather than silently doing
+ * nothing.
+ *
+ * NOTE(review): this header describes `runCli` (declared further
+ * down), not the `CliIo` interface it sits on. The "applier doesn't
+ * exist yet" paragraph also looks stale now that `applyBundle` is
+ * declared in this same file — confirm against the implementation
+ * before relying on it.
+ */
+interface CliIo { // Output sinks for the CLI; the optional second argument of `runCli`.
+    stdout: (line: string) => void; // Receives normal output text.
+    stderr: (line: string) => void; // Receives error/diagnostic text.
+}
+/**
+ * Hooks the shim plumbs in so the CLI can run the applier without
+ * directly depending on the framework's bootstrap. When omitted —
+ * e.g. CI runs `runCli` from a unit test — the CLI prints the
+ * dry-run summary and ignores `--apply`.
+ */
+interface CliApplyHooks {
+    applyBundle: (bundle: WpImportBundle, ctx: { // Runs the apply pass for a parsed bundle and resolves to its report.
+        actor: NpAuthUser; // User the import runs as — presumably the value produced by `resolveActor`; verify in `runCli`.
+        dryRun: boolean;
+        log: (line: string) => void; // Log sink handed to the hook.
+        /**
+         * Phase 21.8 — author-creation toggle.
+         * NOTE(review): this was documented as "true when the operator
+         * passed `--no-create-authors`", which reads inverted for a field
+         * named `createAuthors`; presumably it is `false` when that flag
+         * is passed. Confirm against the CLI implementation.
+         */
+        createAuthors: boolean;
+        /**
+         * Phase 21.9 — operator-supplied custom-post-type mappings.
+         * Empty object when no `--config` file was passed.
+         */
+        collectionMappings: Record<string, CollectionMapping>;
+        /** Phase 21.12 — set when the operator passed `--strict`. */
+        strict: boolean;
+        /** Phase 21.12 — set when the operator passed `--update`. */
+        update: boolean;
+        /**
+         * Phase 21.12 — when set, the operator wants a side-by-side
+         * HTML/Lexical diff written somewhere. The CLI passes the
+         * resolved file path; the shim is responsible for opening
+         * the file and producing the deps. `null` means "don't emit".
+         */
+        reportHtmlPath: string | null;
+        /**
+         * Phase 21.14 — when set, the operator wants a resume marker
+         * loaded + persisted at the named path. `null` means
+         * "don't load or write a marker".
+         */
+        resumeStatePath: string | null;
+    }) => Promise<ApplyReport>;
+    resolveActor: () => Promise<NpAuthUser>; // Asynchronously supplies the acting user; takes no arguments.
+}
+declare function runCli(argv: string[], io?: CliIo, hooks?: CliApplyHooks): Promise<number>; // Resolves to an exit code rather than calling `process.exit` (per the Phase 21.3 CLI-runner notes in this file); `io` and `hooks` are optional.
+/**
+ * Phase 21.3 — render a parsed `WpImportBundle` as a human-readable
+ * summary. Pure function (no IO) so unit tests can pin the exact
+ * shape of the operator-facing output.
+ *
+ * Format priorities, in order:
+ *   1. Confirm what the parser found (counts, site identity).
+ *   2. Surface anything an operator should act on before applying
+ *      (CPTs without mappings, attachments that need media work,
+ *      authors that will get auto-created in 21.8).
+ *   3. Stay narrow — no terminal-width assumptions, ASCII only.
+ *
+ * The output is intentionally line-by-line predictable so this
+ * doubles as a CI fixture in future sub-phases.
+ *
+ * @returns The rendered summary as a single string.
+ */
+declare function formatSummary(args: {
+    bundle: WpImportBundle; // Parsed bundle to summarise.
+    sourcePath: string; // NOTE(review): presumably the path of the WXR file being summarised — confirm.
+    dryRun: boolean;
+}): string;
+/**
+ * Phase 21.4 — render the result of `applyBundle()`. Operator-
+ * facing summary printed after the apply pass finishes.
+ *
+ * @param report Report returned by `applyBundle()`.
+ * @param args.dryRun Whether the run was a dry run. NOTE(review): how
+ *   this changes the rendered text is not visible from the declaration.
+ * @returns The rendered summary text.
+ */
+declare function formatApplyReport(report: ApplyReport, args: {
+    dryRun: boolean;
+}): string;
+/**
+ * Phase 21.9 — parse + validate a wp-import config file.
+ *
+ * The config is a small JSON document. Example:
+ *
+ *   {
+ *     "mappings": [
+ *       { "wpType": "product", "collection": "products" },
+ *       {
+ *         "wpType": "event",
+ *         "collection": "events",
+ *         "fieldOverrides": { "_event_date": "eventDate" }
+ *       }
+ *     ]
+ *   }
+ *
+ * Unknown keys at the top level are ignored so future sub-phases
+ * (resume markers, plugin-supplied overrides) can extend the file
+ * without breaking older importers.
+ */
+interface WpImportConfig { // Result type of `loadConfigFromPath` / `parseConfig`.
+    collectionMappings: Record<string, CollectionMapping>; // Parsed form of the file's `mappings` array — presumably keyed by each entry's `wpType`; verify against `parseConfig`.
+}
+declare class WpImportConfigError extends Error { // Error type for config problems — presumably thrown by `loadConfigFromPath` / `parseConfig` on invalid input; verify.
+    constructor(message: string);
+}
+declare function loadConfigFromPath(path: string): WpImportConfig; // Synchronous: returns the config directly rather than a Promise. Presumably reads `path` and delegates to `parseConfig` — verify.
+declare function parseConfig(source: string, displayPath?: string): WpImportConfig; // Parses + validates config text (`source`). `displayPath` is optional — presumably only used to label errors; verify.
+/**
+ * Phase 21.5 — fetch a media file from a WP source URL.
+ *
+ * The function is intentionally narrow:
+ *
+ *   - It only HTTP(S) GETs the URL the WXR pointed at; we don't
+ *     follow `srcset` or guess `-scaled` variants. WP often has
+ *     half a dozen size variants per image and re-deriving them
+ *     after upload is cheaper (Sharp pipeline runs server-side
+ *     anyway) than mirroring whatever the source happened to
+ *     pre-render.
+ *   - MIME is sniffed from the response `Content-Type` header,
+ *     falling back to `application/octet-stream` if absent.
+ *     The pipeline rejects anything not in the allow-list so a
+ *     server returning text/html (404 page, redirect intercept,
+ *     etc.) doesn't leak through.
+ *   - One retry on network/timeout failure. 4xx is terminal
+ *     (matches the design doc §6 — 404 is treated as a hard skip).
+ *
+ * SSRF guard (#270, #382):
+ *
+ *   - Scheme is restricted to http(s).
+ *   - The hostname is resolved via DNS and every returned address
+ *     is checked against private / loopback / link-local / CGNAT
+ *     / multicast / reserved CIDRs. Any private result rejects
+ *     the URL — we don't fall through to the public IPs because
+ *     a malicious DNS response can rebind between the check and
+ *     the fetch (TOCTOU). For the importer's purposes, hosting
+ *     media on a hostname that *also* has a private A record is
+ *     vanishingly rare; rejecting it is the safer default.
+ *   - The vetted address is then *pinned* on an undici `Agent` so
+ *     `fetch` connects to that exact IP instead of re-resolving
+ *     the hostname (#382). Without pinning, the preflight DNS
+ *     check and the connect-time DNS resolution are independent,
+ *     leaving a DNS-rebinding window where a public answer passes
+ *     the check and a private answer is what gets connected. SNI /
+ *     Host headers stay set to the original hostname so HTTPS cert
+ *     validation still works.
+ *   - Redirects are followed manually (`redirect: "manual"`),
+ *     capped at 3 hops, and each hop re-runs the DNS / private-IP
+ *     check AND re-pins the connect address. The platform `fetch`
+ *     would otherwise silently follow a public-IP 302 to
+ *     `169.254.169.254`.
+ *   - `Content-Length` is checked against `maxBytes` *before* the
+ *     body is read so a slow-read attacker can't tie the worker up
+ *     past the timeout window.
+ *
+ *   The `allowPrivateHosts` option exists for tests and for
+ *   self-hosted deployments where the WXR is genuinely on the
+ *   same private network as the importer.
+ */
+interface DownloadResult { // What `downloadMedia` resolves to.
+    buffer: Buffer$1; // Downloaded bytes (`Buffer` from `node:buffer`).
+    mimeType: string; // From the response `Content-Type` header; the downloader notes give `application/octet-stream` as the fallback.
+    filename: string; // NOTE(review): how this is derived (URL path vs. response header) is not visible here.
+}
+interface DownloadOptions { // Options accepted by `downloadMedia`; every field is optional.
+    /** Override `globalThis.fetch` — used by tests. */
+    fetchImpl?: typeof fetch;
+    /** Override DNS lookup — used by tests to drive private-IP rejection. */
+    dnsLookupImpl?: (hostname: string) => Promise<Array<{
+        address: string;
+        family: number;
+    }>>;
+    /** Per-request timeout in ms. Default 30s. */
+    timeoutMs?: number;
+    /**
+     * How many times to retry network/timeout failures before giving up.
+     * NOTE(review): the downloader notes describe "one retry", so the
+     * default is presumably 1 — confirm against the implementation.
+     */
+    retries?: number;
+    /** Maximum redirect hops. Default 3. Each hop re-validates the host. */
+    maxRedirects?: number;
+    /** Maximum response size in bytes. Default 100 MiB. */
+    maxBytes?: number;
+    /**
+     * Skip the private-IP check. ONLY for tests and self-hosted
+     * deployments where the source server lives on the same
+     * private network as the importer.
+     */
+    allowPrivateHosts?: boolean;
+}
+declare class WpMediaDownloadError extends Error { // Base error type for media-download failures; `WpMediaSsrfError` extends it.
+    readonly url: string; // Presumably the URL whose download failed — verify.
+    readonly status: number | null; // Presumably the HTTP status when one was received, otherwise `null` — verify.
+    constructor(url: string, message: string, status?: number | null);
+}
+/**
+ * Thrown when the URL or any redirect target resolves to a host we
+ * refuse to talk to (private IP, loopback, non-HTTP scheme, etc).
+ * A separate subclass so `downloadMedia`'s retry loop can recognise
+ * it and refuse to retry — re-resolving DNS won't make a `127.0.0.1`
+ * A record any safer.
+ */
+declare class WpMediaSsrfError extends WpMediaDownloadError { // Constructor takes no `status` argument, unlike the base class.
+    constructor(url: string, message: string);
+}
+declare function downloadMedia(url: string, opts?: DownloadOptions): Promise<DownloadResult>; // Fetches `url` under the Phase 21.5 downloader rules documented in this file; `opts` is optional. Presumably rejects with `WpMediaDownloadError` / `WpMediaSsrfError` — verify.
+/**
+ * Resolve the SSRF-related download options from process env.
+ * `runMediaPipeline`'s default download wires this in so a
+ * self-hosted operator can opt into private-network imports
+ * without having to pass `deps.download` themselves.
+ *
+ *   - `NP_WP_IMPORT_ALLOW_PRIVATE_HOSTS=1` (or `true`) skips the
+ *     DNS / private-IP rejection step. ONLY use this when the
+ *     source WXR genuinely lives on the same trusted private
+ *     network as the importer.
+ *   - `NP_WP_IMPORT_MAX_BYTES=<int>` overrides the 100 MiB
+ *     body-size cap. Bumping this is the right knob for sites
+ *     with large video assets.
+ *
+ * Invalid values (non-numeric `MAX_BYTES`, etc.) fall back to
+ * the secure defaults silently — the goal is "don't refuse to
+ * boot," not "reward typos with bigger surface area."
+ *
+ * @param env Environment map to read the two variables from.
+ *   NOTE(review): presumably defaults to `process.env` when
+ *   omitted — confirm.
+ * @returns Download options derived from those variables.
+ */
+declare function resolveEnvDownloadOptions(env?: NodeJS.ProcessEnv): DownloadOptions;
+/**
+ * The framework's upload routes accept image/*, video/*, and
+ * application/pdf. Mirror that here so the importer doesn't push
+ * anything through that the upload route would reject.
+ *
+ * @param mimeType MIME type string to check.
+ * @returns Whether the importer treats `mimeType` as allowed.
+ *   NOTE(review): exact matching rules (case, parameters such as
+ *   `; charset=`) are not visible from this declaration.
+ */
+declare function isAllowedMimeType(mimeType: string): boolean;
+/**
+ * Phase 21.5 — replace WP source URLs in a Lexical document with
+ * NexPress media references.
+ *
+ * We patch two things on each `image` node:
+ *
+ *   1. `mediaId` — what `extractMediaIds` in
+ *      `packages/core/src/media/refs.ts` reads to wire `np_media_refs`
+ *      so the document gets blocked from referenced-media deletion.
+ *   2. `src` — what the SSR renderer at
+ *      `packages/editor/src/render-rich-text.tsx` looks at when
+ *      drawing the `<img>`. Setting both keeps the rendered HTML
+ *      stable while also routing through the framework's media
+ *      tracking. The applier sets `src` to the storage URL that the
+ *      media adapter eventually resolves; for now we leave it as the
+ *      WP source URL so themes that haven't migrated to media-id
+ *      resolution still render something.
+ *
+ * URLs that aren't in the resolution map (download 404, MIME
+ * rejected, etc.) are left untouched. Themes will render those as
+ * broken images — same outcome as the design doc §6 prescribes.
+ *
+ * @param root Lexical document whose `image` nodes are patched.
+ * @param resolution Resolution map consulted for each WP source URL.
+ * @returns The rewritten document. NOTE(review): whether `root` is
+ *   mutated in place or copied is not visible from this declaration.
+ */
+declare function rewriteLexicalMedia(root: LexicalRoot, resolution: MediaResolution): LexicalRoot;
+export { type AppliedRow, type ApplyOptions, type ApplyReport, type AttachmentEntry, type AttachmentIndex, type AuditDeps, type AuthorResolution, type AuthorResolveInput, type AuthorResolver, type CliApplyHooks, type CliIo, type CollectionMapping, type CommentDeps, type CommentImportPlan, type CommentInsertInput, type DownloadOptions, type DownloadResult, type ImportedMemberInput, type LexicalRoot, type MediaPipelineDeps, type MediaPipelineError, type MediaPipelineOptions, type MediaPipelineReport, type MediaResolution, type MediaUploadInput, type ReportHtmlDeps, type ResumeDeps, type ResumeState, ResumeStateError, type SkippedRow, type TaxonomyKey, type TaxonomyResolution, type TaxonomyResolver, type WpAuthor, type WpComment, type WpImportBundle, type WpImportConfig, WpImportConfigError, type WpImportRecord, type WpImportStream, WpImportStreamError, type WpImportStreamHeader, type WpImportStreamOptions, WpMediaDownloadError, type WpMediaRef, WpMediaSsrfError, type WpPostStatus, type WpSiteInfo, type WpTerm, applyBundle, buildAttachmentIndex, documentKey, downloadMedia, emptyCommentPlan, emptyResumeState, formatApplyReport, formatSummary, htmlToLexical, importPostComments, isAllowedMimeType, loadConfigFromPath, loadResumeState, parseConfig, parseWxr, parseWxrStream, persistResumeState, pickPostTermIds, resolveAuthors, resolveEnvDownloadOptions, resolveTaxonomies, rewriteLexicalMedia, runCli, runMediaPipeline, termCacheKey };