npm - @mdream/js - Versions diffs - 0.17.0 - Mend

@mdream/js 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

package/LICENSE.md +9 -0
package/README.md +135 -0
package/bin/mdream.mjs +2 -0
package/dist/_chunks/const.mjs +137 -0
package/dist/_chunks/index.d.mts +14 -0
package/dist/_chunks/minimal.d.mts +10 -0
package/dist/_chunks/parse.mjs +1201 -0
package/dist/_chunks/plugins.mjs +791 -0
package/dist/_chunks/resolve-plugins.mjs +302 -0
package/dist/_chunks/src.mjs +344 -0
package/dist/_chunks/types.d.mts +390 -0
package/dist/cli.d.mts +1 -0
package/dist/cli.mjs +27 -0
package/dist/index.d.mts +4 -0
package/dist/index.mjs +7 -0
package/dist/llms-txt.d.mts +89 -0
package/dist/llms-txt.mjs +347 -0
package/dist/negotiate.d.mts +26 -0
package/dist/negotiate.mjs +92 -0
package/dist/parse.d.mts +57 -0
package/dist/parse.mjs +3 -0
package/dist/plugins.d.mts +93 -0
package/dist/plugins.mjs +3 -0
package/dist/preset/minimal.d.mts +2 -0
package/dist/preset/minimal.mjs +34 -0
package/dist/splitter.d.mts +21 -0
package/dist/splitter.mjs +215 -0
package/package.json +93 -0

package/dist/_chunks/types.d.mts ADDED Viewed

@@ -0,0 +1,390 @@
+//#region src/const.d.ts
+declare const TAG_H1 = 7; // Tag ID of the h1 header tag; header tag IDs are accepted by SplitterOptions.headersToSplitOn.
+declare const TAG_H2 = 8; // Tag ID of the h2 header tag.
+declare const TAG_H3 = 9; // Tag ID of the h3 header tag.
+declare const TAG_H4 = 10; // Tag ID of the h4 header tag.
+declare const TAG_H5 = 11; // Tag ID of the h5 header tag.
+declare const TAG_H6 = 12; // Tag ID of the h6 header tag.
+declare const ELEMENT_NODE = 1; // Node.type value for element nodes.
+declare const TEXT_NODE = 2; // Node.type value for text nodes.
+declare const NodeEventEnter$1 = 0; // NodeEvent.type value for "enter" (start tag); re-exported by index.d.mts as NodeEventEnter.
+declare const NodeEventExit$1 = 1; // NodeEvent.type value for "exit" (end tag); re-exported by index.d.mts as NodeEventExit.
+//#endregion
+//#region src/types.d.ts
+/**
+ * Imperative hook-based transform plugins. **JavaScript engine only.**
+ * When transforms are provided, the JS engine is used regardless of engine selection.
+ * For declarative config that works with both engines, use `BuiltinPlugins`. Every hook is optional.
+ */
+interface TransformPlugin {
+  /**
+   * Process a node event before it's handled by the parser; may return a `{ skip }` object (presumably `skip: true` makes the parser skip the node — confirm)
+   */
+  beforeNodeProcess?: (event: NodeEvent, state: MdreamRuntimeState) => undefined | void | {
+    skip: boolean;
+  };
+  /**
+   * Hook that runs when entering a node
+   * @returns String to add to the output, or nothing (`undefined`) when the hook contributes no output
+   */
+  onNodeEnter?: (node: ElementNode, state: MdreamRuntimeState) => string | undefined | void;
+  /**
+   * Hook that runs when exiting a node; same optional-string return type as `onNodeEnter`
+   */
+  onNodeExit?: (node: ElementNode, state: MdreamRuntimeState) => string | undefined | void;
+  /**
+   * Process attributes for a node; returns nothing
+   */
+  processAttributes?: (node: ElementNode, state: MdreamRuntimeState) => void;
+  /**
+   * Process a text node before it's added to the output
+   * @returns Result with content and skip flag, or undefined for no transformation
+   */
+  processTextNode?: (node: TextNode, state: MdreamRuntimeState) => {
+    content: string;
+    skip: boolean;
+  } | undefined;
+}
+/**
+ * Declarative tag override configuration: the object form of a `BuiltinPlugins.tagOverrides` entry; all fields are optional.
+ * An entry may instead be a plain string, which acts as an alias (e.g. `{ "x-heading": "h2" }`).
+ */
+interface TagOverride {
+  enter?: string; // string counterpart of `TagHandler.enter` (presumably literal output on entering the tag — confirm)
+  exit?: string; // string counterpart of `TagHandler.exit` (presumably literal output on exiting the tag — confirm)
+  spacing?: [number, number];
+  isInline?: boolean;
+  isSelfClosing?: boolean;
+  collapsesInnerWhiteSpace?: boolean;
+}
+/**
+ * Frontmatter configuration options (the object form of `BuiltinPlugins.frontmatter`).
+ */
+interface FrontmatterConfig {
+  additionalFields?: Record<string, string>; // NOTE(review): presumably extra key/value pairs added to the frontmatter — confirm
+  metaFields?: string[]; // NOTE(review): presumably names of meta fields to extract — confirm
+  /**
+   * Callback to receive structured frontmatter data.
+   * Called after conversion with the extracted key-value pairs.
+   */
+  onExtract?: (frontmatter: Record<string, string>) => void;
+}
+/**
+ * Declarative configuration for built-in plugins.
+ * Works with both the JavaScript and Rust engines.
+ */
+interface BuiltinPlugins {
+  /** Filter elements by CSS selectors, tag names, or TAG_* constants (hence the `string | number` entries) */
+  filter?: {
+    include?: (string | number)[];
+    exclude?: (string | number)[];
+    processChildren?: boolean; // NOTE(review): effect not visible in these typings — confirm against the filter plugin
+  };
+  /**
+   * Extract frontmatter from HTML head.
+   * - `true`: enable with defaults (`false` or omitted presumably leaves it disabled — confirm)
+   * - `(fm) => void`: enable and receive structured data via callback
+   * - `FrontmatterConfig`: enable with config options and optional callback
+   */
+  frontmatter?: boolean | ((frontmatter: Record<string, string>) => void) | FrontmatterConfig;
+  /** Isolate main content area */
+  isolateMain?: boolean;
+  /** Convert Tailwind utility classes to markdown formatting */
+  tailwind?: boolean;
+  /**
+   * Extract elements matching CSS selectors during conversion.
+   * Each key is a CSS selector; the handler is called for every match and receives an `ExtractedElement`.
+   */
+  extraction?: Record<string, (element: ExtractedElement) => void>;
+  /**
+   * Declarative tag overrides for customizing tag behavior.
+   * String values act as aliases (e.g. `{ "x-heading": "h2" }` makes `<x-heading>` behave like `<h2>`).
+   * Object values (`TagOverride`) override specific handler properties.
+   */
+  tagOverrides?: Record<string, TagOverride | string>;
+}
+/** Individual markdown cleanup switches: the object form of `EngineOptions.clean`, where each flag enables one cleanup feature. */
+interface CleanOptions {
+  /** Strip tracking query parameters (utm_*, fbclid, gclid, etc.) from URLs */
+  urls?: boolean;
+  /** Strip fragment-only links that don't match any heading in the output */
+  fragments?: boolean;
+  /** Strip links with meaningless hrefs (#, javascript:void(0)) → plain text */
+  emptyLinks?: boolean;
+  /** Collapse 3+ consecutive blank lines to 2 */
+  blankLines?: boolean;
+  /** Strip links where text equals URL: [https://x.com](https://x.com) → https://x.com */
+  redundantLinks?: boolean;
+  /** Strip self-referencing heading anchors: ## [Title](#title) → ## Title */
+  selfLinkHeadings?: boolean;
+  /** Strip images with no alt text (decorative/tracking pixels) */
+  emptyImages?: boolean;
+  /** Drop links that produce no visible text: [](url) → nothing */
+  emptyLinkText?: boolean;
+}
+/**
+ * Shared engine options that work with both JS and Rust engines; this is the contract that `MarkdownEngine` methods accept.
+ */
+interface EngineOptions {
+  /**
+   * Origin URL for resolving relative image paths and internal links.
+   */
+  origin?: string;
+  /**
+   * Declarative built-in plugin config. Works with both JS and Rust engines.
+   */
+  plugins?: BuiltinPlugins;
+  /**
+   * Clean up the markdown output. Pass `true` for all cleanup or an object
+   * to enable specific features. Operates as a post-processing step on the
+   * final markdown (sync API only for `fragments`).
+   */
+  clean?: boolean | CleanOptions;
+}
+interface ElementNode extends Node { // Element node: a `Node` extended with tag name, attributes and plugin data.
+  /** Element tag name (for ELEMENT_NODE) */
+  name: string;
+  /** HTML attributes (for ELEMENT_NODE) */
+  attributes: Record<string, string>;
+  /** Custom data added by plugins (redeclares the optional `context` already defined on `Node`) */
+  context?: PluginContext;
+  /** ID of the tag for fast handler lookup */
+  tagId?: number;
+  /** Map of tag names to their nesting count (using Uint8Array for performance) */
+  depthMap: Uint8Array;
+  /** Plugin outputs collected during processing */
+  pluginOutput?: string[];
+}
+interface TextNode extends Node { // Text node: a `Node` extended with its text value.
+  /** Text content (for TEXT_NODE) */
+  value: string;
+  /** Custom data added by plugins (redeclares the optional `context` already defined on `Node`) */
+  context?: PluginContext;
+  /** Whether this text node should be excluded from markdown output (for script/style elements) */
+  excludedFromMarkdown?: boolean;
+}
+/**
+ * Base DOM node interface (declared in module scope, so inside this file it shadows any global `Node` type such as the DOM lib's)
+ * Optimized for streaming HTML parsing with minimal memory footprint
+ */
+interface Node {
+  /** Node type (ELEMENT_NODE or TEXT_NODE) */
+  type: number;
+  /** Current nesting depth in the DOM tree */
+  depth: number;
+  // Note: node exclusion and filtering are now handled by plugins
+  /** Index of this node within its parent's children */
+  index: number;
+  /** Current walk index for child traversal during streaming */
+  currentWalkIndex?: number;
+  /** Count of text child nodes - used for whitespace handling */
+  childTextNodeIndex?: number;
+  /** Whether node contains whitespace - used for whitespace optimization */
+  containsWhitespace?: boolean;
+  /** Cached reference to tag handler for performance */
+  tagHandler?: TagHandler;
+  /** Parent node (optional, and may also be `null`) */
+  parent?: ElementNode | null;
+  /** Custom data added by plugins */
+  context?: PluginContext;
+}
+/**
+ * State interface for HTML parsing and processing
+ * Contains parsing state that's maintained during HTML traversal
+ */
+interface MdreamProcessingState {
+  /** Map of tag names to their current nesting depth - uses TypedArray for performance */
+  depthMap: Uint8Array;
+  /** Current overall nesting depth */
+  depth: number;
+  /** Currently processing element node */
+  currentNode?: ElementNode | null;
+  // Note: node filtering and exclusion are now handled by plugins
+  /** Whether current content contains HTML entities that need decoding */
+  hasEncodedHtmlEntity?: boolean;
+  /** Whether the last processed character was whitespace - for collapsing whitespace */
+  lastCharWasWhitespace?: boolean;
+  /** Whether the last processed buffer has whitespace - optimization flag */
+  textBufferContainsWhitespace?: boolean;
+  /** Whether the last processed buffer contains non-whitespace characters */
+  textBufferContainsNonWhitespace?: boolean;
+  /** Whether a tag was just closed - affects whitespace handling */
+  justClosedTag?: boolean;
+  /** Whether the next text node is the first in its element - for whitespace trimming */
+  isFirstTextInElement?: boolean;
+  /** Reference to the last processed text node - for context tracking (typed as the base `Node`, not `TextNode`) */
+  lastTextNode?: Node;
+  /** Quote state tracking for non-nesting tags - avoids backward scanning */
+  inSingleQuote?: boolean;
+  inDoubleQuote?: boolean; // second of the three quote-state flags (single quote, double quote, backtick)
+  inBacktick?: boolean; // third of the three quote-state flags
+  /** Backslash escaping state tracking - avoids checking previous character */
+  lastCharWasBackslash?: boolean;
+  /** Resolved plugin instances for efficient iteration */
+  resolvedPlugins?: TransformPlugin[];
+  /** Configuration options for conversion */
+  options?: EngineOptions;
+}
+/**
+ * Runtime state for markdown generation
+ * Extended state that includes output tracking and options; every inherited processing-state field is optional here (`Partial`)
+ */
+interface MdreamRuntimeState extends Partial<MdreamProcessingState> {
+  /** Number of newlines at end of most recent output */
+  lastNewLines?: number;
+  /** Configuration options for conversion (also declared on `MdreamProcessingState`) */
+  options?: EngineOptions;
+  /** Table processing state - specialized for Markdown tables */
+  tableRenderedTable?: boolean;
+  tableCurrentRowCells?: number; // part of the table processing state
+  tableColumnAlignments?: string[]; // part of the table processing state
+  /** Resolved plugin instances for efficient iteration (also declared on `MdreamProcessingState`) */
+  resolvedPlugins?: TransformPlugin[];
+  /** Content buffer for markdown output (the only required field of this interface) */
+  buffer: string[];
+  /** Performance cache for last content to avoid iteration */
+  lastContentCache?: string;
+  /** Reference to the last processed node */
+  lastNode?: Node;
+  context?: PluginContext; // plugin data; see `PluginContext`
+}
+type NodeEventEnter = 0; // Literal type of the "enter" event code; same value as the NodeEventEnter$1 constant.
+type NodeEventExit = 1; // Literal type of the "exit" event code; same value as the NodeEventExit$1 constant.
+/**
+ * Node event for DOM traversal
+ * Used in the event-based traversal system for streaming processing
+ */
+interface NodeEvent {
+  /** Event type - enter (start tag, `0`) or exit (end tag, `1`) */
+  type: NodeEventEnter | NodeEventExit;
+  /** The node being processed (typed as the base `Node`) */
+  node: Node;
+}
+/**
+ * Handler context for markdown conversion
+ * Passed to tag handler functions (`TagHandler.enter` / `TagHandler.exit`) for converting specific elements
+ */
+interface HandlerContext {
+  /** Current node being processed */
+  node: ElementNode;
+  /** Parent node (if any); typed without `null`, unlike `Node.parent` */
+  parent?: ElementNode;
+  /** Runtime state */
+  state: MdreamRuntimeState;
+}
+/**
+ * Tag handler interface for HTML elements
+ * Used by plugins to extend or customize tag handling; all members are optional
+ */
+interface TagHandler {
+  enter?: (context: HandlerContext) => string | undefined | void; // optional callback; receives the node, its parent and the runtime state
+  exit?: (context: HandlerContext) => string | undefined | void; // optional callback with the same signature as `enter`
+  isSelfClosing?: boolean;
+  isNonNesting?: boolean;
+  collapsesInnerWhiteSpace?: boolean;
+  isInline?: boolean;
+  spacing?: readonly [number, number]; // readonly pair; `TagOverride.spacing` is the mutable-tuple counterpart
+  excludesTextNodes?: boolean;
+}
+interface TailwindContext { // Tailwind-related plugin data, stored under `PluginContext.tailwind`.
+  hidden?: boolean;
+  prefix?: string; // NOTE(review): presumably markdown emitted before the element's content — confirm
+  suffix?: string; // NOTE(review): presumably markdown emitted after the element's content — confirm
+}
+interface PluginContext { // Plugin data bag attached to nodes and runtime state: known optional fields plus arbitrary extra keys.
+  score?: number;
+  tagCount?: number;
+  linkTextLength?: number;
+  textLength?: number;
+  isHighLinkDensity?: boolean;
+  tailwind?: TailwindContext; // Tailwind-specific data; see `TailwindContext`
+  [key: string]: unknown; // any other plugin-defined key is allowed; values are `unknown` and must be narrowed before use
+}
+/**
+ * Element extracted during conversion by the extraction plugin; passed to the handlers in `BuiltinPlugins.extraction`.
+ */
+interface ExtractedElement {
+  /** The CSS selector that matched this element */
+  selector: string;
+  /** The HTML tag name */
+  tagName: string;
+  /** Accumulated text content of the element */
+  textContent: string;
+  /** HTML attributes of the element */
+  attributes: Record<string, string>;
+}
+/**
+ * Top-level options for the mdream JS engine.
+ * Extends the shared `EngineOptions` (`origin`, `plugins`, `clean`) with JS-specific concerns.
+ */
+interface MdreamOptions extends EngineOptions {
+  /**
+   * Imperative hook-based transform plugins (see `TransformPlugin`).
+   * When provided, a new JS engine is created with these hooks.
+   */
+  hooks?: TransformPlugin[];
+}
+/**
+ * Markdown chunk with content and metadata
+ * Compatible with LangChain Document structure
+ */
+interface MarkdownChunk {
+  /** The markdown content of the chunk */
+  content: string;
+  /** Metadata extracted during chunking */
+  metadata: {
+    /** Header hierarchy at this chunk position */ headers?: Record<string, string>;
+    /** Code block language if chunk is/contains code */ code?: string;
+    /** Line number range in original document */ loc?: {
+      lines: {
+        from: number;
+        to: number;
+      };
+    };
+  };
+}
+/**
+ * Options for HTML to Markdown chunking
+ * Extends EngineOptions with chunking-specific settings; every setting is optional
+ */
+interface SplitterOptions extends EngineOptions {
+  /**
+   * Header tag IDs to split on (TAG_H1, TAG_H2, etc.)
+   * @example [TAG_H1, TAG_H2]
+   * @default [TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6]
+   */
+  headersToSplitOn?: number[];
+  /**
+   * Return each line as individual chunk
+   * @default false
+   */
+  returnEachLine?: boolean;
+  /**
+   * Strip headers from chunk content
+   * @default true
+   */
+  stripHeaders?: boolean;
+  /**
+   * Maximum chunk size (presumably measured with `lengthFunction` — confirm)
+   * @default 1000
+   */
+  chunkSize?: number;
+  /**
+   * Overlap between chunks for context preservation (presumably in the same unit as `chunkSize` — confirm)
+   * @default 200
+   */
+  chunkOverlap?: number;
+  /**
+   * Function to measure chunk length (default: character count)
+   * Can be replaced with token counter for LLM applications
+   * @default (text) => text.length
+   */
+  lengthFunction?: (text: string) => number;
+  /**
+   * Keep separators in the split chunks
+   * @default false
+   */
+  keepSeparator?: boolean;
+}
+//#endregion
+export { TAG_H3 as C, TEXT_NODE as D, TAG_H6 as E, TAG_H2 as S, TAG_H5 as T, TransformPlugin as _, ExtractedElement as a, NodeEventExit$1 as b, MdreamOptions as c, NodeEvent as d, PluginContext as f, TextNode as g, TagOverride as h, EngineOptions as i, MdreamRuntimeState as l, TagHandler as m, CleanOptions as n, FrontmatterConfig as o, SplitterOptions as p, ElementNode as r, MarkdownChunk as s, BuiltinPlugins as t, Node as u, ELEMENT_NODE as v, TAG_H4 as w, TAG_H1 as x, NodeEventEnter$1 as y };

package/dist/cli.d.mts ADDED Viewed

@@ -0,0 +1 @@
+export { };

package/dist/cli.mjs ADDED Viewed

@@ -0,0 +1,27 @@
+import { n as streamHtmlToMarkdown } from "./_chunks/src.mjs";
+import "./_chunks/const.mjs";
+import "./_chunks/parse.mjs";
+import "./_chunks/resolve-plugins.mjs";
+import "./_chunks/plugins.mjs";
+import { withMinimalPreset } from "./preset/minimal.mjs";
+import { readFileSync } from "node:fs";
+import { Readable } from "node:stream";
+import { fileURLToPath } from "node:url";
+import { cac } from "cac";
+import { dirname, join } from "pathe";
+//#region src/cli.ts
+async function streamingConvert(options = {}) {
+	let conversionOptions = { origin: options.origin };
+	if (options.preset === "minimal") conversionOptions = withMinimalPreset(conversionOptions);
+	const markdownGenerator = streamHtmlToMarkdown(Readable.toWeb(process.stdin), conversionOptions);
+	for await (const markdownChunk of markdownGenerator) if (markdownChunk && markdownChunk.length > 0) process.stdout.write(markdownChunk);
+}
+const packageJsonPath = join(dirname(fileURLToPath(import.meta.url)), "..", "package.json");
+const packageJson = JSON.parse(readFileSync(packageJsonPath, "utf-8"));
+const cli = cac();
+cli.command("[options]", "Convert HTML from stdin to Markdown on stdout (JS engine)").option("--origin <url>", "Origin URL for resolving relative image paths").option("--preset <preset>", "Conversion presets: minimal").action(async (_, opts) => {
+	await streamingConvert(opts);
+});
+cli.help().version(packageJson.version).parse();
+//#endregion
+export {};

package/dist/index.d.mts ADDED Viewed

@@ -0,0 +1,4 @@
+import { C as TAG_H3, D as TEXT_NODE, E as TAG_H6, S as TAG_H2, T as TAG_H5, _ as TransformPlugin, a as ExtractedElement, b as NodeEventExit, c as MdreamOptions, d as NodeEvent, f as PluginContext, g as TextNode, h as TagOverride, i as EngineOptions, n as CleanOptions, o as FrontmatterConfig, p as SplitterOptions, r as ElementNode, s as MarkdownChunk, t as BuiltinPlugins, u as Node, v as ELEMENT_NODE, w as TAG_H4, x as TAG_H1, y as NodeEventEnter } from "./_chunks/types.mjs";
+import { n as streamHtmlToMarkdown, r as createPlugin, t as htmlToMarkdown } from "./_chunks/index.mjs";
+import { t as withMinimalPreset } from "./_chunks/minimal.mjs";
+export { BuiltinPlugins, CleanOptions, ELEMENT_NODE, ElementNode, EngineOptions, ExtractedElement, FrontmatterConfig, MarkdownChunk, MdreamOptions, Node, NodeEvent, NodeEventEnter, NodeEventExit, PluginContext, SplitterOptions, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TEXT_NODE, TagOverride, TextNode, TransformPlugin, createPlugin, htmlToMarkdown, streamHtmlToMarkdown, withMinimalPreset };

package/dist/index.mjs ADDED Viewed

@@ -0,0 +1,7 @@
+import { n as streamHtmlToMarkdown, t as htmlToMarkdown } from "./_chunks/src.mjs";
+import { c as NodeEventExit, d as TAG_H2, f as TAG_H3, g as TEXT_NODE, h as TAG_H6, m as TAG_H5, p as TAG_H4, r as ELEMENT_NODE, s as NodeEventEnter, u as TAG_H1 } from "./_chunks/const.mjs";
+import "./_chunks/parse.mjs";
+import "./_chunks/resolve-plugins.mjs";
+import { s as createPlugin } from "./_chunks/plugins.mjs";
+import { withMinimalPreset } from "./preset/minimal.mjs";
+export { ELEMENT_NODE, NodeEventEnter, NodeEventExit, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TEXT_NODE, createPlugin, htmlToMarkdown, streamHtmlToMarkdown, withMinimalPreset };

package/dist/llms-txt.d.mts ADDED Viewed

@@ -0,0 +1,89 @@
+//#region src/llms-txt.d.ts
+/**
+ * Link in llms.txt section (an entry of `LlmsTxtSection.links`)
+ */
+interface LlmsTxtLink {
+  /** The title of the link */
+  title: string;
+  /** The description of the link (optional) */
+  description?: string;
+  /** The href of the link */
+  href: string;
+}
+/**
+ * Section in llms.txt; supplied through the `sections` option, which is written before the pages
+ */
+interface LlmsTxtSection {
+  /** The title of the section */
+  title: string;
+  /** The description of the section (can be array for multiple paragraphs) */
+  description?: string | string[];
+  /** The links of the section (optional) */
+  links?: LlmsTxtLink[];
+}
+interface LlmsTxtArtifactsOptions { // Options accepted by `generateLlmsTxtArtifacts`.
+  files: ProcessedFile[]; // the pre-processed files to generate artifacts from (the only required option)
+  siteName?: string; // site name for the header
+  description?: string; // site description for the header
+  origin?: string; // origin URL to prepend to relative URLs
+  generateFull?: boolean; // generate llms-full.txt with complete page content
+  generateMarkdown?: boolean; // NOTE(review): presumably controls whether `LlmsTxtArtifactsResult.markdownFiles` is produced — confirm
+  outputDir?: string;
+  /** The sections to write before pages */
+  sections?: LlmsTxtSection[];
+  /** Notes to write at the end (a single string or several) */
+  notes?: string | string[];
+}
+interface ProcessedFile { // One pre-processed page: element type of `LlmsTxtArtifactsOptions.files` and chunk type of the `createLlmsTxtStream` stream.
+  filePath?: string;
+  title: string;
+  content: string;
+  url: string;
+  metadata?: { // optional page metadata; every field inside is optional as well
+    title?: string;
+    description?: string;
+    keywords?: string;
+    author?: string;
+  };
+}
+interface LlmsTxtArtifactsResult { // Resolved value of `generateLlmsTxtArtifacts`.
+  llmsTxt: string; // always present
+  llmsFullTxt?: string; // optional; presumably set only when `generateFull` is enabled — confirm
+  markdownFiles?: { // optional; presumably set only when `generateMarkdown` is enabled — confirm
+    path: string;
+    content: string;
+  }[];
+  processedFiles: ProcessedFile[];
+}
+/**
+ * Main function to generate llms.txt artifacts from pre-processed files (`options.files`); asynchronous, resolves to an `LlmsTxtArtifactsResult`
+ */
+declare function generateLlmsTxtArtifacts(options: LlmsTxtArtifactsOptions): Promise<LlmsTxtArtifactsResult>;
+/**
+ * Options for creating an llms.txt stream: `LlmsTxtArtifactsOptions` without `files` and `generateMarkdown`; the members below redeclare inherited options to document them
+ */
+interface CreateLlmsTxtStreamOptions extends Omit<LlmsTxtArtifactsOptions, 'files' | 'generateMarkdown'> {
+  /** Directory to write files to (defaults to process.cwd()) */
+  outputDir?: string;
+  /** Site name for the header (defaults to 'Site') */
+  siteName?: string;
+  /** Site description for the header */
+  description?: string;
+  /** Origin URL to prepend to relative URLs */
+  origin?: string;
+  /** Generate llms-full.txt with complete page content (defaults to false) */
+  generateFull?: boolean;
+  /** The sections to write before pages */
+  sections?: LlmsTxtSection[];
+  /** Notes to write at the end */
+  notes?: string | string[];
+}
+/**
+ * Create a WritableStream that generates llms.txt artifacts by streaming pages to disk; each chunk written to it is one `ProcessedFile`.
+ *
+ * Writes llms.txt (and optionally llms-full.txt) incrementally as pages are written,
+ * never keeping full content in memory. Creates outputDir recursively if needed.
+ */
+declare function createLlmsTxtStream(options: CreateLlmsTxtStreamOptions): WritableStream<ProcessedFile>;
+//#endregion
+export { CreateLlmsTxtStreamOptions, LlmsTxtArtifactsOptions, LlmsTxtArtifactsResult, LlmsTxtLink, LlmsTxtSection, ProcessedFile, createLlmsTxtStream, generateLlmsTxtArtifacts };