@stainless-api/docs 0.1.0-beta.63 → 0.1.0-beta.64
This diff reflects the changes between these publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
- package/CHANGELOG.md +6 -0
- package/package.json +3 -3
- package/stl-docs/proseSearchIndexing.ts +245 -2
package/CHANGELOG.md
CHANGED
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@stainless-api/docs",
-  "version": "0.1.0-beta.63",
+  "version": "0.1.0-beta.64",
   "publishConfig": {
     "access": "public"
   },
@@ -52,9 +52,9 @@
     "vite-plugin-prebundle-workers": "^0.2.0",
     "web-worker": "^1.5.0",
     "yaml": "^2.8.2",
-    "@stainless-api/docs-ui": "0.1.0-beta.52",
     "@stainless-api/docs-search": "0.1.0-beta.4",
-    "@stainless-api/ui-primitives": "0.1.0-beta.39"
+    "@stainless-api/ui-primitives": "0.1.0-beta.39",
+    "@stainless-api/docs-ui": "0.1.0-beta.52"
   },
   "devDependencies": {
     "@astrojs/check": "^0.9.6",
package/stl-docs/proseSearchIndexing.ts
CHANGED
@@ -5,7 +5,250 @@ import { bold } from '../shared/terminalUtils';
 import { buildProseIndex } from '@stainless-api/docs-search/providers/algolia';
 import * as cheerio from 'cheerio';
 
-function chunkByWords(content: string, chunkSize: number = 30000, chunkOverlap: number = 10) {
+interface ContentBlock {
+  type: 'header' | 'content';
+  tag?: string;
+  id?: string;
+  text: string;
+}
+
+// Chunking configuration
+// We target 64-256 tokens per chunk, using ~1.3 tokens/word for English text
+const TOKENS_PER_WORD = 1.3;
+const MIN_TOKENS = 64;
+const MAX_TOKENS = 256;
+const MIN_WORDS = Math.floor(MIN_TOKENS / TOKENS_PER_WORD); // ~49 words
+const MAX_WORDS = Math.floor(MAX_TOKENS / TOKENS_PER_WORD); // ~197 words
+const LINE_BREAK_WORDS = Math.floor((MAX_TOKENS * 0.75) / TOKENS_PER_WORD); // ~148 words
+const SENTENCE_BREAK_WORDS = Math.floor((MAX_TOKENS * 0.875) / TOKENS_PER_WORD); // ~172 words
+
+// Generate a URL-safe ID from header text (e.g., "OpenAPI Config" -> "openapi-config")
+function slugify(text: string): string {
+  return text
+    .toLowerCase()
+    .replace(/`/g, '') // Remove backticks
+    .replace(/[^a-z0-9]+/g, '-') // Replace non-alphanumeric with hyphens
+    .replace(/^-|-$/g, ''); // Trim leading/trailing hyphens
+}
+
+// Check if a word ends with a real table cell boundary (| but not escaped \|)
+function isTableCellBoundary(word: string): boolean {
+  return word.endsWith('|') && !word.endsWith('\\|');
+}
+
+/**
+ * Chunks content blocks into segments of 64-256 tokens.
+ *
+ * Chunking strategy:
+ * 1. Break at headers if chunk has >= MIN_WORDS, otherwise merge with next section
+ * 2. Prefer breaking at line/table boundaries after LINE_BREAK_WORDS (~148 words / ~192 tokens)
+ * 3. Break at sentence endings after SENTENCE_BREAK_WORDS (~172 words / ~224 tokens)
+ * 4. Force break at MAX_WORDS, preferring table row boundaries if available
+ * 5. Header context (id/tag) is preserved for continuation chunks
+ */
+function chunkByWords(blocks: ContentBlock[]): { content: string; headerId?: string; headerTag?: string }[] {
+  const chunks: { content: string; headerId?: string; headerTag?: string }[] = [];
+
+  let currentChunk: string[] = [];
+  let currentHeaderId: string | undefined;
+  let currentHeaderTag: string | undefined;
+
+  // Flush current chunk to output. If splitAt is provided, keep words after that index for next chunk.
+  const flushChunk = (splitAt?: number) => {
+    if (currentChunk.length === 0) return;
+
+    const wordsToFlush = splitAt !== undefined ? currentChunk.slice(0, splitAt) : currentChunk;
+    const wordsToKeep = splitAt !== undefined ? currentChunk.slice(splitAt) : [];
+
+    if (wordsToFlush.length > 0) {
+      chunks.push({
+        content: wordsToFlush.join(' ').trim(),
+        headerId: currentHeaderId,
+        headerTag: currentHeaderTag,
+      });
+    }
+    currentChunk = wordsToKeep;
+  };
+
+  // Find a table row boundary to break at (between MIN_WORDS and current length)
+  // Returns the index to split at, or undefined if no good boundary found
+  const findTableRowBoundary = (): number | undefined => {
+    for (let i = currentChunk.length - 1; i >= MIN_WORDS; i--) {
+      const word = currentChunk[i]!;
+      const nextWord = currentChunk[i + 1];
+      // A row boundary is where one cell ends (|) and the next row starts (|)
+      if (isTableCellBoundary(word) && nextWord?.startsWith('|')) {
+        return i + 1;
+      }
+    }
+    return undefined;
+  };
+
+  for (const block of blocks) {
+    if (block.type === 'header') {
+      // Flush at header boundaries only if chunk meets minimum size
+      // This avoids creating tiny chunks for headers with little content
+      if (currentChunk.length >= MIN_WORDS) {
+        flushChunk();
+      }
+      currentHeaderId = block.id;
+      currentHeaderTag = block.tag;
+      // Include header text at the start of the new chunk
+      currentChunk.push(...block.text.split(/\s+/).filter((w) => w.length > 0));
+      continue;
+    }
+
+    // Split by newlines first to preserve line boundary information
+    const lines = block.text.split(/\n/);
+    let inCodeBlock = false;
+
+    for (let lineIdx = 0; lineIdx < lines.length; lineIdx++) {
+      const line = lines[lineIdx]!;
+
+      // Track code block boundaries
+      if (/^(`{3,}|~{3,})/.test(line.trim())) {
+        inCodeBlock = !inCodeBlock;
+      }
+
+      // Calculate indentation level (number of leading spaces, treating tabs as 2 spaces)
+      const indentMatch = line.match(/^(\s*)/);
+      const indentLevel = indentMatch ? indentMatch[1]!.replace(/\t/g, '  ').length : 0;
+
+      const words = line.split(/\s+/).filter((w) => w.length > 0);
+      const isLastLine = lineIdx === lines.length - 1;
+
+      for (let wordIdx = 0; wordIdx < words.length; wordIdx++) {
+        const word = words[wordIdx]!;
+        const isEndOfLine = wordIdx === words.length - 1 && !isLastLine;
+
+        if (currentChunk.length >= MAX_WORDS) {
+          flushChunk(findTableRowBoundary());
+        }
+
+        currentChunk.push(word);
+
+        // In code blocks, avoid early flushes to keep blocks together
+        // - Light indentation (2+ spaces): require more words before flushing
+        // - Deep indentation (4+ spaces): skip early flushes entirely
+        const inShallowCode = inCodeBlock && indentLevel >= 2 && indentLevel < 4;
+        const inDeepCode = inCodeBlock && indentLevel >= 4;
+
+        // Flush early at natural break points
+        const len = currentChunk.length;
+        const atTableBreak = len >= LINE_BREAK_WORDS && isTableCellBoundary(word);
+        // Shallow code: only flush at sentence threshold; Deep code: don't flush early
+        const lineBreakThreshold = inShallowCode ? SENTENCE_BREAK_WORDS : LINE_BREAK_WORDS;
+        const atLineBreak = len >= lineBreakThreshold && isEndOfLine && !inDeepCode;
+        const atSentenceBreak = len >= SENTENCE_BREAK_WORDS && /[.!?]["']?$/.test(word) && !inDeepCode;
+        if (atTableBreak || atLineBreak || atSentenceBreak) {
+          flushChunk();
+        }
+      }
+    }
+  }
+
+  flushChunk();
+  return chunks;
+}
+
+/**
+ * Parses markdown into content blocks, identifying headers and content sections.
+ * Tracks fenced code blocks to avoid treating # comments in code as headers.
+ */
+function parseMarkdown(markdown: string): ContentBlock[] {
+  const blocks: ContentBlock[] = [];
+
+  // Extract title from frontmatter and treat it as h1
+  const frontmatterMatch = markdown.match(/^---\n([\s\S]*?)\n---/);
+  if (frontmatterMatch) {
+    const frontmatter = frontmatterMatch[1]!;
+    const titleMatch = frontmatter.match(/^title:\s*(.+)$/m);
+    if (titleMatch) {
+      const title = titleMatch[1]!.trim().replace(/^["']|["']$/g, ''); // Remove quotes if present
+      blocks.push({
+        type: 'header',
+        tag: 'h1',
+        id: slugify(title),
+        text: title,
+      });
+    }
+  }
+
+  // Remove frontmatter
+  const content = markdown.replace(/^---[\s\S]*?---\n*/, '').trim();
+
+  // Split into lines and process
+  const lines = content.split('\n');
+  let currentContent: string[] = [];
+  let inCodeBlock = false;
+
+  const flushContent = () => {
+    const text = currentContent.join('\n').trim();
+    if (text) {
+      blocks.push({ type: 'content', text });
+    }
+    currentContent = [];
+  };
+
+  for (const line of lines) {
+    // Track fenced code blocks (``` or ~~~)
+    // Only match standalone markers: ```[language] with nothing else on the line
+    // This avoids matching inline code blocks in table cells like "``` Then content..."
+    if (/^(`{3,}|~{3,})([a-zA-Z0-9]*)?(\s*)$/.test(line)) {
+      inCodeBlock = !inCodeBlock;
+      currentContent.push(line);
+      continue;
+    }
+
+    // Only match headers outside of code blocks
+    if (!inCodeBlock) {
+      const headerMatch = line.match(/^(#{1,6})\s+(.+)$/);
+
+      if (headerMatch) {
+        flushContent();
+        const level = headerMatch[1]!.length;
+        const headerText = headerMatch[2]!.trim();
+        blocks.push({
+          type: 'header',
+          tag: `h${level}`,
+          id: slugify(headerText),
+          text: headerText,
+        });
+        continue;
+      }
+    }
+
+    currentContent.push(line);
+  }
+
+  flushContent();
+  return blocks;
+}
+
+/**
+ * Extracts and chunks markdown content for search indexing.
+ * Yields chunk objects with content, header context, and chunk metadata.
+ */
+export function* indexMarkdown(markdown: string) {
+  const blocks = parseMarkdown(markdown);
+  const chunks = chunkByWords(blocks);
+  const documentId = crypto.randomUUID();
+
+  for (const [index, chunk] of chunks.entries()) {
+    yield {
+      id: chunk.headerId ?? '',
+      tag: chunk.headerTag ?? '',
+      content: chunk.content,
+      chunk: {
+        id: documentId,
+        index,
+        total: chunks.length,
+      },
+    };
+  }
+}
+
+function chunkHTMLByWords(content: string, chunkSize: number = 30000, chunkOverlap: number = 10) {
   if (Buffer.byteLength(content) < chunkSize) return [content];
 
   const words = content.split(/\s+/);
@@ -42,7 +285,7 @@ export function* indexHTML(content: string, root: string, pattern: string) {
 
   for (const match of matches) {
     const rawText = $(match).text().trim();
-    const chunks =
+    const chunks = chunkHTMLByWords(rawText);
     const chunkId = crypto.randomUUID();
 
     for (const [chunkN, content] of chunks.entries()) {