@stainless-api/docs 0.1.0-beta.89 → 0.1.0-beta.90

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,16 @@
  # @stainless-api/docs
 
+ ## 0.1.0-beta.90
+
+ ### Minor Changes
+
+ - b8f1f3c: improve prose chunking
+
+ ### Patch Changes
+
+ - 3de4232: support additional custom fonts
+ - c77e607: fixes issue where SDK select wasn't showing on README pages
+
  ## 0.1.0-beta.89
 
  ### Minor Changes
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@stainless-api/docs",
-   "version": "0.1.0-beta.89",
+   "version": "0.1.0-beta.90",
    "publishConfig": {
      "access": "public"
    },
@@ -51,7 +51,7 @@
    "remark-gfm": "^4.0.1",
    "remark-github-alerts": "^0.1.1",
    "remark-stringify": "^11.0.0",
-   "shiki": "^3.21.0",
+   "shiki": "^3.22.0",
    "unified": "^11.0.5",
    "vite-plugin-prebundle-workers": "^0.2.0",
    "web-worker": "^1.5.0",
@@ -64,10 +64,10 @@
    "@astrojs/check": "^0.9.6",
    "@markdoc/markdoc": "^0.5.4",
    "@types/node": "24.10.9",
-   "@types/react": "19.2.7",
+   "@types/react": "19.2.10",
    "@types/react-dom": "^19.2.3",
-   "react": "^19.2.3",
-   "react-dom": "^19.2.3",
+   "react": "^19.2.4",
+   "react-dom": "^19.2.4",
    "tsx": "^4.21.0",
    "typescript": "5.9.3",
    "vite": "^6.4.1",
@@ -1,10 +1,6 @@
  ---
  import { parseRoute } from '@stainless-api/docs-ui/routing';
- import {
-   RESOLVED_API_REFERENCE_PATH,
-   DEFAULT_LANGUAGE,
-   EXCLUDE_LANGUAGES,
- } from 'virtual:stl-starlight-virtual-module';
+ import { DEFAULT_LANGUAGE, EXCLUDE_LANGUAGES } from 'virtual:stl-starlight-virtual-module';
  import { Languages } from '../languages';
  import { SDKSelectReactComponent } from '../react/Routing';
  import { getSDKJSONInSSR } from '../specs/fetchSpecSSR';
@@ -32,8 +28,7 @@ const options = getDocsLanguages(spec, EXCLUDE_LANGUAGES).map((value) => ({
    selected: data.language === value,
  }));
 
- const readmeSlug =
-   language === 'http' ? RESOLVED_API_REFERENCE_PATH : `${RESOLVED_API_REFERENCE_PATH}/${language}`;
+ const readmeSlug = language === 'http' ? API_REFERENCE_BASE_PATH : `${API_REFERENCE_BASE_PATH}/${language}`;
  ---
 
  <span
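The rewritten readmeSlug ternary maps each SDK language onto its README route. A quick sketch of the mapping, assuming a hypothetical base path of '/api' (the real value comes from API_REFERENCE_BASE_PATH, resolved elsewhere in the package):

    const API_REFERENCE_BASE_PATH = '/api'; // hypothetical value, for illustration only
    const readmeSlugFor = (language: string) =>
      language === 'http' ? API_REFERENCE_BASE_PATH : `${API_REFERENCE_BASE_PATH}/${language}`;

    readmeSlugFor('http');   // '/api'        (the raw HTTP reference sits at the base path)
    readmeSlugFor('python'); // '/api/python' (SDK readmes get a language segment)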
@@ -21,9 +21,6 @@ function markCurrentItems(sidebar: SidebarEntry[], currentSlug: string) {
    for (const entry of entries) {
      if (entry.type === 'link') {
        entry.isCurrent = removeTrailingSlash(entry.href) === normalizedCurrentSlug;
-       if (entry.isCurrent) {
-         return;
-       }
      }
      if (entry.type === 'group') {
        recursiveMarkCurrent(entry.entries);
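With the early return gone, the recursion no longer stops at the first match: every link whose href equals the current slug is marked, including duplicates nested inside groups. A minimal sketch with hypothetical entries (real SidebarEntry links carry more fields):

    const sidebar: SidebarEntry[] = [
      { type: 'link', href: '/api/python', isCurrent: false },
      { type: 'group', entries: [{ type: 'link', href: '/api/python/', isCurrent: false }] },
    ];
    markCurrentItems(sidebar, '/api/python');
    // Both links end up with isCurrent === true; previously only the first did.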
@@ -5,11 +5,13 @@ import Default from '@astrojs/starlight/components/Head.astro';
  import path from 'path';
 
  const mdPath = path.posix.join(Astro.url.pathname, 'index.md');
+ const fonts = [FONTS.primary, FONTS.heading, FONTS.mono, ...(FONTS.additional ?? [])].filter(Boolean);
  ---
 
  <Default />
 
- {Object.values(FONTS).map((font) => <Font cssVariable={font.cssVariable} preload={font.preload} />)}
+ {fonts.map((font) => <Font cssVariable={font.cssVariable} preload={font.preload} />)}
+
  <link rel="alternate" type="text/markdown" href={mdPath} />
 
  <script>
package/stl-docs/fonts.ts CHANGED
@@ -6,16 +6,19 @@ import type { FontPreloadFilter } from 'astro:assets';
  type AstroFontConfigEntry = Defined<AstroConfig['experimental']['fonts']>[number];
 
  // Apply Omit to each member of the union while preserving union structure
+ type PreloadFilter = { preload?: FontPreloadFilter };
  export type StlDocsFontConfigEntry = (AstroFontConfigEntry extends infer T
    ? T extends unknown
      ? Omit<T, 'cssVariable'>
      : never
-   : never) & { preload?: FontPreloadFilter };
+   : never) &
+   PreloadFilter;
 
  export type StlDocsFontConfig = {
    primary?: StlDocsFontConfigEntry;
    heading?: StlDocsFontConfigEntry;
    mono?: StlDocsFontConfigEntry;
+   additional?: (AstroFontConfigEntry & PreloadFilter)[];
  };
  const latinFeatureSettings = "'ss01' on, 'ss03' on, 'ss04' on, 'ss06' on, 'ss08' on";
  /* prettier-ignore */
@@ -31,6 +34,7 @@ export function getFontRoles(fonts: StlDocsFontConfig | undefined) {
    primary?: { cssVariable: string; preload?: FontPreloadFilter };
    heading?: { cssVariable: string; preload?: FontPreloadFilter };
    mono?: { cssVariable: string; preload?: FontPreloadFilter };
+   additional?: { cssVariable: string; preload?: FontPreloadFilter }[];
  } = {};
  if (fonts.primary) {
    fontConfigs['primary'] = {
@@ -50,6 +54,12 @@ export function getFontRoles(fonts: StlDocsFontConfig | undefined) {
      preload: fonts.mono.preload ?? [{ style: 'normal' }],
    };
  }
+ if (fonts.additional) {
+   fontConfigs['additional'] = fonts.additional.map((font) => ({
+     cssVariable: font.cssVariable,
+     preload: font.preload ?? [{ style: 'normal' }],
+   }));
+ }
  return fontConfigs;
  }
 
@@ -141,6 +151,7 @@ export function normalizeFonts(fonts: StlDocsFontConfig | undefined): StlDocsFon
    primary: fonts?.primary ?? defaultPrimary,
    heading: fonts?.heading ?? undefined,
    mono: fonts?.mono ?? defaultMono,
+   additional: fonts?.additional ?? [],
  };
  }
 
@@ -168,5 +179,8 @@ export function flattenFonts(fonts: StlDocsFontConfig | undefined): AstroFontCon
      cssVariable: '--stl-typography-font-mono' as const,
    } as AstroFontConfigEntry);
  }
+ if (fonts.additional) {
+   fontConfigs.push(...fonts.additional.map((font) => font as AstroFontConfigEntry));
+ }
  return fontConfigs;
  }
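Together these changes add an `additional` slot for fonts that belong to no built-in role. A hedged sketch of a consumer config (the font name, file, and variable are hypothetical; entry fields follow Astro's experimental fonts API):

    const fonts: StlDocsFontConfig = {
      // primary/heading/mono keep their package-assigned cssVariable;
      // `additional` entries must bring their own, since they map to no role.
      additional: [
        {
          provider: 'local',
          name: 'Brand Display',               // hypothetical font
          cssVariable: '--font-brand-display', // hypothetical variable
          variants: [{ weight: 700, style: 'normal', src: ['./fonts/brand-display.woff2'] }],
          // preload falls back to [{ style: 'normal' }] in getFontRoles when omitted
        },
      ],
    };

getFontRoles then maps each entry to { cssVariable, preload }, normalizeFonts defaults the list to [], and flattenFonts forwards the entries to Astro unchanged.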
package/stl-docs/index.ts CHANGED
@@ -197,7 +197,7 @@ function stainlessDocsIntegration(
 
  updateConfig({
    experimental: {
-     fonts: flattenFonts(config.fonts),
+     fonts: [...flattenFonts(config.fonts), ...(astroConfig.experimental?.fonts ?? [])],
    },
    vite: {
      plugins: [
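The spread preserves fonts the host project already declared in its own astro.config instead of clobbering them; roughly, with hypothetical entries:

    // flattenFonts(config.fonts)             -> [primary, mono, brandDisplay]
    // astroConfig.experimental?.fonts ?? []  -> [userRoboto]   (previously dropped)
    // resulting experimental.fonts           -> [primary, mono, brandDisplay, userRoboto]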
@@ -8,22 +8,37 @@ import { toMarkdown } from './proseMarkdown/toMarkdown';
  import { NormalizedStainlessDocsConfig } from './loadStlDocsConfig';
  import { buildProseIndex } from '@stainless-api/docs-search/providers/algolia';
 
- interface ContentBlock {
-   type: 'header' | 'content';
-   tag?: string;
-   id?: string;
-   text: string;
- }
+ type ContentBlock =
+   | { type: 'header'; tag: string; id: string; text: string }
+   | { type: 'content'; tag: string; text: string }
+   | { type: 'code'; tag: string; language?: string; text: string };
+
+ class SectionContext {
+   headers: { level: number; text: string }[] = [];
+   headerId: string | undefined;
+   headerTag: string | undefined;
+   headerText: string | undefined;
+   hasContent = false;
+
+   get(): string | undefined {
+     if (this.headers.length === 0) return;
+     return this.headers.map((h) => h.text).join(' > ');
+   }
 
- // Chunking configuration
- // We target 64-256 tokens per chunk, using ~1.3 tokens/word for English text
- const TOKENS_PER_WORD = 1.3;
- const MIN_TOKENS = 64;
- const MAX_TOKENS = 256;
- const MIN_WORDS = Math.floor(MIN_TOKENS / TOKENS_PER_WORD); // ~49 words
- const MAX_WORDS = Math.floor(MAX_TOKENS / TOKENS_PER_WORD); // ~197 words
- const LINE_BREAK_WORDS = Math.floor((MAX_TOKENS * 0.75) / TOKENS_PER_WORD); // ~148 words
- const SENTENCE_BREAK_WORDS = Math.floor((MAX_TOKENS * 0.875) / TOKENS_PER_WORD); // ~172 words
+   header({ id, tag, text }: { id: string; tag: string; text: string }) {
+     const level = getHeaderLevel(tag);
+     if (level > 0) {
+       while (this.headers.length > 0 && this.headers[this.headers.length - 1]!.level >= level) {
+         this.headers.pop();
+       }
+       this.headers.push({ level, text });
+     }
+     this.headerId = id;
+     this.headerTag = tag;
+     this.headerText = text;
+     this.hasContent = false;
+   }
+ }
 
  // Generate a URL-safe ID from header text (e.g., "OpenAPI Config" -> "openapi-config")
  function slugify(text: string): string {
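SectionContext keeps a stack of the headers seen so far; a quick walk-through of its behavior, derived from the code above:

    const ctx = new SectionContext();
    ctx.header({ id: 'guide', tag: 'h1', text: 'Guide' });
    ctx.header({ id: 'setup', tag: 'h2', text: 'Setup' });
    ctx.get(); // 'Guide > Setup'

    // A same-or-shallower header pops the stack back up before being pushed:
    ctx.header({ id: 'usage', tag: 'h2', text: 'Usage' });
    ctx.get(); // 'Guide > Usage'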
@@ -39,22 +54,97 @@ function isTableCellBoundary(word: string): boolean {
  return word.endsWith('|') && !word.endsWith('\\|');
  }
 
+ /**
+  * Extracts the header level from a tag like "h1", "h2", etc.
+  */
+ function getHeaderLevel(tag: string): number {
+   const match = tag.match(/^h(\d)$/);
+   return match ? parseInt(match[1]!, 10) : 0;
+ }
+
+ // Chunking configuration
+ // We target 64-256 tokens per chunk, using ~1.3 tokens/word for English text
+ const TOKENS_PER_WORD = 1.3;
+ const MIN_TOKENS = 64;
+ const MAX_TOKENS = 256;
+ const MIN_WORDS = Math.floor(MIN_TOKENS / TOKENS_PER_WORD); // ~49 words
+ const MAX_WORDS = Math.floor(MAX_TOKENS / TOKENS_PER_WORD); // ~197 words
+ const LINE_BREAK_WORDS = Math.floor((MAX_TOKENS * 0.75) / TOKENS_PER_WORD); // ~148 words
+ const SENTENCE_BREAK_WORDS = Math.floor((MAX_TOKENS * 0.875) / TOKENS_PER_WORD); // ~172 words
+
+ /**
+  * Chunks text content into segments of 64-256 tokens using word-based boundaries.
+  * Prefers breaking at sentence endings for natural chunk boundaries.
+  */
+ function chunkTextByWords(text: string): string[] {
+   const words = text.split(/\s+/).filter((w) => w.length > 0);
+
+   if (words.length <= MAX_WORDS) {
+     return words.length > 0 ? [words.join(' ')] : [];
+   }
+
+   const chunks: string[] = [];
+   let currentChunk: string[] = [];
+
+   for (const word of words) {
+     currentChunk.push(word);
+
+     // Force break at max words
+     if (currentChunk.length >= MAX_WORDS) {
+       chunks.push(currentChunk.join(' '));
+       currentChunk = [];
+       continue;
+     }
+
+     // Prefer breaking at sentence boundaries after threshold
+     if (currentChunk.length >= SENTENCE_BREAK_WORDS && /[.!?]["']?$/.test(word)) {
+       chunks.push(currentChunk.join(' '));
+       currentChunk = [];
+     }
+   }
+
+   if (currentChunk.length > 0) {
+     if (currentChunk.length < MIN_WORDS && chunks.length > 0) {
+       const lastChunk = chunks[chunks.length - 1]!;
+       const mergedWords = lastChunk.split(/\s+/).length + currentChunk.length;
+       if (mergedWords <= MAX_WORDS) {
+         chunks[chunks.length - 1] = lastChunk + ' ' + currentChunk.join(' ');
+       } else {
+         chunks.push(currentChunk.join(' '));
+       }
+     } else {
+       chunks.push(currentChunk.join(' '));
+     }
+   }
+
+   return chunks;
+ }
+
+ type ContentBlockChunk = {
+   type: 'prose';
+   content: string;
+   headerId?: string;
+   headerTag?: string;
+   tag?: string;
+   language?: string;
+   sectionContext?: string;
+ };
+
  /**
   * Chunks content blocks into segments of 64-256 tokens.
   *
   * Chunking strategy:
- * 1. Break at headers if chunk has >= MIN_WORDS, otherwise merge with next section
+ * 1. Break at headers to keep sections isolated
   * 2. Prefer breaking at line/table boundaries after LINE_BREAK_WORDS (~148 words / ~192 tokens)
   * 3. Break at sentence endings after SENTENCE_BREAK_WORDS (~172 words / ~224 tokens)
   * 4. Force break at MAX_WORDS, preferring table row boundaries if available
- * 5. Header context (id/tag) is preserved for continuation chunks
+ * 5. Section context (header hierarchy) is recorded alongside each chunk for discoverability
   */
- function chunkByWords(blocks: ContentBlock[]): { content: string; headerId?: string; headerTag?: string }[] {
-   const chunks: { content: string; headerId?: string; headerTag?: string }[] = [];
+ function chunkByWords(blocks: ContentBlock[]): ContentBlockChunk[] {
+   const chunks: ContentBlockChunk[] = [];
 
    let currentChunk: string[] = [];
-   let currentHeaderId: string | undefined;
-   let currentHeaderTag: string | undefined;
+   const ctx = new SectionContext();
 
    // Flush current chunk to output. If splitAt is provided, keep words after that index for next chunk.
    const flushChunk = (splitAt?: number) => {
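For concreteness, the thresholds above evaluate to the following (the inline comments round a couple of them up):

    Math.floor(64 / 1.3);            // MIN_WORDS = 49
    Math.floor(256 / 1.3);           // MAX_WORDS = 196
    Math.floor((256 * 0.75) / 1.3);  // LINE_BREAK_WORDS = 147
    Math.floor((256 * 0.875) / 1.3); // SENTENCE_BREAK_WORDS = 172

So chunkTextByWords returns short input as a single chunk, starts looking for a sentence end at word 172, hard-breaks at word 196, and folds a tail of fewer than 49 words into the previous chunk when the merge stays within 196 words:

    chunkTextByWords('Short text.');            // ['Short text.']
    chunkTextByWords(fourHundredWordParagraph); // several chunks, each <= 196 words
                                                // (fourHundredWordParagraph is hypothetical)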
@@ -64,11 +154,17 @@ function chunkByWords(blocks: ContentBlock[]): { content: string; headerId?: str
    const wordsToKeep = splitAt !== undefined ? currentChunk.slice(splitAt) : [];
 
    if (wordsToFlush.length > 0) {
+     const chunkText = wordsToFlush.join(' ').trim();
+     const sectionContext = ctx.get();
+
      chunks.push({
-       content: wordsToFlush.join(' ').trim(),
-       headerId: currentHeaderId,
-       headerTag: currentHeaderTag,
+       type: 'prose',
+       content: chunkText,
+       headerId: ctx.headerId,
+       headerTag: ctx.headerTag,
+       sectionContext: sectionContext || undefined,
      });
+     ctx.hasContent = true;
    }
    currentChunk = wordsToKeep;
  };
@@ -89,18 +185,33 @@ function chunkByWords(blocks: ContentBlock[]): { content: string; headerId?: str
 
  for (const block of blocks) {
    if (block.type === 'header') {
-     // Flush at header boundaries only if chunk meets minimum size
-     // This avoids creating tiny chunks for headers with little content
-     if (currentChunk.length >= MIN_WORDS) {
-       flushChunk();
+     flushChunk();
+     ctx.header(block);
+     continue;
+   }
+
+   // Chunk code blocks separately; they tend to be more important.
+   if (block.type === 'code') {
+     flushChunk();
+     const codeText = block.text.trim();
+     if (codeText) {
+       for (const chunkText of chunkTextByWords(codeText)) {
+         chunks.push({
+           type: 'prose',
+           content: chunkText,
+           headerId: ctx.headerId,
+           tag: 'code',
+           language: block.language,
+           sectionContext: ctx.get(),
+         });
+         ctx.hasContent = true;
+       }
      }
-     currentHeaderId = block.id;
-     currentHeaderTag = block.tag;
-     // Include header text at the start of the new chunk
-     currentChunk.push(...block.text.split(/\s+/).filter((w) => w.length > 0));
      continue;
    }
 
+   if (block.type !== 'content') continue;
+
    // Split by newlines first to preserve line boundary information
    const lines = block.text.split(/\n/);
    let inCodeBlock = false;
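A hedged sketch of what this loop emits for a tiny block list (shapes taken from the code above):

    const blocks: ContentBlock[] = [
      { type: 'header', tag: 'h2', id: 'install', text: 'Install' },
      { type: 'content', tag: 'p', text: 'Run the command below.' },
      { type: 'code', tag: 'code', language: 'bash', text: 'npm i @stainless-api/docs' },
    ];
    chunkByWords(blocks);
    // => [
    //   { type: 'prose', content: 'Run the command below.',
    //     headerId: 'install', headerTag: 'h2', sectionContext: 'Install' },
    //   { type: 'prose', content: 'npm i @stainless-api/docs',
    //     headerId: 'install', tag: 'code', language: 'bash', sectionContext: 'Install' },
    // ]

Note the header text itself is no longer prepended to the chunk content; it travels in sectionContext instead.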
@@ -108,8 +219,8 @@ function chunkByWords(blocks: ContentBlock[]): { content: string; headerId?: str
  for (let lineIdx = 0; lineIdx < lines.length; lineIdx++) {
    const line = lines[lineIdx]!;
 
-   // Track code block boundaries
-   if (/^(`{3,}|~{3,})/.test(line.trim())) {
+   // Track code block boundaries (standalone fences only)
+   if (/^(`{3,}|~{3,})([a-zA-Z0-9+-]*)?\s*$/.test(line.trim())) {
      inCodeBlock = !inCodeBlock;
    }
 
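My reading of the tightened fence regex, shown on a few inputs:

    const fence = /^(`{3,}|~{3,})([a-zA-Z0-9+-]*)?\s*$/;
    fence.test('```');           // true:  bare fence
    fence.test('```ts');         // true:  fence with a language tag
    fence.test('```c++');        // true:  '+' and '-' are now allowed in the tag
    fence.test('``` Then text'); // false: inline backticks in a table cell no longer toggle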
@@ -155,14 +266,14 @@ function chunkByWords(blocks: ContentBlock[]): { content: string; headerId?: str
  }
 
  /**
- * Parses markdown into content blocks, identifying headers and content sections.
- * Tracks fenced code blocks to avoid treating # comments in code as headers.
+ * Parses markdown into content blocks, identifying headers, content sections, and code blocks.
+ * Code blocks are extracted separately with language metadata for specialized indexing.
   */
  function parseMarkdown(markdown: string): ContentBlock[] {
    const blocks: ContentBlock[] = [];
 
    // Extract title from frontmatter and treat it as h1
-   const frontmatterMatch = markdown.match(/^---\n([\s\S]*?)\n---/);
+   const frontmatterMatch = markdown.match(/^---\r?\n([\s\S]*?)\r?\n---/);
    if (frontmatterMatch) {
      const frontmatter = frontmatterMatch[1]!;
      const titleMatch = frontmatter.match(/^title:\s*(.+)$/m);
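The \r?\n additions make frontmatter detection tolerate CRLF line endings; a sketch with a Windows-authored file:

    const md = '---\r\ntitle: Quickstart\r\n---\r\n\r\n# Intro';
    /^---\r?\n([\s\S]*?)\r?\n---/.exec(md)?.[1]; // 'title: Quickstart'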
@@ -178,61 +289,99 @@ function parseMarkdown(markdown: string): ContentBlock[] {
  }
 
  // Remove frontmatter
- const content = markdown.replace(/^---[\s\S]*?---\n*/, '').trim();
+ const content = markdown.replace(/^---[\s\S]*?---\r?\n*/, '').trim();
 
  // Split into lines and process
  const lines = content.split('\n');
  let currentContent: string[] = [];
  let inCodeBlock = false;
+ let codeBlockLanguage: string | undefined;
+ let codeBlockContent: string[] = [];
 
  const flushContent = () => {
    const text = currentContent.join('\n').trim();
    if (text) {
-     blocks.push({ type: 'content', text });
+     blocks.push({ type: 'content', tag: 'p', text });
    }
    currentContent = [];
  };
 
+ const flushCodeBlock = () => {
+   if (codeBlockContent.length > 0) {
+     const code = codeBlockContent.join('\n').trim();
+     if (code) {
+       blocks.push({
+         type: 'code',
+         tag: 'code',
+         text: code,
+         language: codeBlockLanguage || undefined,
+       });
+     }
+   }
+   codeBlockContent = [];
+   codeBlockLanguage = undefined;
+ };
+
  for (const line of lines) {
    // Track fenced code blocks (``` or ~~~)
    // Only match standalone markers: ```[language] with nothing else on the line
    // This avoids matching inline code blocks in table cells like "``` Then content..."
-   if (/^(`{3,}|~{3,})([a-zA-Z0-9]*)?(\s*)$/.test(line)) {
-     inCodeBlock = !inCodeBlock;
-     currentContent.push(line);
+   const codeBlockMatch = line.match(/^(`{3,}|~{3,})([a-zA-Z0-9+-]*)?\s*$/);
+   if (codeBlockMatch) {
+     if (!inCodeBlock) {
+       flushContent();
+       inCodeBlock = true;
+       codeBlockLanguage = codeBlockMatch[2] || undefined;
+     } else {
+       flushCodeBlock();
+       inCodeBlock = false;
+     }
+     continue;
+   }
+
+   if (inCodeBlock) {
+     codeBlockContent.push(line);
      continue;
    }
 
    // Only match headers outside of code blocks
-   if (!inCodeBlock) {
-     const headerMatch = line.match(/^(#{1,6})\s+(.+)$/);
+   const headerMatch = line.match(/^(#{1,6})\s+(.+)$/);
 
-   if (headerMatch) {
-     flushContent();
-     const level = headerMatch[1]!.length;
-     const headerText = headerMatch[2]!.trim();
-     blocks.push({
-       type: 'header',
-       tag: `h${level}`,
-       id: slugify(headerText),
-       text: headerText,
-     });
-     continue;
-   }
+   if (headerMatch) {
+     flushContent();
+     const level = headerMatch[1]!.length;
+     const headerText = headerMatch[2]!.trim();
+     blocks.push({
+       type: 'header',
+       tag: `h${level}`,
+       id: slugify(headerText),
+       text: headerText,
+     });
+     continue;
    }
 
    currentContent.push(line);
  }
 
+ flushCodeBlock();
  flushContent();
  return blocks;
  }
 
+ export type IndexEntry = {
+   chunk: { id: string; index: number; total: number };
+   id: string;
+   tag: string;
+   content: string;
+   language?: string;
+   sectionContext?: string;
+ };
+
  /**
   * Extracts and chunks markdown content for search indexing.
-  * Yields chunk objects with content, header context, and chunk metadata.
+  * Yields prose and code chunks with section context and language metadata.
   */
- export function* indexMarkdown(markdown: string) {
+ export function* indexMarkdown(markdown: string): Generator<IndexEntry> {
    const blocks = parseMarkdown(markdown);
    const chunks = chunkByWords(blocks);
    const documentId = crypto.randomUUID();
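A hedged usage sketch of the generator, given a small markdown document (the chunk UUID is abbreviated):

    const md = [
      '---', 'title: Quickstart', '---', '',
      'Install the package.', '',
      '```bash', 'npm i @stainless-api/docs', '```',
    ].join('\n');

    [...indexMarkdown(md)];
    // => [
    //   { id: 'quickstart', tag: 'h1', content: 'Install the package.',
    //     sectionContext: 'Quickstart', chunk: { id: '<uuid>', index: 0, total: 2 } },
    //   { id: 'quickstart', tag: 'code', language: 'bash', content: 'npm i @stainless-api/docs',
    //     sectionContext: 'Quickstart', chunk: { id: '<uuid>', index: 1, total: 2 } },
    // ]

The first entry's tag falls back to the section's headerTag ('h1', from the frontmatter title) because prose chunks carry no tag of their own.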
@@ -240,8 +389,10 @@ export function* indexMarkdown(markdown: string) {
  for (const [index, chunk] of chunks.entries()) {
    yield {
      id: chunk.headerId ?? '',
-     tag: chunk.headerTag ?? '',
+     tag: chunk.tag ?? chunk.headerTag ?? '',
      content: chunk.content,
+     ...(chunk.sectionContext ? { sectionContext: chunk.sectionContext } : {}),
+     ...(chunk.language ? { language: chunk.language } : {}),
      chunk: {
        id: documentId,
        index,
@@ -251,64 +402,68 @@ export function* indexMarkdown(markdown: string) {
  }
  }
 
- function chunkHTMLByWords(content: string, chunkSize: number = 30000, chunkOverlap: number = 10) {
-   if (Buffer.byteLength(content) < chunkSize) return [content];
+ const DEFAULT_ROOT = 'main';
+ const DEFAULT_PATTERN = 'h1, h2, h3, h4, h5, h6, p, li, pre code';
 
-   const words = content.split(/\s+/);
-   const chunks: string[] = [];
-
-   let currentChunk: string[] = [];
-   let currentSize = 0;
+ /**
+  * Indexes HTML content for search, with section context and code language extraction.
+  *
+  * Features:
+  * - Tracks header hierarchy to prepend section context (e.g., "Guide > Setup: ...")
+  * - Extracts language metadata from code blocks (class="language-js")
+  * - Uses word-based chunking with sentence boundary detection
+  */
+ export function* indexHTML(
+   content: string,
+   root = DEFAULT_ROOT,
+   pattern = DEFAULT_PATTERN,
+ ): Generator<IndexEntry> {
+   const $ = cheerio.load(content);
+   const matches = $(root).find(pattern);
 
-   for (const word of words) {
-     const wordSize = Buffer.byteLength(word + ' ', 'utf-8');
+   const ctx = new SectionContext();
 
-     if (currentSize + wordSize > chunkSize && currentChunk.length > 0) {
-       chunks.push(currentChunk.join(' '));
+   for (const match of matches) {
+     const tagName = match.tagName.toLowerCase();
+     const rawText = $(match).text().trim();
 
-       const overlapStart = Math.max(0, currentChunk.length - chunkOverlap);
-       currentChunk = currentChunk.slice(overlapStart);
-       currentSize = Buffer.byteLength(currentChunk.join(' '), 'utf-8');
+     if (getHeaderLevel(tagName) > 0) {
+       ctx.header({ id: $(match).attr('id') ?? slugify(rawText), tag: tagName, text: rawText });
+       continue;
      }
 
-     currentChunk.push(word);
-     currentSize += wordSize;
-   }
-
-   if (currentChunk.length > 0) {
-     chunks.push(currentChunk.join(' '));
-   }
-
-   return chunks;
- }
-
- export function* indexHTML(content: string, root: string, pattern: string) {
-   const $ = cheerio.load(content);
-   const matches = $(root).find(pattern);
+     // Check if this is a code block and extract language
+     const isCode = tagName === 'code' && $(match).parent().is('pre');
+     let language: string | undefined;
+     if (isCode) {
+       const classes = $(match).attr('class') || '';
+       const langMatch = classes.match(/(?:language-|lang-)([a-zA-Z0-9+-]+)/);
+       language = langMatch ? langMatch[1] : undefined;
+     }
 
-   for (const match of matches) {
-     const rawText = $(match).text().trim();
-     const chunks = chunkHTMLByWords(rawText);
+     // Build content with section context
+     const sectionContext = ctx.get();
+     const chunks = chunkTextByWords(rawText);
      const chunkId = crypto.randomUUID();
 
-     for (const [chunkN, content] of chunks.entries()) {
+     for (const [chunkN, chunkText] of chunks.entries()) {
        yield {
-         id: $(match).attr('id'),
-         tag: match.tagName.toLowerCase(),
-         content,
+         id: ctx.headerId ?? $(match).attr('id') ?? chunkId,
+         tag: isCode ? 'code' : tagName,
+         content: chunkText,
+         ...(sectionContext ? { sectionContext } : {}),
+         ...(language && { language }),
          chunk: {
            id: chunkId,
            index: chunkN,
            total: chunks.length,
          },
        };
+       ctx.hasContent = true;
      }
    }
  }
 
- const root = 'main';
- const pattern = 'h1, h2, h3, h4, h5, h6, p, li';
-
  export function stainlessDocsAlgoliaProseIndexing({
    apiReferenceBasePath,
  }: {
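And a matching sketch for the HTML path, indexing a rendered page with a highlighted code block:

    const html =
      '<main><h2 id="auth">Auth</h2><p>Pass your API key.</p>' +
      '<pre><code class="language-ts">client.auth(key)</code></pre></main>';

    [...indexHTML(html)];
    // => [
    //   { id: 'auth', tag: 'p', content: 'Pass your API key.',
    //     sectionContext: 'Auth', chunk: { id: '<uuid>', index: 0, total: 1 } },
    //   { id: 'auth', tag: 'code', language: 'ts', content: 'client.auth(key)',
    //     sectionContext: 'Auth', chunk: { id: '<uuid>', index: 0, total: 1 } },
    // ]

The 'pre code' selector added to DEFAULT_PATTERN is what brings code blocks into the match set at all, and the yielded id now prefers the enclosing section's header id over the element's own.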
@@ -338,7 +493,7 @@ export function stainlessDocsAlgoliaProseIndexing({
  const objects = [];
  for (const absHtmlPath of pagesToRender) {
    const content = await readFile(absHtmlPath, 'utf-8');
-   const idx = indexHTML(content, root, pattern);
+   const idx = indexHTML(content);
    for (const entry of idx)
      objects.push({
        ...entry,
@@ -403,6 +558,7 @@ export function stainlessDocsVectorProseIndexing(
    id: string;
    tag: string;
    content: string;
+   language?: string;
    kind: 'prose';
    source: string;
  }[] = [];
@@ -412,7 +568,7 @@ export function stainlessDocsVectorProseIndexing(
 
  if (markdown) {
    const idx = indexMarkdown(markdown);
-   for (const { chunk, ...entry } of idx)
+   for (const { chunk: _, ...entry } of idx)
      objects.push({
        ...entry,
        kind: 'prose',
@@ -57,6 +57,7 @@ declare module 'virtual:stl-docs-virtual-module' {
    primary?: FontConfig;
    heading?: FontConfig;
    mono?: FontConfig;
+   additional?: FontConfig[];
  };
  }