@stainless-api/docs 0.1.0-beta.89 → 0.1.0-beta.90
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +11 -0
- package/package.json +5 -5
- package/plugin/components/SDKSelect.astro +2 -7
- package/plugin/replaceSidebarPlaceholderMiddleware.ts +0 -3
- package/stl-docs/components/Head.astro +3 -1
- package/stl-docs/fonts.ts +15 -1
- package/stl-docs/index.ts +1 -1
- package/stl-docs/proseSearchIndexing.ts +255 -99
- package/virtual-module.d.ts +1 -0
package/CHANGELOG.md
CHANGED
@@ -1,5 +1,16 @@
 # @stainless-api/docs
 
+## 0.1.0-beta.90
+
+### Minor Changes
+
+- b8f1f3c: improve prose chunking
+
+### Patch Changes
+
+- 3de4232: support additional custom fonts
+- c77e607: fixes issue where the SDK select wasn't showing on README pages
+
 ## 0.1.0-beta.89
 
 ### Minor Changes
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@stainless-api/docs",
-  "version": "0.1.0-beta.89",
+  "version": "0.1.0-beta.90",
   "publishConfig": {
     "access": "public"
   },
@@ -51,7 +51,7 @@
     "remark-gfm": "^4.0.1",
     "remark-github-alerts": "^0.1.1",
     "remark-stringify": "^11.0.0",
-    "shiki": "^3.
+    "shiki": "^3.22.0",
     "unified": "^11.0.5",
     "vite-plugin-prebundle-workers": "^0.2.0",
     "web-worker": "^1.5.0",
@@ -64,10 +64,10 @@
     "@astrojs/check": "^0.9.6",
     "@markdoc/markdoc": "^0.5.4",
     "@types/node": "24.10.9",
-    "@types/react": "19.2.
+    "@types/react": "19.2.10",
    "@types/react-dom": "^19.2.3",
-    "react": "^19.2.
-    "react-dom": "^19.2.
+    "react": "^19.2.4",
+    "react-dom": "^19.2.4",
     "tsx": "^4.21.0",
     "typescript": "5.9.3",
     "vite": "^6.4.1",
package/plugin/components/SDKSelect.astro
CHANGED
@@ -1,10 +1,6 @@
 ---
 import { parseRoute } from '@stainless-api/docs-ui/routing';
-import {
-  RESOLVED_API_REFERENCE_PATH,
-  DEFAULT_LANGUAGE,
-  EXCLUDE_LANGUAGES,
-} from 'virtual:stl-starlight-virtual-module';
+import { DEFAULT_LANGUAGE, EXCLUDE_LANGUAGES } from 'virtual:stl-starlight-virtual-module';
 import { Languages } from '../languages';
 import { SDKSelectReactComponent } from '../react/Routing';
 import { getSDKJSONInSSR } from '../specs/fetchSpecSSR';
@@ -32,8 +28,7 @@ const options = getDocsLanguages(spec, EXCLUDE_LANGUAGES).map((value) => ({
   selected: data.language === value,
 }));
 
-const readmeSlug =
-  language === 'http' ? RESOLVED_API_REFERENCE_PATH : `${RESOLVED_API_REFERENCE_PATH}/${language}`;
+const readmeSlug = language === 'http' ? API_REFERENCE_BASE_PATH : `${API_REFERENCE_BASE_PATH}/${language}`;
 ---
 
 <span
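For illustration, a minimal sketch of the slug logic above; the base path value here is a placeholder, while the real `API_REFERENCE_BASE_PATH` comes from the plugin's virtual module:

// Hypothetical base path, for illustration only.
const API_REFERENCE_BASE_PATH = 'api-reference';

function readmeSlugFor(language: string): string {
  // The raw HTTP reference lives at the base path; each SDK language gets its own subpath.
  return language === 'http' ? API_REFERENCE_BASE_PATH : `${API_REFERENCE_BASE_PATH}/${language}`;
}

readmeSlugFor('http'); // "api-reference"
readmeSlugFor('python'); // "api-reference/python"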
package/plugin/replaceSidebarPlaceholderMiddleware.ts
CHANGED
@@ -21,9 +21,6 @@ function markCurrentItems(sidebar: SidebarEntry[], currentSlug: string) {
   for (const entry of entries) {
     if (entry.type === 'link') {
       entry.isCurrent = removeTrailingSlash(entry.href) === normalizedCurrentSlug;
-      if (entry.isCurrent) {
-        return;
-      }
     }
     if (entry.type === 'group') {
       recursiveMarkCurrent(entry.entries);
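The removed early return used to stop the scan at the first matching link, so entries later in the sidebar (including nested groups) were never visited. A minimal sketch of the behavior after this change, with simplified types that are assumptions rather than the package's actual definitions:

type Entry =
  | { type: 'link'; href: string; isCurrent?: boolean }
  | { type: 'group'; entries: Entry[] };

function markCurrent(entries: Entry[], currentSlug: string): void {
  for (const entry of entries) {
    if (entry.type === 'link') {
      // Every link is checked; with no early return, sibling groups still recurse.
      entry.isCurrent = entry.href === currentSlug;
    }
    if (entry.type === 'group') {
      markCurrent(entry.entries, currentSlug);
    }
  }
}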
package/stl-docs/components/Head.astro
CHANGED
@@ -5,11 +5,13 @@ import Default from '@astrojs/starlight/components/Head.astro';
 import path from 'path';
 
 const mdPath = path.posix.join(Astro.url.pathname, 'index.md');
+const fonts = [FONTS.primary, FONTS.heading, FONTS.mono, ...(FONTS.additional ?? [])].filter(Boolean);
 ---
 
 <Default />
 
-{
+{fonts.map((font) => <Font cssVariable={font.cssVariable} preload={font.preload} />)}
+
 <link rel="alternate" type="text/markdown" href={mdPath} />
 
 <script>
package/stl-docs/fonts.ts
CHANGED
@@ -6,16 +6,19 @@ import type { FontPreloadFilter } from 'astro:assets';
 type AstroFontConfigEntry = Defined<AstroConfig['experimental']['fonts']>[number];
 
 // Apply Omit to each member of the union while preserving union structure
+type PreloadFilter = { preload?: FontPreloadFilter };
 export type StlDocsFontConfigEntry = (AstroFontConfigEntry extends infer T
   ? T extends unknown
     ? Omit<T, 'cssVariable'>
     : never
-  : never) &
+  : never) &
+  PreloadFilter;
 
 export type StlDocsFontConfig = {
   primary?: StlDocsFontConfigEntry;
   heading?: StlDocsFontConfigEntry;
   mono?: StlDocsFontConfigEntry;
+  additional?: (AstroFontConfigEntry & PreloadFilter)[];
 };
 const latinFeatureSettings = "'ss01' on, 'ss03' on, 'ss04' on, 'ss06' on, 'ss08' on";
 /* prettier-ignore */
@@ -31,6 +34,7 @@ export function getFontRoles(fonts: StlDocsFontConfig | undefined) {
     primary?: { cssVariable: string; preload?: FontPreloadFilter };
     heading?: { cssVariable: string; preload?: FontPreloadFilter };
     mono?: { cssVariable: string; preload?: FontPreloadFilter };
+    additional?: { cssVariable: string; preload?: FontPreloadFilter }[];
   } = {};
   if (fonts.primary) {
     fontConfigs['primary'] = {
@@ -50,6 +54,12 @@ export function getFontRoles(fonts: StlDocsFontConfig | undefined) {
       preload: fonts.mono.preload ?? [{ style: 'normal' }],
     };
   }
+  if (fonts.additional) {
+    fontConfigs['additional'] = fonts.additional.map((font) => ({
+      cssVariable: font.cssVariable,
+      preload: font.preload ?? [{ style: 'normal' }],
+    }));
+  }
   return fontConfigs;
 }
 
@@ -141,6 +151,7 @@ export function normalizeFonts(fonts: StlDocsFontConfig | undefined): StlDocsFon
     primary: fonts?.primary ?? defaultPrimary,
     heading: fonts?.heading ?? undefined,
     mono: fonts?.mono ?? defaultMono,
+    additional: fonts?.additional ?? [],
   };
 }
 
@@ -168,5 +179,8 @@ export function flattenFonts(fonts: StlDocsFontConfig | undefined): AstroFontCon
       cssVariable: '--stl-typography-font-mono' as const,
     } as AstroFontConfigEntry);
   }
+  if (fonts.additional) {
+    fontConfigs.push(...fonts.additional.map((font) => font as AstroFontConfigEntry));
+  }
   return fontConfigs;
 }
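A hedged sketch of what a consuming font config could look like with the new `additional` field; the provider, family names, and import path are assumptions, not values from this package:

import { fontProviders } from 'astro/config';
import type { StlDocsFontConfig } from './fonts'; // hypothetical import path

const fonts: StlDocsFontConfig = {
  // primary/heading/mono omit cssVariable; the plugin assigns its own variables.
  primary: { provider: fontProviders.google(), name: 'Inter' },
  // `additional` entries are full Astro font entries, so they keep cssVariable,
  // plus the optional per-font preload filter added in this release.
  additional: [
    {
      provider: fontProviders.google(),
      name: 'Fira Sans',
      cssVariable: '--font-fira-sans',
      preload: [{ style: 'normal' }],
    },
  ],
};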
package/stl-docs/proseSearchIndexing.ts
CHANGED
@@ -8,22 +8,37 @@ import { toMarkdown } from './proseMarkdown/toMarkdown';
 import { NormalizedStainlessDocsConfig } from './loadStlDocsConfig';
 import { buildProseIndex } from '@stainless-api/docs-search/providers/algolia';
 
-  type: 'header'
-  tag
+type ContentBlock =
+  | { type: 'header'; tag: string; id: string; text: string }
+  | { type: 'content'; tag: string; text: string }
+  | { type: 'code'; tag: string; language?: string; text: string };
+
+class SectionContext {
+  headers: { level: number; text: string }[] = [];
+  headerId: string | undefined;
+  headerTag: string | undefined;
+  headerText: string | undefined;
+  hasContent = false;
+
+  get(): string | undefined {
+    if (this.headers.length === 0) return;
+    return this.headers.map((h) => h.text).join(' > ');
+  }
 
+  header({ id, tag, text }: { id: string; tag: string; text: string }) {
+    const level = getHeaderLevel(tag);
+    if (level > 0) {
+      while (this.headers.length > 0 && this.headers[this.headers.length - 1]!.level >= level) {
+        this.headers.pop();
+      }
+      this.headers.push({ level, text });
+    }
+    this.headerId = id;
+    this.headerTag = tag;
+    this.headerText = text;
+    this.hasContent = false;
+  }
+}
 
 // Generate a URL-safe ID from header text (e.g., "OpenAPI Config" -> "openapi-config")
 function slugify(text: string): string {
@@ -39,22 +54,97 @@ function isTableCellBoundary(word: string): boolean {
   return word.endsWith('|') && !word.endsWith('\\|');
 }
 
+/**
+ * Extracts the header level from a tag like "h1", "h2", etc.
+ */
+function getHeaderLevel(tag: string): number {
+  const match = tag.match(/^h(\d)$/);
+  return match ? parseInt(match[1]!, 10) : 0;
+}
+
+// Chunking configuration
+// We target 64-256 tokens per chunk, using ~1.3 tokens/word for English text
+const TOKENS_PER_WORD = 1.3;
+const MIN_TOKENS = 64;
+const MAX_TOKENS = 256;
+const MIN_WORDS = Math.floor(MIN_TOKENS / TOKENS_PER_WORD); // ~49 words
+const MAX_WORDS = Math.floor(MAX_TOKENS / TOKENS_PER_WORD); // ~197 words
+const LINE_BREAK_WORDS = Math.floor((MAX_TOKENS * 0.75) / TOKENS_PER_WORD); // ~148 words
+const SENTENCE_BREAK_WORDS = Math.floor((MAX_TOKENS * 0.875) / TOKENS_PER_WORD); // ~172 words
+
+/**
+ * Chunks text content into segments of 64-256 tokens using word-based boundaries.
+ * Prefers breaking at sentence endings for natural chunk boundaries.
+ */
+function chunkTextByWords(text: string): string[] {
+  const words = text.split(/\s+/).filter((w) => w.length > 0);
+
+  if (words.length <= MAX_WORDS) {
+    return words.length > 0 ? [words.join(' ')] : [];
+  }
+
+  const chunks: string[] = [];
+  let currentChunk: string[] = [];
+
+  for (const word of words) {
+    currentChunk.push(word);
+
+    // Force break at max words
+    if (currentChunk.length >= MAX_WORDS) {
+      chunks.push(currentChunk.join(' '));
+      currentChunk = [];
+      continue;
+    }
+
+    // Prefer breaking at sentence boundaries after threshold
+    if (currentChunk.length >= SENTENCE_BREAK_WORDS && /[.!?]["']?$/.test(word)) {
+      chunks.push(currentChunk.join(' '));
+      currentChunk = [];
+    }
+  }
+
+  if (currentChunk.length > 0) {
+    if (currentChunk.length < MIN_WORDS && chunks.length > 0) {
+      const lastChunk = chunks[chunks.length - 1]!;
+      const mergedWords = lastChunk.split(/\s+/).length + currentChunk.length;
+      if (mergedWords <= MAX_WORDS) {
+        chunks[chunks.length - 1] = lastChunk + ' ' + currentChunk.join(' ');
+      } else {
+        chunks.push(currentChunk.join(' '));
+      }
+    } else {
+      chunks.push(currentChunk.join(' '));
+    }
+  }
+
+  return chunks;
+}
+
+type ContentBlockChunk = {
+  type: 'prose';
+  content: string;
+  headerId?: string;
+  headerTag?: string;
+  tag?: string;
+  language?: string;
+  sectionContext?: string;
+};
+
 /**
  * Chunks content blocks into segments of 64-256 tokens.
  *
  * Chunking strategy:
- * 1. Break at headers
+ * 1. Break at headers to keep sections isolated
  * 2. Prefer breaking at line/table boundaries after LINE_BREAK_WORDS (~148 words / ~192 tokens)
  * 3. Break at sentence endings after SENTENCE_BREAK_WORDS (~172 words / ~224 tokens)
  * 4. Force break at MAX_WORDS, preferring table row boundaries if available
- * 5.
+ * 5. Section context (header hierarchy) is recorded alongside each chunk for discoverability
  */
-function chunkByWords(blocks: ContentBlock[]): { content: string; headerId?: str
-  const chunks:
+function chunkByWords(blocks: ContentBlock[]): ContentBlockChunk[] {
+  const chunks: ContentBlockChunk[] = [];
 
   let currentChunk: string[] = [];
-  let currentHeaderId: string | undefined;
-  let currentHeaderTag: string | undefined;
+  const ctx = new SectionContext();
 
   // Flush current chunk to output. If splitAt is provided, keep words after that index for next chunk.
   const flushChunk = (splitAt?: number) => {
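A quick sanity check of the thresholds above, assuming chunkTextByWords were exported: Math.floor(256 / 1.3) = 196, so 500 words with no sentence punctuation force-break into 196 + 196 + 108 words (the "~197" in the code's comment rounds up):

const text = Array.from({ length: 500 }, (_, i) => `word${i}`).join(' ');
const chunks = chunkTextByWords(text);

chunks.length; // 3
chunks.map((c) => c.split(' ').length); // [196, 196, 108]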
@@ -64,11 +154,17 @@ function chunkByWords(blocks: ContentBlock[]): { content: string; headerId?: str
     const wordsToKeep = splitAt !== undefined ? currentChunk.slice(splitAt) : [];
 
     if (wordsToFlush.length > 0) {
+      const chunkText = wordsToFlush.join(' ').trim();
+      const sectionContext = ctx.get();
+
       chunks.push({
-        content: wordsToFlush.join(' ').trim(),
-        headerId: currentHeaderId,
-        headerTag: currentHeaderTag,
+        type: 'prose',
+        content: chunkText,
+        headerId: ctx.headerId,
+        headerTag: ctx.headerTag,
+        sectionContext: sectionContext || undefined,
       });
+      ctx.hasContent = true;
     }
     currentChunk = wordsToKeep;
   };
@@ -89,18 +185,33 @@ function chunkByWords(blocks: ContentBlock[]): { content: string; headerId?: str
 
   for (const block of blocks) {
     if (block.type === 'header') {
+      flushChunk();
+      ctx.header(block);
+      continue;
+    }
+
+    // Chunk code blocks separately; they tend to be more important.
+    if (block.type === 'code') {
+      flushChunk();
+      const codeText = block.text.trim();
+      if (codeText) {
+        for (const chunkText of chunkTextByWords(codeText)) {
+          chunks.push({
+            type: 'prose',
+            content: chunkText,
+            headerId: ctx.headerId,
+            tag: 'code',
+            language: block.language,
+            sectionContext: ctx.get(),
+          });
+          ctx.hasContent = true;
+        }
+      }
     }
-      currentHeaderId = block.id;
-      currentHeaderTag = block.tag;
-      // Include header text at the start of the new chunk
-      currentChunk.push(...block.text.split(/\s+/).filter((w) => w.length > 0));
       continue;
     }
 
+    if (block.type !== 'content') continue;
+
     // Split by newlines first to preserve line boundary information
     const lines = block.text.split(/\n/);
     let inCodeBlock = false;
@@ -108,8 +219,8 @@ function chunkByWords(blocks: ContentBlock[]): { content: string; headerId?: str
   for (let lineIdx = 0; lineIdx < lines.length; lineIdx++) {
     const line = lines[lineIdx]!;
 
-    // Track code block boundaries
-    if (/^(`{3,}|~{3,})
+    // Track code block boundaries (standalone fences only)
+    if (/^(`{3,}|~{3,})([a-zA-Z0-9+-]*)?\s*$/.test(line.trim())) {
       inCodeBlock = !inCodeBlock;
     }
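The tightened fence test only toggles code-block state on standalone fence lines, for example:

const fence = /^(`{3,}|~{3,})([a-zA-Z0-9+-]*)?\s*$/;

fence.test('```'); // true (bare fence)
fence.test('```ts'); // true (fence with a language tag)
fence.test('~~~~'); // true (tilde fences of length three or more)
fence.test('``` Then content...'); // false (inline backticks, e.g. in a table cell)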
@@ -155,14 +266,14 @@ function chunkByWords(blocks: ContentBlock[]): { content: string; headerId?: str
 }
 
 /**
- * Parses markdown into content blocks, identifying headers and
- *
+ * Parses markdown into content blocks, identifying headers, content sections, and code blocks.
+ * Code blocks are extracted separately with language metadata for specialized indexing.
  */
 function parseMarkdown(markdown: string): ContentBlock[] {
   const blocks: ContentBlock[] = [];
 
   // Extract title from frontmatter and treat it as h1
-  const frontmatterMatch = markdown.match(/^---\n([\s\S]*?)\n---/);
+  const frontmatterMatch = markdown.match(/^---\r?\n([\s\S]*?)\r?\n---/);
   if (frontmatterMatch) {
     const frontmatter = frontmatterMatch[1]!;
     const titleMatch = frontmatter.match(/^title:\s*(.+)$/m);
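The `\r?\n` additions make frontmatter extraction tolerant of CRLF line endings, e.g.:

const fm = /^---\r?\n([\s\S]*?)\r?\n---/;

fm.test('---\ntitle: Guide\n---\nBody'); // true (LF input)
fm.test('---\r\ntitle: Guide\r\n---\r\nBody'); // true (CRLF input; failed before this change)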
@@ -178,61 +289,99 @@ function parseMarkdown(markdown: string): ContentBlock[] {
   }
 
   // Remove frontmatter
-  const content = markdown.replace(/^---[\s\S]*?---\n*/, '').trim();
+  const content = markdown.replace(/^---[\s\S]*?---\r?\n*/, '').trim();
 
   // Split into lines and process
   const lines = content.split('\n');
   let currentContent: string[] = [];
   let inCodeBlock = false;
+  let codeBlockLanguage: string | undefined;
+  let codeBlockContent: string[] = [];
 
   const flushContent = () => {
     const text = currentContent.join('\n').trim();
     if (text) {
-      blocks.push({ type: 'content', text });
+      blocks.push({ type: 'content', tag: 'p', text });
     }
     currentContent = [];
   };
 
+  const flushCodeBlock = () => {
+    if (codeBlockContent.length > 0) {
+      const code = codeBlockContent.join('\n').trim();
+      if (code) {
+        blocks.push({
+          type: 'code',
+          tag: 'code',
+          text: code,
+          language: codeBlockLanguage || undefined,
+        });
+      }
+    }
+    codeBlockContent = [];
+    codeBlockLanguage = undefined;
+  };
+
   for (const line of lines) {
     // Track fenced code blocks (``` or ~~~)
     // Only match standalone markers: ```[language] with nothing else on the line
     // This avoids matching inline code blocks in table cells like "``` Then content..."
+    const codeBlockMatch = line.match(/^(`{3,}|~{3,})([a-zA-Z0-9+-]*)?\s*$/);
+    if (codeBlockMatch) {
+      if (!inCodeBlock) {
+        flushContent();
+        inCodeBlock = true;
+        codeBlockLanguage = codeBlockMatch[2] || undefined;
+      } else {
+        flushCodeBlock();
+        inCodeBlock = false;
+      }
+      continue;
+    }
+
+    if (inCodeBlock) {
+      codeBlockContent.push(line);
       continue;
     }
 
     // Only match headers outside of code blocks
-    const headerMatch = line.match(/^(#{1,6})\s+(.+)$/);
+    const headerMatch = line.match(/^(#{1,6})\s+(.+)$/);
 
-    }
+    if (headerMatch) {
+      flushContent();
+      const level = headerMatch[1]!.length;
+      const headerText = headerMatch[2]!.trim();
+      blocks.push({
+        type: 'header',
+        tag: `h${level}`,
+        id: slugify(headerText),
+        text: headerText,
+      });
+      continue;
     }
 
     currentContent.push(line);
   }
 
+  flushCodeBlock();
   flushContent();
   return blocks;
 }
 
+export type IndexEntry = {
+  chunk: { id: string; index: number; total: number };
+  id: string;
+  tag: string;
+  content: string;
+  language?: string;
+  sectionContext?: string;
+};
+
 /**
  * Extracts and chunks markdown content for search indexing.
- * Yields
+ * Yields prose and code chunks with section context and language metadata.
  */
-export function* indexMarkdown(markdown: string) {
+export function* indexMarkdown(markdown: string): Generator<IndexEntry> {
   const blocks = parseMarkdown(markdown);
   const chunks = chunkByWords(blocks);
   const documentId = crypto.randomUUID();
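A sketch of consuming the generator, assuming indexMarkdown is imported from this module: the header establishes section context, prose inherits the header's tag, and the fenced block yields a 'code' entry carrying its language:

const md = ['# Setup', '', 'Install the package.', '', '```bash', 'npm install @stainless-api/docs', '```'].join('\n');

for (const entry of indexMarkdown(md)) {
  console.log(entry);
  // { id: 'setup', tag: 'h1', content: 'Install the package.', sectionContext: 'Setup', chunk: { index: 0, total: 2, ... } }
  // { id: 'setup', tag: 'code', language: 'bash', content: 'npm install @stainless-api/docs', sectionContext: 'Setup', ... }
}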
@@ -240,8 +389,10 @@ export function* indexMarkdown(markdown: string) {
   for (const [index, chunk] of chunks.entries()) {
     yield {
       id: chunk.headerId ?? '',
-      tag: chunk.headerTag ?? '',
+      tag: chunk.tag ?? chunk.headerTag ?? '',
       content: chunk.content,
+      ...(chunk.sectionContext ? { sectionContext: chunk.sectionContext } : {}),
+      ...(chunk.language ? { language: chunk.language } : {}),
       chunk: {
         id: documentId,
         index,
@@ -251,64 +402,68 @@ export function* indexMarkdown(markdown: string) {
   }
 }
 
+const DEFAULT_ROOT = 'main';
+const DEFAULT_PATTERN = 'h1, h2, h3, h4, h5, h6, p, li, pre code';
 
+/**
+ * Indexes HTML content for search, with section context and code language extraction.
+ *
+ * Features:
+ * - Tracks header hierarchy to prepend section context (e.g., "Guide > Setup: ...")
+ * - Extracts language metadata from code blocks (class="language-js")
+ * - Uses word-based chunking with sentence boundary detection
+ */
+export function* indexHTML(
+  content: string,
+  root = DEFAULT_ROOT,
+  pattern = DEFAULT_PATTERN,
+): Generator<IndexEntry> {
+  const $ = cheerio.load(content);
+  const matches = $(root).find(pattern);
 
-      const wordSize = Buffer.byteLength(word + ' ', 'utf-8');
+  const ctx = new SectionContext();
 
+  for (const match of matches) {
+    const tagName = match.tagName.toLowerCase();
+    const rawText = $(match).text().trim();
 
+    if (getHeaderLevel(tagName) > 0) {
+      ctx.header({ id: $(match).attr('id') ?? slugify(rawText), tag: tagName, text: rawText });
+      continue;
     }
 
-    return chunks;
-}
-
-export function* indexHTML(content: string, root: string, pattern: string) {
-  const $ = cheerio.load(content);
-  const matches = $(root).find(pattern);
+    // Check if this is a code block and extract language
+    const isCode = tagName === 'code' && $(match).parent().is('pre');
+    let language: string | undefined;
+    if (isCode) {
+      const classes = $(match).attr('class') || '';
+      const langMatch = classes.match(/(?:language-|lang-)([a-zA-Z0-9+-]+)/);
+      language = langMatch ? langMatch[1] : undefined;
+    }
 
-    const
-    const chunks =
+    // Build content with section context
+    const sectionContext = ctx.get();
+    const chunks = chunkTextByWords(rawText);
     const chunkId = crypto.randomUUID();
 
-    for (const [chunkN,
+    for (const [chunkN, chunkText] of chunks.entries()) {
       yield {
-        id: $(match).attr('id'),
-        tag:
-        content,
+        id: ctx.headerId ?? $(match).attr('id') ?? chunkId,
+        tag: isCode ? 'code' : tagName,
+        content: chunkText,
+        ...(sectionContext ? { sectionContext } : {}),
+        ...(language && { language }),
         chunk: {
          id: chunkId,
          index: chunkN,
          total: chunks.length,
        },
      };
+      ctx.hasContent = true;
     }
   }
 }
 
-const root = 'main';
-const pattern = 'h1, h2, h3, h4, h5, h6, p, li';
-
 export function stainlessDocsAlgoliaProseIndexing({
   apiReferenceBasePath,
 }: {
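And the HTML path, sketched on a small fragment (assuming indexHTML is imported): headers set the section context, and pre > code elements surface as 'code' entries with a language parsed from their class attribute:

const html = `
  <main>
    <h2 id="auth">Authentication</h2>
    <p>Pass your API key in the Authorization header.</p>
    <pre><code class="language-ts">const client = new Client(apiKey);</code></pre>
  </main>`;

for (const entry of indexHTML(html)) {
  console.log(entry);
  // { id: 'auth', tag: 'p', content: 'Pass your API key in the Authorization header.', sectionContext: 'Authentication', ... }
  // { id: 'auth', tag: 'code', language: 'ts', content: 'const client = new Client(apiKey);', sectionContext: 'Authentication', ... }
}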
@@ -338,7 +493,7 @@ export function stainlessDocsAlgoliaProseIndexing({
   const objects = [];
   for (const absHtmlPath of pagesToRender) {
     const content = await readFile(absHtmlPath, 'utf-8');
-    const idx = indexHTML(content, root, pattern);
+    const idx = indexHTML(content);
     for (const entry of idx)
       objects.push({
         ...entry,
@@ -403,6 +558,7 @@ export function stainlessDocsVectorProseIndexing(
     id: string;
     tag: string;
     content: string;
+    language?: string;
     kind: 'prose';
     source: string;
   }[] = [];
@@ -412,7 +568,7 @@ export function stainlessDocsVectorProseIndexing(
 
   if (markdown) {
     const idx = indexMarkdown(markdown);
-    for (const { chunk, ...entry } of idx)
+    for (const { chunk: _, ...entry } of idx)
       objects.push({
         ...entry,
         kind: 'prose',