@stainless-api/docs 0.1.0-beta.62 → 0.1.0-beta.64

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,21 @@
  # @stainless-api/docs

+ ## 0.1.0-beta.64
+
+ ### Patch Changes
+
+ - add markdown indexing functionality
+
+ ## 0.1.0-beta.63
+
+ ### Patch Changes
+
+ - d2e3686: add a success state to the copy markdown action
+ - Updated dependencies [5d6e5fa]
+   - @stainless-api/ui-primitives@0.1.0-beta.39
+   - @stainless-api/docs-ui@0.1.0-beta.52
+   - @stainless-api/docs-search@0.1.0-beta.4
+
  ## 0.1.0-beta.62

  ### Patch Changes
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@stainless-api/docs",
-   "version": "0.1.0-beta.62",
+   "version": "0.1.0-beta.64",
    "publishConfig": {
      "access": "public"
    },
@@ -52,9 +52,9 @@
      "vite-plugin-prebundle-workers": "^0.2.0",
      "web-worker": "^1.5.0",
      "yaml": "^2.8.2",
-     "@stainless-api/docs-ui": "0.1.0-beta.51",
-     "@stainless-api/docs-search": "0.1.0-beta.3",
-     "@stainless-api/ui-primitives": "0.1.0-beta.38"
+     "@stainless-api/docs-search": "0.1.0-beta.4",
+     "@stainless-api/ui-primitives": "0.1.0-beta.39",
+     "@stainless-api/docs-ui": "0.1.0-beta.52"
    },
    "devDependencies": {
      "@astrojs/check": "^0.9.6",
@@ -226,8 +226,16 @@ export function wireAIDropdown() {
        onSelect: (value) => {
          triggerOption(value);
        },
-       onPrimaryAction: () => {
+       onPrimaryAction: (el) => {
          triggerOption(primaryAction.id);
+         const span = el.querySelector('[data-part="primary-action-text"]');
+         if (span) {
+           const originalContent = span.textContent;
+           span.textContent = 'Copied!';
+           setTimeout(() => {
+             span.textContent = originalContent;
+           }, 2000);
+         }
        },
      });
    });
@@ -5,7 +5,250 @@ import { bold } from '../shared/terminalUtils';
  import { buildProseIndex } from '@stainless-api/docs-search/providers/algolia';
  import * as cheerio from 'cheerio';

- function chunkByWords(content: string, chunkSize: number = 30000, chunkOverlap: number = 10) {
+ interface ContentBlock {
+   type: 'header' | 'content';
+   tag?: string;
+   id?: string;
+   text: string;
+ }
+
+ // Chunking configuration
+ // We target 64-256 tokens per chunk, using ~1.3 tokens/word for English text
+ const TOKENS_PER_WORD = 1.3;
+ const MIN_TOKENS = 64;
+ const MAX_TOKENS = 256;
+ const MIN_WORDS = Math.floor(MIN_TOKENS / TOKENS_PER_WORD); // ~49 words
+ const MAX_WORDS = Math.floor(MAX_TOKENS / TOKENS_PER_WORD); // ~197 words
+ const LINE_BREAK_WORDS = Math.floor((MAX_TOKENS * 0.75) / TOKENS_PER_WORD); // ~148 words
+ const SENTENCE_BREAK_WORDS = Math.floor((MAX_TOKENS * 0.875) / TOKENS_PER_WORD); // ~172 words
+
+ // Generate a URL-safe ID from header text (e.g., "OpenAPI Config" -> "openapi-config")
+ function slugify(text: string): string {
+   return text
+     .toLowerCase()
+     .replace(/`/g, '') // Remove backticks
+     .replace(/[^a-z0-9]+/g, '-') // Replace non-alphanumeric with hyphens
+     .replace(/^-|-$/g, ''); // Trim leading/trailing hyphens
+ }
+
+ // Check if a word ends with a real table cell boundary (| but not escaped \|)
+ function isTableCellBoundary(word: string): boolean {
+   return word.endsWith('|') && !word.endsWith('\\|');
+ }
+
+ /**
+  * Chunks content blocks into segments of 64-256 tokens.
+  *
+  * Chunking strategy:
+  * 1. Break at headers if chunk has >= MIN_WORDS, otherwise merge with next section
+  * 2. Prefer breaking at line/table boundaries after LINE_BREAK_WORDS (~148 words / ~192 tokens)
+  * 3. Break at sentence endings after SENTENCE_BREAK_WORDS (~172 words / ~224 tokens)
+  * 4. Force break at MAX_WORDS, preferring table row boundaries if available
+  * 5. Header context (id/tag) is preserved for continuation chunks
+  */
+ function chunkByWords(blocks: ContentBlock[]): { content: string; headerId?: string; headerTag?: string }[] {
+   const chunks: { content: string; headerId?: string; headerTag?: string }[] = [];
+
+   let currentChunk: string[] = [];
+   let currentHeaderId: string | undefined;
+   let currentHeaderTag: string | undefined;
+
+   // Flush current chunk to output. If splitAt is provided, keep words after that index for next chunk.
+   const flushChunk = (splitAt?: number) => {
+     if (currentChunk.length === 0) return;
+
+     const wordsToFlush = splitAt !== undefined ? currentChunk.slice(0, splitAt) : currentChunk;
+     const wordsToKeep = splitAt !== undefined ? currentChunk.slice(splitAt) : [];
+
+     if (wordsToFlush.length > 0) {
+       chunks.push({
+         content: wordsToFlush.join(' ').trim(),
+         headerId: currentHeaderId,
+         headerTag: currentHeaderTag,
+       });
+     }
+     currentChunk = wordsToKeep;
+   };
+
+   // Find a table row boundary to break at (between MIN_WORDS and current length)
+   // Returns the index to split at, or undefined if no good boundary found
+   const findTableRowBoundary = (): number | undefined => {
+     for (let i = currentChunk.length - 1; i >= MIN_WORDS; i--) {
+       const word = currentChunk[i]!;
+       const nextWord = currentChunk[i + 1];
+       // A row boundary is where one cell ends (|) and the next row starts (|)
+       if (isTableCellBoundary(word) && nextWord?.startsWith('|')) {
+         return i + 1;
+       }
+     }
+     return undefined;
+   };
+
+   for (const block of blocks) {
+     if (block.type === 'header') {
+       // Flush at header boundaries only if chunk meets minimum size
+       // This avoids creating tiny chunks for headers with little content
+       if (currentChunk.length >= MIN_WORDS) {
+         flushChunk();
+       }
+       currentHeaderId = block.id;
+       currentHeaderTag = block.tag;
+       // Include header text at the start of the new chunk
+       currentChunk.push(...block.text.split(/\s+/).filter((w) => w.length > 0));
+       continue;
+     }
+
+     // Split by newlines first to preserve line boundary information
+     const lines = block.text.split(/\n/);
+     let inCodeBlock = false;
+
+     for (let lineIdx = 0; lineIdx < lines.length; lineIdx++) {
+       const line = lines[lineIdx]!;
+
+       // Track code block boundaries
+       if (/^(`{3,}|~{3,})/.test(line.trim())) {
+         inCodeBlock = !inCodeBlock;
+       }
+
+       // Calculate indentation level (number of leading spaces, treating tabs as 2 spaces)
+       const indentMatch = line.match(/^(\s*)/);
+       const indentLevel = indentMatch ? indentMatch[1]!.replace(/\t/g, ' ').length : 0;
+
+       const words = line.split(/\s+/).filter((w) => w.length > 0);
+       const isLastLine = lineIdx === lines.length - 1;
+
+       for (let wordIdx = 0; wordIdx < words.length; wordIdx++) {
+         const word = words[wordIdx]!;
+         const isEndOfLine = wordIdx === words.length - 1 && !isLastLine;
+
+         if (currentChunk.length >= MAX_WORDS) {
+           flushChunk(findTableRowBoundary());
+         }
+
+         currentChunk.push(word);
+
+         // In code blocks, avoid early flushes to keep blocks together
+         // - Light indentation (2+ spaces): require more words before flushing
+         // - Deep indentation (4+ spaces): skip early flushes entirely
+         const inShallowCode = inCodeBlock && indentLevel >= 2 && indentLevel < 4;
+         const inDeepCode = inCodeBlock && indentLevel >= 4;
+
+         // Flush early at natural break points
+         const len = currentChunk.length;
+         const atTableBreak = len >= LINE_BREAK_WORDS && isTableCellBoundary(word);
+         // Shallow code: only flush at sentence threshold; Deep code: don't flush early
+         const lineBreakThreshold = inShallowCode ? SENTENCE_BREAK_WORDS : LINE_BREAK_WORDS;
+         const atLineBreak = len >= lineBreakThreshold && isEndOfLine && !inDeepCode;
+         const atSentenceBreak = len >= SENTENCE_BREAK_WORDS && /[.!?]["']?$/.test(word) && !inDeepCode;
+         if (atTableBreak || atLineBreak || atSentenceBreak) {
+           flushChunk();
+         }
+       }
+     }
+   }
+
+   flushChunk();
+   return chunks;
+ }
+
+ /**
+  * Parses markdown into content blocks, identifying headers and content sections.
+  * Tracks fenced code blocks to avoid treating # comments in code as headers.
+  */
+ function parseMarkdown(markdown: string): ContentBlock[] {
+   const blocks: ContentBlock[] = [];
+
+   // Extract title from frontmatter and treat it as h1
+   const frontmatterMatch = markdown.match(/^---\n([\s\S]*?)\n---/);
+   if (frontmatterMatch) {
+     const frontmatter = frontmatterMatch[1]!;
+     const titleMatch = frontmatter.match(/^title:\s*(.+)$/m);
+     if (titleMatch) {
+       const title = titleMatch[1]!.trim().replace(/^["']|["']$/g, ''); // Remove quotes if present
+       blocks.push({
+         type: 'header',
+         tag: 'h1',
+         id: slugify(title),
+         text: title,
+       });
+     }
+   }
+
+   // Remove frontmatter
+   const content = markdown.replace(/^---[\s\S]*?---\n*/, '').trim();
+
+   // Split into lines and process
+   const lines = content.split('\n');
+   let currentContent: string[] = [];
+   let inCodeBlock = false;
+
+   const flushContent = () => {
+     const text = currentContent.join('\n').trim();
+     if (text) {
+       blocks.push({ type: 'content', text });
+     }
+     currentContent = [];
+   };
+
+   for (const line of lines) {
+     // Track fenced code blocks (``` or ~~~)
+     // Only match standalone markers: ```[language] with nothing else on the line
+     // This avoids matching inline code blocks in table cells like "``` Then content..."
+     if (/^(`{3,}|~{3,})([a-zA-Z0-9]*)?(\s*)$/.test(line)) {
+       inCodeBlock = !inCodeBlock;
+       currentContent.push(line);
+       continue;
+     }
+
+     // Only match headers outside of code blocks
+     if (!inCodeBlock) {
+       const headerMatch = line.match(/^(#{1,6})\s+(.+)$/);
+
+       if (headerMatch) {
+         flushContent();
+         const level = headerMatch[1]!.length;
+         const headerText = headerMatch[2]!.trim();
+         blocks.push({
+           type: 'header',
+           tag: `h${level}`,
+           id: slugify(headerText),
+           text: headerText,
+         });
+         continue;
+       }
+     }
+
+     currentContent.push(line);
+   }
+
+   flushContent();
+   return blocks;
+ }
+
+ /**
+  * Extracts and chunks markdown content for search indexing.
+  * Yields chunk objects with content, header context, and chunk metadata.
+  */
+ export function* indexMarkdown(markdown: string) {
+   const blocks = parseMarkdown(markdown);
+   const chunks = chunkByWords(blocks);
+   const documentId = crypto.randomUUID();
+
+   for (const [index, chunk] of chunks.entries()) {
+     yield {
+       id: chunk.headerId ?? '',
+       tag: chunk.headerTag ?? '',
+       content: chunk.content,
+       chunk: {
+         id: documentId,
+         index,
+         total: chunks.length,
+       },
+     };
+   }
+ }
+
+ function chunkHTMLByWords(content: string, chunkSize: number = 30000, chunkOverlap: number = 10) {
    if (Buffer.byteLength(content) < chunkSize) return [content];

    const words = content.split(/\s+/);
@@ -42,7 +285,7 @@ export function* indexHTML(content: string, root: string, pattern: string) {

    for (const match of matches) {
      const rawText = $(match).text().trim();
-     const chunks = chunkByWords(rawText);
+     const chunks = chunkHTMLByWords(rawText);
      const chunkId = crypto.randomUUID();

      for (const [chunkN, content] of chunks.entries()) {
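
For reference, here is a minimal sketch of how the new `indexMarkdown` generator introduced in 0.1.0-beta.64 might be consumed. The relative import path is an assumption (the diff does not name the indexing module), and the sample markdown is purely illustrative.

```ts
// Hypothetical import path: the diff above does not show the module's filename.
import { indexMarkdown } from './indexing';

const markdown = `---
title: OpenAPI Config
---

## Getting started

Prose here is parsed into header/content blocks, then chunked into 64-256 token segments.
`;

// indexMarkdown is a generator; each record carries the chunk text,
// its nearest header (id/tag), and chunk metadata (shared document id, index, total).
for (const record of indexMarkdown(markdown)) {
  console.log(record.chunk.index + 1, 'of', record.chunk.total, '->', record.id || '(no header)');
}
```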