doc-fetch-cli 2.0.4 → 2.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/bin/doc-fetch_darwin_amd64 +0 -0
- package/bin/doc-fetch_windows_amd64.exe +0 -0
- package/doc-fetch +0 -0
- package/doc-fetch_darwin_amd64 +0 -0
- package/doc-fetch_darwin_arm64 +0 -0
- package/doc-fetch_linux_amd64 +0 -0
- package/doc-fetch_windows_amd64.exe +0 -0
- package/package.json +1 -1
- package/website/BLOG-SETUP-SUMMARY.md +385 -0
- package/website/DEPLOYMENT.md +189 -0
- package/website/LAUNCH-CHECKLIST.md +134 -0
- package/website/README.md +75 -0
- package/website/SEO-STRATEGY.md +347 -0
- package/website/URL-STRUCTURE.md +334 -0
- package/website/WEBSITE-SUMMARY.md +246 -0
- package/website/package-lock.json +1628 -0
- package/website/package.json +39 -0
- package/website/pnpm-lock.yaml +1061 -0
- package/website/src/app.d.ts +13 -0
- package/website/src/app.html +11 -0
- package/website/src/lib/actions/addCopyButtons.ts +73 -0
- package/website/src/lib/assets/favicon.svg +1 -0
- package/website/src/lib/components/CopyCodeButton.svelte +97 -0
- package/website/src/lib/components/DarkModeToggle.svelte +140 -0
- package/website/src/lib/components/ReadingProgress.svelte +36 -0
- package/website/src/lib/components/RelatedPosts.svelte +151 -0
- package/website/src/lib/components/TableOfContents.svelte +184 -0
- package/website/src/lib/index.ts +1 -0
- package/website/src/lib/posts/convert-docs-to-markdown.md +506 -0
- package/website/src/routes/+layout.svelte +59 -0
- package/website/src/routes/+page.svelte +1033 -0
- package/website/src/routes/about/+page.svelte +607 -0
- package/website/src/routes/blog/+page.svelte +486 -0
- package/website/src/routes/blog/[slug]/+page.svelte +988 -0
- package/website/src/routes/blog/[slug]/+page.ts +53 -0
- package/website/src/routes/sitemap.xml/+server.ts +62 -0
- package/website/static/favicon.svg +10 -0
- package/website/static/og.png +2 -0
- package/website/static/og.svg +26 -0
- package/website/static/robots.txt +43 -0
- package/website/svelte.config.js +13 -0
- package/website/tsconfig.json +20 -0
- package/website/vite.config.ts +6 -0
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
<script lang="ts">
|
|
2
|
+
import { onMount, tick } from 'svelte';
|
|
3
|
+
|
|
4
|
+
// Prop: CSS selector of the container whose headings are listed (looked up with document.querySelector).
export let contentSelector = '.content';
|
|
5
|
+
// Prop: CSS selector, evaluated inside that container, that picks the elements turned into entries.
export let headingSelector = 'h2, h3';
|
|
6
|
+
// Prop: text rendered as the list's visible title and used as the <nav> aria-label.
export let title = 'On this page';
|
|
7
|
+
|
|
8
|
+
// Entries shown in the list: each target element's id, its text content, and a nesting level (2 or 3).
let headings: Array<{
|
|
9
|
+
id: string;
|
|
10
|
+
text: string;
|
|
11
|
+
level: number;
|
|
12
|
+
}> = [];
|
|
13
|
+
|
|
14
|
+
// id of the entry currently marked active in the list; empty until a heading is picked.
let activeId = '';
|
|
15
|
+
// Created by setupObserver(); stays null until then and is disconnected when the component is destroyed.
let observer: IntersectionObserver | null = null;
|
|
16
|
+
|
|
17
|
+
// Build the heading list and start observing once the component is in the DOM.
onMount(() => {
	// Set by the cleanup below. The tick() callback runs asynchronously, so the
	// component may already be gone by the time it fires; without this guard an
	// observer would be created after cleanup and never disconnected.
	let destroyed = false;

	// Wait for pending state changes to be flushed to the DOM before querying it.
	tick().then(() => {
		if (destroyed) return;
		extractHeadings();
		setupObserver();
	});

	return () => {
		destroyed = true;
		observer?.disconnect();
		observer = null;
	};
});
|
|
27
|
+
|
|
28
|
+
/**
 * Collects the elements matching `headingSelector` inside the `contentSelector`
 * container and stores one entry per element in `headings`.
 *
 * Elements without an id get one derived from their text (lower-cased, runs of
 * characters outside a-z/0-9 collapsed to "-"). The generated id is always
 * non-empty and unique in the document: text that yields no slug falls back to
 * "section", and a numeric suffix (-2, -3, ...) is appended when the id is
 * already taken. Existing ids are never changed.
 *
 * Does nothing when the container is not found.
 */
function extractHeadings() {
	const content = document.querySelector(contentSelector);
	if (!content) return;

	headings = Array.from(content.querySelectorAll(headingSelector)).map((el) => {
		if (!el.id) {
			const slug =
				(el.textContent ?? '')
					.toLowerCase()
					.replace(/[^a-z0-9]+/g, '-')
					.replace(/(^-|-$)/g, '') || 'section';

			// Links and the observer resolve targets with getElementById, which only
			// ever returns the first match, so a duplicate id would make later
			// headings unreachable.
			let candidate = slug;
			let suffix = 2;
			while (document.getElementById(candidate)) {
				candidate = `${slug}-${suffix}`;
				suffix += 1;
			}
			el.id = candidate;
		}

		return {
			id: el.id,
			text: el.textContent ?? '',
			// Anything that is not an H2 is rendered with the indented "level-3" style.
			level: el.tagName === 'H2' ? 2 : 3
		};
	});
}
|
|
49
|
+
|
|
50
|
+
/**
 * Creates the IntersectionObserver that keeps `activeId` in sync with scrolling
 * and registers every element listed in `headings` with it.
 */
function setupObserver() {
	// The last intersecting entry in a batch wins.
	const markActive = (entries: IntersectionObserverEntry[]) => {
		for (const entry of entries) {
			if (!entry.isIntersecting) continue;
			activeId = entry.target.id;
		}
	};

	// root: null observes against the viewport. The negative margins shrink the
	// observed area by 20% from the top and 80% from the bottom, leaving a thin
	// line one fifth of the way down; a heading counts as intersecting while it
	// crosses that line.
	const io = new IntersectionObserver(markActive, {
		root: null,
		rootMargin: '-20% 0px -80% 0px',
		threshold: 0
	});
	observer = io;

	for (const { id } of headings) {
		const target = document.getElementById(id);
		if (target) io.observe(target);
	}
}
|
|
70
|
+
|
|
71
|
+
/**
 * Click handler for a list entry: smooth-scrolls to the element with the given
 * id instead of letting the browser jump, records the hash in the history, and
 * marks the entry active.
 *
 * @param id - id of the element to scroll to
 * @param event - the click event; its default navigation is always cancelled
 */
function scrollToHeading(id: string, event: MouseEvent) {
	event.preventDefault();

	const target = document.getElementById(id);
	// Unknown id: the default action is already cancelled, nothing else to do.
	if (!target) return;

	target.scrollIntoView({ block: 'start', behavior: 'smooth' });
	history.pushState(null, '', `#${id}`);
	activeId = id;
}
|
|
80
|
+
</script>
|
|
81
|
+
|
|
82
|
+
<!-- Render nothing when no headings were found. -->
{#if headings.length > 0}
	<nav class="toc" aria-label={title}>
		<h3 class="toc-title">{title}</h3>
		<ul class="toc-list">
			{#each headings as heading}
				<li class="toc-item level-{heading.level}">
					<!-- aria-current exposes the highlighted entry to assistive technology, not only visually. -->
					<a
						href={`#${heading.id}`}
						class:active={activeId === heading.id}
						aria-current={activeId === heading.id ? 'location' : undefined}
						on:click={(e) => scrollToHeading(heading.id, e)}
					>
						{heading.text}
					</a>
				</li>
			{/each}
		</ul>
	</nav>
{/if}
|
|
100
|
+
|
|
101
|
+
<style>
|
|
102
|
+
.toc {
|
|
103
|
+
position: sticky;
|
|
104
|
+
top: 6rem;
|
|
105
|
+
background: var(--bg-secondary);
|
|
106
|
+
padding: 1.5rem;
|
|
107
|
+
border-radius: 8px;
|
|
108
|
+
border: 1px solid var(--border);
|
|
109
|
+
max-height: calc(100vh - 8rem);
|
|
110
|
+
overflow-y: auto;
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
.toc-title {
|
|
114
|
+
font-size: 0.875rem;
|
|
115
|
+
font-weight: 700;
|
|
116
|
+
text-transform: uppercase;
|
|
117
|
+
letter-spacing: 0.05em;
|
|
118
|
+
color: var(--text-muted);
|
|
119
|
+
margin: 0 0 1rem;
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
.toc-list {
|
|
123
|
+
list-style: none;
|
|
124
|
+
padding: 0;
|
|
125
|
+
margin: 0;
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
.toc-item {
|
|
129
|
+
margin-bottom: 0.5rem;
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
.toc-item a {
|
|
133
|
+
display: block;
|
|
134
|
+
padding: 0.5rem 0.75rem;
|
|
135
|
+
color: var(--text-secondary);
|
|
136
|
+
text-decoration: none;
|
|
137
|
+
font-size: 0.9rem;
|
|
138
|
+
line-height: 1.5;
|
|
139
|
+
border-radius: 4px;
|
|
140
|
+
transition: all 0.2s;
|
|
141
|
+
border-left: 2px solid transparent;
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
.toc-item a:hover {
|
|
145
|
+
background: rgba(0, 102, 204, 0.05);
|
|
146
|
+
color: var(--accent);
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
.toc-item a.active {
|
|
150
|
+
background: rgba(0, 102, 204, 0.1);
|
|
151
|
+
color: var(--accent);
|
|
152
|
+
border-left-color: var(--accent);
|
|
153
|
+
font-weight: 600;
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
.toc-item.level-3 a {
|
|
157
|
+
padding-left: 1.5rem;
|
|
158
|
+
font-size: 0.85rem;
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
/* Scrollbar styling */
|
|
162
|
+
.toc::-webkit-scrollbar {
|
|
163
|
+
width: 6px;
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
.toc::-webkit-scrollbar-track {
|
|
167
|
+
background: transparent;
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
.toc::-webkit-scrollbar-thumb {
|
|
171
|
+
background: var(--border);
|
|
172
|
+
border-radius: 3px;
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
.toc::-webkit-scrollbar-thumb:hover {
|
|
176
|
+
background: var(--text-muted);
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
@media (max-width: 1200px) {
|
|
180
|
+
.toc {
|
|
181
|
+
display: none;
|
|
182
|
+
}
|
|
183
|
+
}
|
|
184
|
+
</style>
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
// place files you want to import through the `$lib` alias in this folder.
|
|
@@ -0,0 +1,506 @@
|
|
|
1
|
+
## The Problem: AI Agents Can't Browse Documentation Like Humans
|
|
2
|
+
|
|
3
|
+
If you've worked with Large Language Models (LLMs), you've probably hit this wall:
|
|
4
|
+
|
|
5
|
+
**You**: "How do I implement middleware in Fastify?"
|
|
6
|
+
**LLM**: *gives generic answer based on training data from 2024*
|
|
7
|
+
|
|
8
|
+
The real issue? **Your AI agent can't navigate documentation websites.** It can't:
|
|
9
|
+
- Click through sidebar navigation
|
|
10
|
+
- Scroll between related sections
|
|
11
|
+
- Jump from tutorial to API reference
|
|
12
|
+
- Understand the site's information architecture
|
|
13
|
+
|
|
14
|
+
Humans browse docs nonlinearly. LLMs need **complete context in a single prompt**.
|
|
15
|
+
|
|
16
|
+
This guide shows you how to solve that problem by converting entire documentation sites into clean, AI-ready markdown files.
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## Why Markdown? Why Not Just Paste URLs?
|
|
21
|
+
|
|
22
|
+
### The URL Problem
|
|
23
|
+
|
|
24
|
+
When you give an LLM a URL, here's what happens:
|
|
25
|
+
|
|
26
|
+
1. **Most LLMs can't access live URLs** (security restriction)
|
|
27
|
+
2. **Those that can access URLs fetch only one page** (not the whole docs)
|
|
28
|
+
3. **HTML is noisy** (navigation, ads, scripts waste tokens)
|
|
29
|
+
4. **No semantic structure** (the LLM can't distinguish an API reference from a tutorial)
|
|
30
|
+
|
|
31
|
+
### The Markdown Solution
|
|
32
|
+
|
|
33
|
+
Markdown solves all four problems:
|
|
34
|
+
|
|
35
|
+
```markdown
|
|
36
|
+
# Fastify Middleware Guide
|
|
37
|
+
|
|
38
|
+
## Understanding Hooks
|
|
39
|
+
|
|
40
|
+
Fastify hooks are lifecycle methods that execute at specific points...
|
|
41
|
+
|
|
42
|
+
## onRequest Hook
|
|
43
|
+
|
|
44
|
+
The `onRequest` hook fires before the request is processed...
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
✅ **Clean format** - No HTML bloat
|
|
48
|
+
✅ **Semantic structure** - Headers show hierarchy
|
|
49
|
+
✅ **Token efficient** - Only content, no navigation
|
|
50
|
+
✅ **Version control friendly** - Diffable, searchable
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## Step-by-Step: Converting Docs to Markdown
|
|
55
|
+
|
|
56
|
+
### Method 1: Manual Copy-Paste (Don't Do This)
|
|
57
|
+
|
|
58
|
+
**Process:**
|
|
59
|
+
1. Open documentation site
|
|
60
|
+
2. Select all text on page
|
|
61
|
+
3. Copy to clipboard
|
|
62
|
+
4. Paste into document
|
|
63
|
+
5. Repeat for every page
|
|
64
|
+
6. Manually organize sections
|
|
65
|
+
|
|
66
|
+
**Time required:** 2-3 hours for medium-sized docs
|
|
67
|
+
**Error rate:** High (missed pages, inconsistent formatting)
|
|
68
|
+
**Verdict:** ❌ Not scalable
|
|
69
|
+
|
|
70
|
+
### Method 2: Browser Extensions (Limited)
|
|
71
|
+
|
|
72
|
+
Tools like **Mercury Parser** or **Readability** can extract article content:
|
|
73
|
+
|
|
74
|
+
**Pros:**
|
|
75
|
+
- Quick for single pages
|
|
76
|
+
- Removes navigation automatically
|
|
77
|
+
|
|
78
|
+
**Cons:**
|
|
79
|
+
- One page at a time
|
|
80
|
+
- Inconsistent results across sites
|
|
81
|
+
- No batch processing
|
|
82
|
+
|
|
83
|
+
**Verdict:** ⚠️ Okay for occasional use, not for complete docs
|
|
84
|
+
|
|
85
|
+
### Method 3: Automated Documentation Fetchers (Recommended)
|
|
86
|
+
|
|
87
|
+
Specialized tools designed for this exact problem.
|
|
88
|
+
|
|
89
|
+
#### Option A: DocFetch (Full Disclosure: I Built This)
|
|
90
|
+
|
|
91
|
+
**What it does:**
|
|
92
|
+
- Crawls entire documentation site
|
|
93
|
+
- Extracts clean content from each page
|
|
94
|
+
- Converts to structured markdown
|
|
95
|
+
- Generates semantic index (llm.txt)
|
|
96
|
+
- Outputs single file with all docs
|
|
97
|
+
|
|
98
|
+
**Installation:**
|
|
99
|
+
```bash
|
|
100
|
+
npm install -g doc-fetch-cli
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
**Usage:**
|
|
104
|
+
```bash
|
|
105
|
+
doc-fetch --url https://fastify.dev/docs/latest/ \
|
|
106
|
+
--output ./fastify-docs.md \
|
|
107
|
+
--depth 4 \
|
|
108
|
+
--llm-txt
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
**Output:**
|
|
112
|
+
- `fastify-docs.md` - Complete documentation (all pages)
|
|
113
|
+
- `fastify-docs.llm.txt` - Semantic index for AI navigation
|
|
114
|
+
|
|
115
|
+
**Time required:** 2-5 minutes
|
|
116
|
+
**Verdict:** ✅ Best for production use
|
|
117
|
+
|
|
118
|
+
#### Option B: Custom Script (For Control Freaks)
|
|
119
|
+
|
|
120
|
+
If you want full control, build your own scraper:
|
|
121
|
+
|
|
122
|
+
**Tech stack:**
|
|
123
|
+
- Node.js + Puppeteer (or Playwright)
|
|
124
|
+
- Cheerio for HTML parsing
|
|
125
|
+
- Turndown for HTML→Markdown conversion
|
|
126
|
+
|
|
127
|
+
**Basic implementation:**
|
|
128
|
+
```javascript
|
|
129
|
+
import puppeteer from 'puppeteer';
|
|
130
|
+
import TurndownService from 'turndown';
|
|
131
|
+
|
|
132
|
+
const turndown = new TurndownService();
|
|
133
|
+
|
|
134
|
+
// Launches a headless browser, runs the crawl, and always shuts the browser down.
async function fetchDocs(baseUrl) {
  const browser = await puppeteer.launch();

  try {
    const page = await browser.newPage();

    // Crawl logic here...
    // Extract content, convert to markdown
    // Save to file
  } finally {
    // Close the browser even if crawling throws, so no browser process is left running.
    await browser.close();
  }
}
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
**Time required:** 4-8 hours to build, 5 min per run
|
|
147
|
+
**Verdict:** ⚠️ Only if you have specific needs
|
|
148
|
+
|
|
149
|
+
---
|
|
150
|
+
|
|
151
|
+
## Advanced: Generating LLM.txt Index
|
|
152
|
+
|
|
153
|
+
Here's where it gets powerful.
|
|
154
|
+
|
|
155
|
+
### What is LLM.txt?
|
|
156
|
+
|
|
157
|
+
It's a **semantic roadmap** for your AI agents:
|
|
158
|
+
|
|
159
|
+
```txt
|
|
160
|
+
# llm.txt - Fastify Documentation Index
|
|
161
|
+
|
|
162
|
+
[GUIDE] Getting Started
|
|
163
|
+
https://fastify.dev/docs/latest/guides/
|
|
164
|
+
Covers installation, quick start, and first server.
|
|
165
|
+
|
|
166
|
+
[API] Server Options
|
|
167
|
+
https://fastify.dev/docs/latest/Reference/Server/
|
|
168
|
+
Complete API reference for Fastify server configuration.
|
|
169
|
+
|
|
170
|
+
[TUTORIAL] Building a REST API
|
|
171
|
+
https://fastify.dev/docs/latest/guides/rest-api/
|
|
172
|
+
Step-by-step tutorial for creating REST APIs with Fastify.
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
### Why This Matters
|
|
176
|
+
|
|
177
|
+
Without llm.txt:
|
|
178
|
+
```
|
|
179
|
+
You: "How do I add middleware?"
|
|
180
|
+
LLM: *searches entire 500KB docs file, might miss relevant section*
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
With llm.txt:
|
|
184
|
+
```
|
|
185
|
+
You: "Check the [API] Hooks section in llm.txt"
|
|
186
|
+
LLM: *jumps directly to onRequest/onSend hook documentation*
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
### How to Generate LLM.txt
|
|
190
|
+
|
|
191
|
+
**With DocFetch:**
|
|
192
|
+
```bash
|
|
193
|
+
doc-fetch --url https://docs.example.com --llm-txt
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
**Manual approach:**
|
|
197
|
+
1. Categorize each page (Guide, API Reference, Tutorial, Example)
|
|
198
|
+
2. Write 1-sentence description
|
|
199
|
+
3. Preserve original URL
|
|
200
|
+
4. Format as shown above
|
|
201
|
+
|
|
202
|
+
**Pro tip:** Use AI to help categorize! Feed it page titles and ask for classification.
|
|
203
|
+
|
|
204
|
+
---
|
|
205
|
+
|
|
206
|
+
## Cleaning Strategies for Better Results
|
|
207
|
+
|
|
208
|
+
Not all documentation is created equal. Here's how to handle common issues:
|
|
209
|
+
|
|
210
|
+
### Problem 1: Navigation Leakage
|
|
211
|
+
|
|
212
|
+
**Symptom:** Your markdown includes "Home > Docs > Guide" breadcrumbs
|
|
213
|
+
|
|
214
|
+
**Solution:** Configure your fetcher to exclude common nav selectors:
|
|
215
|
+
```css
|
|
216
|
+
nav, .breadcrumb, .sidebar, .table-of-contents, footer
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
### Problem 2: Code Block Formatting
|
|
220
|
+
|
|
221
|
+
**Symptom:** Code examples lose syntax highlighting or indentation
|
|
222
|
+
|
|
223
|
+
**Solution:** Ensure your converter preserves `<pre><code>` blocks:
|
|
224
|
+
```javascript
|
|
225
|
+
turndown.addRule('codeBlock', {
  filter: ['pre'],
  replacement: (content, node) => {
    const code = node.querySelector('code');
    // Highlighters mark the language with a "language-xxx" (or "lang-xxx") class, often
    // next to other classes. Keep only the language name so the fence is tagged "js"
    // rather than "language-js hljs".
    const language = code?.className.match(/\blang(?:uage)?-([\w+#-]+)/)?.[1] ?? '';
    return `\`\`\`${language}\n${code?.textContent || content}\n\`\`\``;
  }
});
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
### Problem 3: Relative Links Broken
|
|
236
|
+
|
|
237
|
+
**Symptom:** Links like `./guide.md` don't work
|
|
238
|
+
|
|
239
|
+
**Solution:** Convert to absolute URLs during fetch:
|
|
240
|
+
```javascript
|
|
241
|
+
const absoluteUrl = new URL(relativePath, baseUrl).href;
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
### Problem 4: Duplicate Content
|
|
245
|
+
|
|
246
|
+
**Symptom:** Same content appears on multiple pages (common with intro sections)
|
|
247
|
+
|
|
248
|
+
**Solution:** Deduplicate during merge:
|
|
249
|
+
- Hash each section
|
|
250
|
+
- Skip if hash already seen
|
|
251
|
+
- Keep first occurrence
|
|
252
|
+
|
|
253
|
+
---
|
|
254
|
+
|
|
255
|
+
## Real-World Examples
|
|
256
|
+
|
|
257
|
+
### Example 1: React Documentation
|
|
258
|
+
|
|
259
|
+
**Command:**
|
|
260
|
+
```bash
|
|
261
|
+
doc-fetch --url https://react.dev/learn \
|
|
262
|
+
--output react-learn.md \
|
|
263
|
+
--depth 3 \
|
|
264
|
+
--concurrent 10 \
|
|
265
|
+
--llm-txt
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
**Result:**
|
|
269
|
+
- 127 pages converted
|
|
270
|
+
- 450KB markdown file
|
|
271
|
+
- 89 entries in llm.txt
|
|
272
|
+
- Time: 3 minutes
|
|
273
|
+
|
|
274
|
+
**Use case:** Give your AI agent complete React knowledge for debugging components
|
|
275
|
+
|
|
276
|
+
### Example 2: Go Documentation
|
|
277
|
+
|
|
278
|
+
**Command:**
|
|
279
|
+
```bash
|
|
280
|
+
doc-fetch --url https://golang.org/doc/ \
|
|
281
|
+
--output go-docs.md \
|
|
282
|
+
--depth 4 \
|
|
283
|
+
--llm-txt
|
|
284
|
+
```
|
|
285
|
+
|
|
286
|
+
**Result:**
|
|
287
|
+
- Language spec + tutorials + API docs
|
|
288
|
+
- Single file for AI context
|
|
289
|
+
- Preserves links to pkg.go.dev
|
|
290
|
+
|
|
291
|
+
**Use case:** AI-assisted Go development with up-to-date knowledge
|
|
292
|
+
|
|
293
|
+
### Example 3: Your Own Project Docs
|
|
294
|
+
|
|
295
|
+
**Command:**
|
|
296
|
+
```bash
|
|
297
|
+
doc-fetch --url https://your-project.com/docs \
|
|
298
|
+
--output internal/project-knowledge.md \
|
|
299
|
+
--llm-txt
|
|
300
|
+
```
|
|
301
|
+
|
|
302
|
+
**Use case:**
|
|
303
|
+
- Onboard new team members faster
|
|
304
|
+
- Create AI chatbot for internal support
|
|
305
|
+
- Build searchable knowledge base
|
|
306
|
+
|
|
307
|
+
---
|
|
308
|
+
|
|
309
|
+
## Token Optimization Tips
|
|
310
|
+
|
|
311
|
+
Documentation can be large. Here's how to reduce token usage:
|
|
312
|
+
|
|
313
|
+
### Tip 1: Exclude Irrelevant Sections
|
|
314
|
+
|
|
315
|
+
Not all docs are useful for your use case:
|
|
316
|
+
|
|
317
|
+
```bash
|
|
318
|
+
# Skip changelog and community pages
|
|
319
|
+
doc-fetch --exclude "/changelog/*" --exclude "/community/*"
|
|
320
|
+
```
|
|
321
|
+
|
|
322
|
+
### Tip 2: Use Depth Limits
|
|
323
|
+
|
|
324
|
+
Don't fetch the entire internet:
|
|
325
|
+
|
|
326
|
+
```bash
|
|
327
|
+
# Only 2 levels deep (usually enough for core docs)
|
|
328
|
+
doc-fetch --depth 2
|
|
329
|
+
```
|
|
330
|
+
|
|
331
|
+
### Tip 3: Compress Descriptions in LLM.txt
|
|
332
|
+
|
|
333
|
+
Instead of:
|
|
334
|
+
```txt
|
|
335
|
+
[GUIDE] Getting Started - This comprehensive guide covers everything you need to know about installing Node.js on various operating systems including Windows, macOS, and Linux distributions...
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
Write:
|
|
339
|
+
```txt
|
|
340
|
+
[GUIDE] Getting Started - Install Node.js on Windows, macOS, Linux
|
|
341
|
+
```
|
|
342
|
+
|
|
343
|
+
### Tip 4: Split by Topic
|
|
344
|
+
|
|
345
|
+
Instead of one massive file:
|
|
346
|
+
|
|
347
|
+
```bash
|
|
348
|
+
doc-fetch --url https://docs.example.com/api --output api-docs.md
|
|
349
|
+
doc-fetch --url https://docs.example.com/guides --output guides.md
|
|
350
|
+
doc-fetch --url https://docs.example.com/tutorials --output tutorials.md
|
|
351
|
+
```
|
|
352
|
+
|
|
353
|
+
Then load only what you need per session.
|
|
354
|
+
|
|
355
|
+
---
|
|
356
|
+
|
|
357
|
+
## Common Mistakes to Avoid
|
|
358
|
+
|
|
359
|
+
### ❌ Mistake 1: Fetching Without Checking robots.txt
|
|
360
|
+
|
|
361
|
+
**Problem:** You might violate the site's crawling policy
|
|
362
|
+
|
|
363
|
+
**Solution:** Always check `https://example.com/robots.txt` first
|
|
364
|
+
|
|
365
|
+
```bash
|
|
366
|
+
curl https://example.com/robots.txt
|
|
367
|
+
```
|
|
368
|
+
|
|
369
|
+
DocFetch respects robots.txt by default.
|
|
370
|
+
|
|
371
|
+
### ❌ Mistake 2: Aggressive Concurrency
|
|
372
|
+
|
|
373
|
+
**Problem:** Overwhelming the server with 50 concurrent requests
|
|
374
|
+
|
|
375
|
+
**Solution:** Use reasonable concurrency:
|
|
376
|
+
|
|
377
|
+
```bash
|
|
378
|
+
# Good: 5-10 concurrent requests
|
|
379
|
+
doc-fetch --concurrent 8
|
|
380
|
+
|
|
381
|
+
# Bad: Don't do this
|
|
382
|
+
doc-fetch --concurrent 50 # 💀
|
|
383
|
+
```
|
|
384
|
+
|
|
385
|
+
### ❌ Mistake 3: Not Preserving Source URLs
|
|
386
|
+
|
|
387
|
+
**Problem:** Your AI can't verify information or read updates
|
|
388
|
+
|
|
389
|
+
**Solution:** Always keep original URLs in comments:
|
|
390
|
+
|
|
391
|
+
```markdown
|
|
392
|
+
## Installation
|
|
393
|
+
<!-- Source: https://docs.example.com/install -->
|
|
394
|
+
|
|
395
|
+
Run the installer...
|
|
396
|
+
```
|
|
397
|
+
|
|
398
|
+
### ❌ Mistake 4: Using Outdated Docs
|
|
399
|
+
|
|
400
|
+
**Problem:** Documentation changes. Your markdown becomes stale.
|
|
401
|
+
|
|
402
|
+
**Solution:**
|
|
403
|
+
- Add fetch date to output
|
|
404
|
+
- Re-fetch monthly (or whenever a new major version is released)
|
|
405
|
+
- Version your documentation files
|
|
406
|
+
|
|
407
|
+
```bash
|
|
408
|
+
doc-fetch --url ... --output docs-v2.4.md
|
|
409
|
+
```
|
|
410
|
+
|
|
411
|
+
---
|
|
412
|
+
|
|
413
|
+
## Measuring Success
|
|
414
|
+
|
|
415
|
+
How do you know if your documentation conversion is working?
|
|
416
|
+
|
|
417
|
+
### Metrics to Track
|
|
418
|
+
|
|
419
|
+
1. **Answer Quality Score** (1-5)
|
|
420
|
+
- Before: LLM gives generic answers
|
|
421
|
+
- After: LLM references specific API details
|
|
422
|
+
|
|
423
|
+
2. **Hallucination Rate**
|
|
424
|
+
- Count false claims per 10 queries
|
|
425
|
+
- Should drop significantly with complete context
|
|
426
|
+
|
|
427
|
+
3. **Token Efficiency**
|
|
428
|
+
- Tokens used per successful answer
|
|
429
|
+
- Optimize cleaning to reduce waste
|
|
430
|
+
|
|
431
|
+
4. **Response Time**
|
|
432
|
+
- Time from query to answer
|
|
433
|
+
- Smaller files = faster retrieval
|
|
434
|
+
|
|
435
|
+
### Before/After Example
|
|
436
|
+
|
|
437
|
+
**Before (URL-only):**
|
|
438
|
+
```
|
|
439
|
+
Q: "How do I add rate limiting in Fastify?"
|
|
440
|
+
A: "You can use middleware like express-rate-limit..."
|
|
441
|
+
❌ Wrong framework, hallucinated
|
|
442
|
+
```
|
|
443
|
+
|
|
444
|
+
**After (Full docs + llm.txt):**
|
|
445
|
+
```
|
|
446
|
+
Q: "How do I add rate limiting in Fastify?"
|
|
447
|
+
A: "Fastify has @fastify/rate-limit plugin. Install with npm install @fastify/rate-limit, then register:
|
|
448
|
+
|
|
449
|
+
app.register(require('@fastify/rate-limit'), {
|
|
450
|
+
max: 100,
|
|
451
|
+
timeWindow: '1 minute'
|
|
452
|
+
})"
|
|
453
|
+
|
|
454
|
+
Source: [PLUGIN] Rate Limit section in llm.txt
|
|
455
|
+
✅ Correct, specific, verifiable
|
|
456
|
+
```
|
|
457
|
+
|
|
458
|
+
---
|
|
459
|
+
|
|
460
|
+
## Next Steps
|
|
461
|
+
|
|
462
|
+
You now have everything needed to convert documentation into AI-ready markdown.
|
|
463
|
+
|
|
464
|
+
### Quick Start Checklist
|
|
465
|
+
|
|
466
|
+
- [ ] Install DocFetch: `npm install -g doc-fetch-cli`
|
|
467
|
+
- [ ] Test with small docs site (depth 2)
|
|
468
|
+
- [ ] Review output quality
|
|
469
|
+
- [ ] Adjust cleaning/exclusion rules
|
|
470
|
+
- [ ] Generate llm.txt index
|
|
471
|
+
- [ ] Test with your LLM of choice
|
|
472
|
+
- [ ] Iterate based on answer quality
|
|
473
|
+
|
|
474
|
+
### Advanced Projects
|
|
475
|
+
|
|
476
|
+
Once you're comfortable:
|
|
477
|
+
1. **Build automated refresh pipeline** (cron job, weekly re-fetch)
|
|
478
|
+
2. **Create multi-project knowledge base** (combine multiple docs)
|
|
479
|
+
3. **Implement semantic search** (embeddings + vector DB)
|
|
480
|
+
4. **Build AI chatbot** on top of your documentation
|
|
481
|
+
|
|
482
|
+
### Join the Community
|
|
483
|
+
|
|
484
|
+
- Share your use cases on [GitHub](https://github.com/AlphaTechini/doc-fetch)
|
|
485
|
+
- Report bugs or request features
|
|
486
|
+
- Contribute improvements
|
|
487
|
+
|
|
488
|
+
---
|
|
489
|
+
|
|
490
|
+
## TL;DR
|
|
491
|
+
|
|
492
|
+
1. **Problem:** LLMs can't browse documentation websites
|
|
493
|
+
2. **Solution:** Convert entire docs to single-file markdown
|
|
494
|
+
3. **Best tool:** DocFetch (automated, generates llm.txt index)
|
|
495
|
+
4. **Command:** `doc-fetch --url https://docs.site --llm-txt`
|
|
496
|
+
5. **Result:** AI agents with complete, navigable documentation knowledge
|
|
497
|
+
|
|
498
|
+
Stop copying documentation manually. Start building AI agents with full context.
|
|
499
|
+
|
|
500
|
+
**Try it now:**
|
|
501
|
+
```bash
|
|
502
|
+
npm install -g doc-fetch-cli
|
|
503
|
+
doc-fetch --url https://golang.org/doc/ --output go-docs.md --llm-txt
|
|
504
|
+
```
|
|
505
|
+
|
|
506
|
+
Your AI agents will thank you. 🚀
|