doc-fetch-cli 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +193 -0
- package/SECURITY.md +84 -0
- package/bin/doc-fetch.js +37 -0
- package/bin/install.js +171 -0
- package/cmd/docfetch/main.go +54 -0
- package/dist/doc_fetch-1.0.1-py3-none-any.whl +0 -0
- package/dist/doc_fetch-1.0.1.tar.gz +0 -0
- package/doc-fetch +0 -0
- package/doc-fetch_darwin_amd64 +0 -0
- package/doc-fetch_linux_amd64 +0 -0
- package/doc-fetch_windows_amd64.exe +0 -0
- package/doc_fetch/__init__.py +6 -0
- package/doc_fetch/__main__.py +7 -0
- package/doc_fetch/cli.py +113 -0
- package/doc_fetch.egg-info/PKG-INFO +224 -0
- package/doc_fetch.egg-info/SOURCES.txt +19 -0
- package/doc_fetch.egg-info/dependency_links.txt +1 -0
- package/doc_fetch.egg-info/entry_points.txt +2 -0
- package/doc_fetch.egg-info/not-zip-safe +1 -0
- package/doc_fetch.egg-info/top_level.txt +1 -0
- package/docs/usage.md +67 -0
- package/examples/golang-example.sh +12 -0
- package/go.mod +11 -0
- package/go.sum +38 -0
- package/package.json +18 -0
- package/pkg/fetcher/classifier.go +50 -0
- package/pkg/fetcher/describer.go +61 -0
- package/pkg/fetcher/fetcher.go +332 -0
- package/pkg/fetcher/html2md.go +71 -0
- package/pkg/fetcher/llmtxt.go +36 -0
- package/pkg/fetcher/validator.go +109 -0
- package/pkg/fetcher/writer.go +32 -0
- package/pyproject.toml +37 -0
- package/setup.py +158 -0
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: doc-fetch
|
|
3
|
+
Version: 1.0.1
|
|
4
|
+
Summary: Dynamic documentation fetching CLI that converts entire documentation sites to single markdown files for AI/LLM consumption
|
|
5
|
+
Home-page: https://github.com/AlphaTechini/doc-fetch
|
|
6
|
+
Author: AlphaTechini
|
|
7
|
+
Author-email: AlphaTechini <rehobothokoibu@gmail.com>
|
|
8
|
+
License: MIT
|
|
9
|
+
Project-URL: Homepage, https://github.com/AlphaTechini/doc-fetch
|
|
10
|
+
Project-URL: Repository, https://github.com/AlphaTechini/doc-fetch
|
|
11
|
+
Project-URL: Documentation, https://github.com/AlphaTechini/doc-fetch#readme
|
|
12
|
+
Keywords: documentation,ai,llm,markdown,crawler,security
|
|
13
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
23
|
+
Classifier: Topic :: Documentation
|
|
24
|
+
Classifier: Topic :: Software Development :: Documentation
|
|
25
|
+
Classifier: Topic :: Utilities
|
|
26
|
+
Requires-Python: >=3.7
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
Dynamic: author
|
|
29
|
+
Dynamic: home-page
|
|
30
|
+
Dynamic: requires-python
|
|
31
|
+
|
|
32
|
+
# DocFetch - Dynamic Documentation Fetcher 📚
|
|
33
|
+
|
|
34
|
+
**Transform entire documentation sites into AI-ready, single-file markdown with intelligent LLM.txt indexing**
|
|
35
|
+
|
|
36
|
+
Most AIs can't navigate documentation like humans do. They can't scroll through sections, click sidebar links, or explore related pages. **DocFetch solves this fundamental problem** by converting entire documentation sites into comprehensive, clean markdown files that contain every section and piece of information in a format that LLMs love.
|
|
37
|
+
|
|
38
|
+
## 🚀 Why DocFetch is Essential for AI Development
|
|
39
|
+
|
|
40
|
+
### 🤖 **AI/LLM Optimization**
|
|
41
|
+
- **Single-file consumption**: No more fragmented context across multiple pages
|
|
42
|
+
- **Clean, structured markdown**: Perfect token efficiency for LLM context windows
|
|
43
|
+
- **Intelligent LLM.txt generation**: AI-friendly index with semantic categorization
|
|
44
|
+
- **Noise removal**: Automatically strips navigation, headers, footers, ads, and buttons
|
|
45
|
+
|
|
46
|
+
### ⚡ **Developer Productivity**
|
|
47
|
+
- **One command automation**: Replace hours of manual copy-pasting with a single CLI command
|
|
48
|
+
- **Complete documentation access**: Give your AI agents full access to official documentation
|
|
49
|
+
- **Consistent formatting**: Uniform structure across different documentation sites
|
|
50
|
+
- **Version control friendly**: Markdown files work perfectly with Git
|
|
51
|
+
|
|
52
|
+
### 🎯 **Smart Content Intelligence**
|
|
53
|
+
- **Automatic page classification**: Identifies APIs, guides, references, and examples
|
|
54
|
+
- **Semantic descriptions**: Generates concise, relevant descriptions for each section
|
|
55
|
+
- **URL preservation**: Maintains original source links for verification
|
|
56
|
+
- **Adaptive content extraction**: Works with diverse documentation site structures
|
|
57
|
+
|
|
58
|
+
### 🔧 **Production Ready**
|
|
59
|
+
- **Concurrent fetching**: Fast downloads with configurable concurrency
|
|
60
|
+
- **Respectful crawling**: Honors robots.txt and includes rate limiting
|
|
61
|
+
- **Cross-platform**: Works on Windows, macOS, and Linux
|
|
62
|
+
- **Multiple installation options**: PyPI, NPM, Go install, or direct binary download
|
|
63
|
+
|
|
64
|
+
## 📦 Installation
|
|
65
|
+
|
|
66
|
+
### PyPI (Recommended for Python developers) ✨ NEW
|
|
67
|
+
```bash
|
|
68
|
+
pip install doc-fetch
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### NPM (Recommended for JavaScript/Node.js developers)
|
|
72
|
+
```bash
|
|
73
|
+
npm install -g doc-fetch-cli
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### Go (For Go developers)
|
|
77
|
+
```bash
|
|
78
|
+
go install github.com/AlphaTechini/doc-fetch/cmd/docfetch@latest
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### Direct Binary Download
|
|
82
|
+
Visit [Releases](https://github.com/AlphaTechini/doc-fetch/releases) and download your platform's binary.
|
|
83
|
+
|
|
84
|
+
## 🎯 Usage
|
|
85
|
+
|
|
86
|
+
### Basic Usage
|
|
87
|
+
```bash
|
|
88
|
+
# Fetch entire documentation site to single markdown file
|
|
89
|
+
doc-fetch --url https://golang.org/doc/ --output ./docs/golang-full.md
|
|
90
|
+
|
|
91
|
+
# With LLM.txt generation for AI optimization
|
|
92
|
+
doc-fetch --url https://react.dev/learn --output docs.md --llm-txt
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### Advanced Usage
|
|
96
|
+
```bash
|
|
97
|
+
# Comprehensive documentation fetch with all features
|
|
98
|
+
doc-fetch \
|
|
99
|
+
--url https://docs.example.com \
|
|
100
|
+
--output ./internal/docs.md \
|
|
101
|
+
--depth 4 \
|
|
102
|
+
--concurrent 10 \
|
|
103
|
+
--llm-txt \
|
|
104
|
+
--user-agent "MyBot/1.0"
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
### Command Options
|
|
108
|
+
| Flag | Short | Description | Default |
|
|
109
|
+
|------|-------|-------------|---------|
|
|
110
|
+
| `--url` | `-u` | Base URL to fetch documentation from | **Required** |
|
|
111
|
+
| `--output` | `-o` | Output file path | `docs.md` |
|
|
112
|
+
| `--depth` | `-d` | Maximum crawl depth | `2` |
|
|
113
|
+
| `--concurrent` | `-c` | Number of concurrent fetchers | `3` |
|
|
114
|
+
| `--llm-txt` | | Generate AI-friendly llm.txt index | `false` |
|
|
115
|
+
| `--user-agent` | | Custom user agent string | `DocFetch/1.0` |
|
|
116
|
+
|
|
117
|
+
## 📁 Output Files
|
|
118
|
+
|
|
119
|
+
When using `--llm-txt`, DocFetch generates two files:
|
|
120
|
+
|
|
121
|
+
### `docs.md` - Complete Documentation
|
|
122
|
+
```markdown
|
|
123
|
+
# Documentation
|
|
124
|
+
|
|
125
|
+
This file contains documentation fetched by DocFetch.
|
|
126
|
+
|
|
127
|
+
---
|
|
128
|
+
|
|
129
|
+
## Getting Started
|
|
130
|
+
|
|
131
|
+
This guide covers installation, setup, and first program...
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
## Language Specification
|
|
136
|
+
|
|
137
|
+
Complete Go language specification and syntax...
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
### `docs.llm.txt` - AI-Friendly Index
|
|
141
|
+
```txt
|
|
142
|
+
# llm.txt - AI-friendly documentation index
|
|
143
|
+
|
|
144
|
+
[GUIDE] Getting Started
|
|
145
|
+
https://golang.org/doc/install
|
|
146
|
+
Covers installation, setup, and first program.
|
|
147
|
+
|
|
148
|
+
[REFERENCE] Language Specification
|
|
149
|
+
https://golang.org/ref/spec
|
|
150
|
+
Complete Go language specification and syntax.
|
|
151
|
+
|
|
152
|
+
[API] net/http
|
|
153
|
+
https://pkg.go.dev/net/http
|
|
154
|
+
HTTP client/server implementation.
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
## 🌟 Real-World Examples
|
|
158
|
+
|
|
159
|
+
### Fetch Go Documentation
|
|
160
|
+
```bash
|
|
161
|
+
doc-fetch --url https://golang.org/doc/ --output ./docs/go-documentation.md --depth 4 --llm-txt
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
### Fetch React Documentation
|
|
165
|
+
```bash
|
|
166
|
+
doc-fetch --url https://react.dev/learn --output ./docs/react-learn.md --concurrent 10 --llm-txt
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
### Fetch Your Own Project Docs
|
|
170
|
+
```bash
|
|
171
|
+
doc-fetch --url https://your-project.com/docs/ --output ./internal/docs.md --llm-txt
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
## 🤖 How LLM.txt Supercharges Your AI
|
|
175
|
+
|
|
176
|
+
The generated `llm.txt` file acts as a **semantic roadmap** for your AI agents:
|
|
177
|
+
|
|
178
|
+
1. **Precise Navigation**: Agents can query specific sections without scanning entire documents
|
|
179
|
+
2. **Context Awareness**: Know whether they're looking at an API reference vs. a tutorial
|
|
180
|
+
3. **Efficient Retrieval**: Jump directly to relevant content based on query intent
|
|
181
|
+
4. **Source Verification**: Always maintain links back to original documentation
|
|
182
|
+
|
|
183
|
+
**Example AI Prompt Enhancement:**
|
|
184
|
+
```
|
|
185
|
+
Instead of: "What does the net/http package do?"
|
|
186
|
+
Your AI can now: "Check the [API] net/http section in llm.txt for HTTP client/server implementation details"
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
## 🏗️ How It Works
|
|
190
|
+
|
|
191
|
+
1. **Link Discovery**: Parses the base URL to find all internal documentation links
|
|
192
|
+
2. **Content Fetching**: Downloads all pages concurrently with respect for robots.txt
|
|
193
|
+
3. **HTML Cleaning**: Removes non-content elements (navigation, headers, footers, etc.)
|
|
194
|
+
4. **Markdown Conversion**: Converts cleaned HTML to structured markdown
|
|
195
|
+
5. **Intelligent Classification**: Categorizes pages as API, GUIDE, REFERENCE, or EXAMPLE
|
|
196
|
+
6. **Description Generation**: Creates concise, relevant descriptions for each section
|
|
197
|
+
7. **Single File Output**: Combines all documentation into one comprehensive file
|
|
198
|
+
8. **LLM.txt Generation**: Creates AI-friendly index with semantic categorization
|
|
199
|
+
|
|
200
|
+
## 🚀 Future Features
|
|
201
|
+
|
|
202
|
+
- **Incremental updates**: Only fetch changed pages on subsequent runs
|
|
203
|
+
- **Custom selectors**: Allow users to specify content areas for different sites
|
|
204
|
+
- **Multiple formats**: Support PDF, JSON, and other output formats
|
|
205
|
+
- **Token counting**: Estimate token usage for LLM context planning
|
|
206
|
+
- **Advanced classification**: Machine learning-based page type detection
|
|
207
|
+
|
|
208
|
+
## 💡 Why This Exists
|
|
209
|
+
|
|
210
|
+
Traditional documentation sites are designed for **human navigation**, not **AI consumption**. When working with LLMs, you often need to manually copy-paste multiple sections or provide incomplete context. DocFetch automates this process, giving your AI agents complete access to documentation without the manual overhead.
|
|
211
|
+
|
|
212
|
+
**Stop wasting time copying documentation. Start building AI agents with complete knowledge.**
|
|
213
|
+
|
|
214
|
+
## 🤝 Contributing
|
|
215
|
+
|
|
216
|
+
Contributions are welcome! Please open an issue or pull request on GitHub.
|
|
217
|
+
|
|
218
|
+
## 📄 License
|
|
219
|
+
|
|
220
|
+
MIT License
|
|
221
|
+
|
|
222
|
+
---
|
|
223
|
+
|
|
224
|
+
**Built with ❤️ for AI developers who deserve better documentation access**
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
go.mod
|
|
3
|
+
pyproject.toml
|
|
4
|
+
setup.py
|
|
5
|
+
cmd/docfetch/main.go
|
|
6
|
+
doc_fetch/__init__.py
|
|
7
|
+
doc_fetch/__main__.py
|
|
8
|
+
doc_fetch/cli.py
|
|
9
|
+
doc_fetch.egg-info/PKG-INFO
|
|
10
|
+
doc_fetch.egg-info/SOURCES.txt
|
|
11
|
+
doc_fetch.egg-info/dependency_links.txt
|
|
12
|
+
doc_fetch.egg-info/entry_points.txt
|
|
13
|
+
doc_fetch.egg-info/not-zip-safe
|
|
14
|
+
doc_fetch.egg-info/top_level.txt
|
|
15
|
+
docs/usage.md
|
|
16
|
+
examples/golang-example.sh
|
|
17
|
+
pkg/fetcher/fetcher.go
|
|
18
|
+
pkg/fetcher/html2md.go
|
|
19
|
+
pkg/fetcher/writer.go
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
doc_fetch
|
package/docs/usage.md
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# DocFetch Usage Guide
|
|
2
|
+
|
|
3
|
+
## Installation
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
# Clone the repository
|
|
7
|
+
git clone https://github.com/AlphaTechini/doc-fetch.git
|
|
8
|
+
cd doc-fetch
|
|
9
|
+
|
|
10
|
+
# Build the binary
|
|
11
|
+
go build -o doc-fetch ./cmd/docfetch
|
|
12
|
+
|
|
13
|
+
# Or install directly
|
|
14
|
+
go install github.com/AlphaTechini/doc-fetch/cmd/docfetch@latest
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Basic Usage
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
# Fetch documentation from a URL to a markdown file
|
|
21
|
+
doc-fetch --url https://golang.org/doc/ --output docs.md
|
|
22
|
+
|
|
23
|
+
# Specify custom output path
|
|
24
|
+
doc-fetch --url https://docs.example.com --output ./documentation/full-docs.md
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Advanced Options
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
# Control crawl depth (default: 2)
|
|
31
|
+
doc-fetch --url https://docs.example.com --output docs.md --depth 3
|
|
32
|
+
|
|
33
|
+
# Set concurrent workers (default: 3)
|
|
34
|
+
doc-fetch --url https://docs.example.com --output docs.md --concurrent 5
|
|
35
|
+
|
|
36
|
+
# Custom user agent
|
|
37
|
+
doc-fetch --url https://docs.example.com --output docs.md --user-agent "MyBot/1.0"
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Supported Documentation Sites
|
|
41
|
+
|
|
42
|
+
DocFetch works best with sites that have:
|
|
43
|
+
- Clear content structure
|
|
44
|
+
- Standard HTML markup
|
|
45
|
+
- Proper semantic HTML elements
|
|
46
|
+
|
|
47
|
+
Common selectors used for content extraction:
|
|
48
|
+
- `<main>`
|
|
49
|
+
- `<article>`
|
|
50
|
+
- `.content`, `.docs-content`
|
|
51
|
+
- `#main-content`
|
|
52
|
+
- `.documentation`
|
|
53
|
+
|
|
54
|
+
## Output Format
|
|
55
|
+
|
|
56
|
+
The output is clean markdown that includes:
|
|
57
|
+
- Page titles as H2 headings
|
|
58
|
+
- Cleaned content with formatting preserved
|
|
59
|
+
- Separation between different pages with `---`
|
|
60
|
+
|
|
61
|
+
## Future Features
|
|
62
|
+
|
|
63
|
+
- [ ] Recursive link crawling
|
|
64
|
+
- [ ] LLM.txt generation
|
|
65
|
+
- [ ] PDF and other format support
|
|
66
|
+
- [ ] Incremental updates
|
|
67
|
+
- [ ] Custom CSS selectors per site
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
|
|
3
|
+
# Example usage for Go documentation
|
|
4
|
+
echo "Fetching Go documentation..."
|
|
5
|
+
|
|
6
|
+
# Basic usage
|
|
7
|
+
doc-fetch --url https://golang.org/doc/ --output ./docs/golang-full.md
|
|
8
|
+
|
|
9
|
+
# With custom settings
|
|
10
|
+
doc-fetch --url https://pkg.go.dev/std --output ./docs/go-stdlib.md --depth 3 --concurrent 5
|
|
11
|
+
|
|
12
|
+
echo "Documentation saved to docs/ directory"
|
package/go.mod
ADDED
package/go.sum
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM=
|
|
2
|
+
github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ=
|
|
3
|
+
github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
|
|
4
|
+
github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
|
|
5
|
+
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
|
|
6
|
+
github.com/yuin/goldmark v1.6.0 h1:boZcn2GTjpsynOsC0iJHnBWa4Bi0qzfJjthwauItG68=
|
|
7
|
+
github.com/yuin/goldmark v1.6.0/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
|
|
8
|
+
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
|
9
|
+
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
|
|
10
|
+
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
|
|
11
|
+
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
|
12
|
+
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
|
|
13
|
+
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
|
|
14
|
+
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
|
|
15
|
+
golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
|
|
16
|
+
golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
|
|
17
|
+
golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
|
|
18
|
+
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
|
19
|
+
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
|
20
|
+
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
|
21
|
+
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
|
22
|
+
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
|
23
|
+
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
|
24
|
+
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
|
25
|
+
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
|
26
|
+
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
|
27
|
+
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
|
|
28
|
+
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
|
|
29
|
+
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
|
|
30
|
+
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
|
31
|
+
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
|
32
|
+
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
|
33
|
+
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
|
|
34
|
+
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
|
|
35
|
+
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
|
36
|
+
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
|
37
|
+
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
|
|
38
|
+
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
package/package.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "doc-fetch-cli",
|
|
3
|
+
"version": "1.0.2",
|
|
4
|
+
"description": "Dynamic documentation fetching CLI that converts entire documentation sites to single markdown files for AI/LLM consumption",
|
|
5
|
+
"bin": {
|
|
6
|
+
"doc-fetch": "./bin/doc-fetch.js"
|
|
7
|
+
},
|
|
8
|
+
"scripts": {
|
|
9
|
+
"postinstall": "node ./bin/install.js"
|
|
10
|
+
},
|
|
11
|
+
"repository": {
|
|
12
|
+
"type": "git",
|
|
13
|
+
"url": "https://github.com/AlphaTechini/doc-fetch.git"
|
|
14
|
+
},
|
|
15
|
+
"keywords": ["documentation", "ai", "llm", "markdown", "crawler", "security"],
|
|
16
|
+
"author": "AlphaTechini",
|
|
17
|
+
"license": "MIT"
|
|
18
|
+
}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
package fetcher
|
|
2
|
+
|
|
3
|
+
import (
|
|
4
|
+
"strings"
|
|
5
|
+
)
|
|
6
|
+
|
|
7
|
+
// ClassifyPage labels a documentation page as "API", "GUIDE", "REFERENCE",
// "EXAMPLE" or "SECTION" by looking for well-known substrings in its URL and
// title. Matching is case-insensitive. Categories are tested in the order
// listed and the first hit wins; "SECTION" is returned when nothing matches.
func ClassifyPage(url, title string) string {
	u := strings.ToLower(url)
	t := strings.ToLower(title)

	// hasAny reports whether s contains at least one of the given markers.
	hasAny := func(s string, markers ...string) bool {
		for _, m := range markers {
			if strings.Contains(s, m) {
				return true
			}
		}
		return false
	}

	switch {
	case hasAny(u, "/api/", "/pkg/", "/reference/pkg/"),
		hasAny(t, "api", "package"):
		return "API"

	case hasAny(u, "/guide/", "/tutorial/", "/learn/", "/docs/guides/"),
		hasAny(t, "guide", "tutorial", "getting started"):
		return "GUIDE"

	case hasAny(u, "/ref/", "/reference/", "/spec/"),
		hasAny(t, "reference", "specification"):
		return "REFERENCE"

	case hasAny(u, "/example/", "/examples/"),
		hasAny(t, "example"):
		return "EXAMPLE"
	}

	// Nothing recognisable: treat the page as a generic section.
	return "SECTION"
}
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
package fetcher
|
|
2
|
+
|
|
3
|
+
import (
|
|
4
|
+
"strings"
|
|
5
|
+
"regexp"
|
|
6
|
+
)
|
|
7
|
+
|
|
8
|
+
// maxDescriptionLen is the upper bound, in bytes, on a generated description.
const maxDescriptionLen = 200

// descWhitespaceRun matches runs of whitespace. It is compiled once at package
// scope instead of on every ExtractDescription call.
var descWhitespaceRun = regexp.MustCompile(`\s+`)

// ExtractDescription builds a short description from page content.
//
// Whitespace is trimmed and collapsed to single spaces, then the text is split
// on ". ". The first two sentences are returned when together they fit in
// maxDescriptionLen bytes; otherwise only the first sentence is returned,
// truncated with a trailing "..." if it is itself too long. Empty or
// whitespace-only content yields the generic "Documentation page content."
// fallback. The result never exceeds maxDescriptionLen bytes.
func ExtractDescription(content string) string {
	content = descWhitespaceRun.ReplaceAllString(strings.TrimSpace(content), " ")
	if content == "" {
		// strings.Split never returns an empty slice, so the fallback has to
		// be selected explicitly here.
		return "Documentation page content."
	}

	sentences := strings.Split(content, ". ")
	first := terminateSentence(sentences[0])

	if len(sentences) >= 2 && sentences[1] != "" {
		both := first + " " + terminateSentence(sentences[1])
		if len(both) <= maxDescriptionLen {
			return both
		}
	}

	// Either there is only one sentence or two do not fit; the first sentence
	// alone may still be too long, so it always goes through truncation.
	return truncateDescription(first)
}

// terminateSentence appends a period unless s already ends in sentence
// punctuation, avoiding results such as "Hello world..".
func terminateSentence(s string) string {
	if strings.HasSuffix(s, ".") || strings.HasSuffix(s, "!") || strings.HasSuffix(s, "?") {
		return s
	}
	return s + "."
}

// truncateDescription shortens s to at most maxDescriptionLen bytes, ending it
// with "..." and cutting only on a rune boundary so multi-byte UTF-8
// characters are never split.
func truncateDescription(s string) string {
	if len(s) <= maxDescriptionLen {
		return s
	}

	const ellipsis = "..."
	cut := 0
	// Ranging over a string yields the byte offset of each rune start, so cut
	// ends up as the last rune boundary that still leaves room for ellipsis.
	for i := range s {
		if i > maxDescriptionLen-len(ellipsis) {
			break
		}
		cut = i
	}
	return s[:cut] + ellipsis
}
|
|
38
|
+
|
|
39
|
+
// CleanTitle strips common site-branding suffixes (such as " - Documentation",
// " | React" or " API Reference") from the end of a page title and returns the
// result with surrounding whitespace removed.
//
// Only trailing occurrences are removed. Removing the patterns anywhere in the
// string would corrupt titles that merely contain them, e.g. turning
// "Intro - Google Cloud" into "Introogle Cloud".
func CleanTitle(title string) string {
	suffixes := []string{
		" - Documentation",
		" | Documentation",
		" - Go",
		" | Go",
		" - React",
		" | React",
		" Documentation",
		" Docs",
		" API Reference",
	}

	title = strings.TrimSpace(title)

	// Titles can stack several suffixes ("Go Docs - Go"), so keep stripping
	// until a full pass removes nothing. Each removal shortens the title, so
	// the loop always terminates.
	for changed := true; changed; {
		changed = false
		for _, suffix := range suffixes {
			if trimmed := strings.TrimSuffix(title, suffix); trimmed != title {
				title = strings.TrimSpace(trimmed)
				changed = true
			}
		}
	}

	return title
}
|