doc-fetch-cli 1.0.2 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CONTRIBUTING.md +274 -0
- package/bin/postinstall.js +83 -0
- package/cmd/docfetch/main.go +2 -1
- package/dist/{doc_fetch-1.0.1-py3-none-any.whl → doc_fetch-1.1.1-py3-none-any.whl} +0 -0
- package/dist/doc_fetch-1.1.1.tar.gz +0 -0
- package/doc-fetch_darwin_amd64 +0 -0
- package/doc-fetch_linux_amd64 +0 -0
- package/doc-fetch_windows_amd64.exe +0 -0
- package/doc_fetch.egg-info/PKG-INFO +1 -1
- package/doc_fetch.egg-info/SOURCES.txt +17 -0
- package/package.json +2 -2
- package/pkg/fetcher/extract_nav.go +163 -0
- package/pkg/fetcher/fetcher.go +103 -20
- package/pkg/fetcher/fetcher_optimized.go +318 -0
- package/pyproject.toml +1 -1
- package/setup.py +1 -1
- package/dist/doc_fetch-1.0.1.tar.gz +0 -0
package/CONTRIBUTING.md
ADDED
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
# Contributing to DocFetch
|
|
2
|
+
|
|
3
|
+
Thank you for your interest in contributing to DocFetch! 🎉 This guide will help you get started.
|
|
4
|
+
|
|
5
|
+
## 📋 Quick Overview
|
|
6
|
+
|
|
7
|
+
1. **Create an issue first** - Always start by opening an issue to discuss your change
|
|
8
|
+
2. **Wait for feedback** - Maintainers will respond and provide guidance
|
|
9
|
+
3. **Fork and develop** - Once approved, fork the repo and make your changes
|
|
10
|
+
4. **Submit a PR** - Open a pull request referencing the issue
|
|
11
|
+
5. **Review process** - Maintainers will review and provide feedback
|
|
12
|
+
6. **Merge** - Once approved, your contribution will be merged!
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
## 🐛 Before You Start: Create an Issue
|
|
17
|
+
|
|
18
|
+
**⚠️ IMPORTANT: Always create an issue before submitting a PR!**
|
|
19
|
+
|
|
20
|
+
This helps us:
|
|
21
|
+
- Avoid duplicate work
|
|
22
|
+
- Discuss the best approach
|
|
23
|
+
- Ensure your contribution aligns with project goals
|
|
24
|
+
- Get early feedback from maintainers
|
|
25
|
+
|
|
26
|
+
### Types of Issues
|
|
27
|
+
|
|
28
|
+
#### 🐞 Bug Reports
|
|
29
|
+
Include:
|
|
30
|
+
- Clear description of the bug
|
|
31
|
+
- Steps to reproduce
|
|
32
|
+
- Expected vs actual behavior
|
|
33
|
+
- Environment details (OS, Go version, DocFetch version)
|
|
34
|
+
- Sample command that triggers the bug
|
|
35
|
+
- Error messages or logs
|
|
36
|
+
|
|
37
|
+
#### ✨ Feature Requests
|
|
38
|
+
Include:
|
|
39
|
+
- Clear description of the feature
|
|
40
|
+
- Use case / problem it solves
|
|
41
|
+
- Example usage
|
|
42
|
+
- Any relevant links or references
|
|
43
|
+
|
|
44
|
+
#### 📝 Documentation Improvements
|
|
45
|
+
Include:
|
|
46
|
+
- What needs improvement
|
|
47
|
+
- Why it's needed
|
|
48
|
+
- Suggested changes
|
|
49
|
+
|
|
50
|
+
#### ⚡ Performance Improvements
|
|
51
|
+
Include:
|
|
52
|
+
- Current performance metrics
|
|
53
|
+
- Proposed improvements
|
|
54
|
+
- Benchmark results (if available)
|
|
55
|
+
|
|
56
|
+
---
|
|
57
|
+
|
|
58
|
+
## 🚀 Development Setup
|
|
59
|
+
|
|
60
|
+
### Prerequisites
|
|
61
|
+
|
|
62
|
+
- Go 1.21 or later
|
|
63
|
+
- Git
|
|
64
|
+
- Make (optional, for running tests)
|
|
65
|
+
|
|
66
|
+
### Fork and Clone
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
# Fork the repository on GitHub, then:
|
|
70
|
+
git clone https://github.com/YOUR_USERNAME/doc-fetch.git
|
|
71
|
+
cd doc-fetch
|
|
72
|
+
|
|
73
|
+
# Add upstream remote
|
|
74
|
+
git remote add upstream https://github.com/AlphaTechini/doc-fetch.git
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Build from Source
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
# Build the binary
|
|
81
|
+
go build -o doc-fetch ./cmd/docfetch
|
|
82
|
+
|
|
83
|
+
# Test it works
|
|
84
|
+
./doc-fetch --help
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### Run Tests
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
# Run all tests
|
|
91
|
+
go test ./...
|
|
92
|
+
|
|
93
|
+
# Run tests with coverage
|
|
94
|
+
go test -cover ./...
|
|
95
|
+
|
|
96
|
+
# Run specific package tests
|
|
97
|
+
go test ./pkg/fetcher/...
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## 💻 Making Changes
|
|
103
|
+
|
|
104
|
+
### Branch Naming
|
|
105
|
+
|
|
106
|
+
Use descriptive branch names:
|
|
107
|
+
- `fix/content-extraction-bug`
|
|
108
|
+
- `feat/add-pdf-support`
|
|
109
|
+
- `docs/update-readme-examples`
|
|
110
|
+
- `perf/improve-concurrent-fetching`
|
|
111
|
+
|
|
112
|
+
### Code Style
|
|
113
|
+
|
|
114
|
+
Follow Go best practices:
|
|
115
|
+
- Run `go fmt` before committing
|
|
116
|
+
- Run `go vet` to catch issues
|
|
117
|
+
- Write clear, concise comments
|
|
118
|
+
- Keep functions small and focused
|
|
119
|
+
- Use meaningful variable names
|
|
120
|
+
|
|
121
|
+
### Testing Requirements
|
|
122
|
+
|
|
123
|
+
- Add tests for new features
|
|
124
|
+
- Ensure existing tests pass
|
|
125
|
+
- Include edge cases
|
|
126
|
+
- Test with real documentation sites
|
|
127
|
+
|
|
128
|
+
Example test:
|
|
129
|
+
```go
|
|
130
|
+
func TestContentExtraction(t *testing.T) {
|
|
131
|
+
doc := createTestDocument()
|
|
132
|
+
content := cleanContent(doc)
|
|
133
|
+
|
|
134
|
+
if len(content) == 0 {
|
|
135
|
+
t.Error("Expected content to be extracted")
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
if !strings.Contains(content, "expected text") {
|
|
139
|
+
t.Error("Expected content to contain specific text")
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
---
|
|
145
|
+
|
|
146
|
+
## 📤 Submitting a Pull Request
|
|
147
|
+
|
|
148
|
+
### PR Checklist
|
|
149
|
+
|
|
150
|
+
Before submitting your PR, ensure:
|
|
151
|
+
|
|
152
|
+
- [ ] You created an issue first and referenced it in the PR
|
|
153
|
+
- [ ] Your code follows Go style guidelines
|
|
154
|
+
- [ ] All tests pass (`go test ./...`)
|
|
155
|
+
- [ ] You've added tests for new functionality
|
|
156
|
+
- [ ] You've updated documentation if needed
|
|
157
|
+
- [ ] Your commit messages are clear and descriptive
|
|
158
|
+
- [ ] You've rebased on the latest main branch
|
|
159
|
+
|
|
160
|
+
### PR Template
|
|
161
|
+
|
|
162
|
+
When creating your PR, include:
|
|
163
|
+
|
|
164
|
+
```markdown
|
|
165
|
+
## Description
|
|
166
|
+
Brief description of changes
|
|
167
|
+
|
|
168
|
+
## Related Issue
|
|
169
|
+
Fixes #123 (or "Related to #123")
|
|
170
|
+
|
|
171
|
+
## Type of Change
|
|
172
|
+
- [ ] Bug fix
|
|
173
|
+
- [ ] New feature
|
|
174
|
+
- [ ] Breaking change
|
|
175
|
+
- [ ] Documentation update
|
|
176
|
+
- [ ] Performance improvement
|
|
177
|
+
- [ ] Refactoring
|
|
178
|
+
|
|
179
|
+
## Testing
|
|
180
|
+
Describe how you tested this:
|
|
181
|
+
- [ ] Unit tests added/updated
|
|
182
|
+
- [ ] Manual testing with real docs
|
|
183
|
+
- [ ] Tested on: [list platforms]
|
|
184
|
+
|
|
185
|
+
## Example Usage
|
|
186
|
+
Show example command and output if applicable
|
|
187
|
+
|
|
188
|
+
## Checklist
|
|
189
|
+
- [ ] Code follows project guidelines
|
|
190
|
+
- [ ] Self-review completed
|
|
191
|
+
- [ ] Comments added where needed
|
|
192
|
+
- [ ] Tests pass locally
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
---
|
|
196
|
+
|
|
197
|
+
## 🔍 Review Process
|
|
198
|
+
|
|
199
|
+
1. **Automated Checks**: CI runs tests and linting
|
|
200
|
+
2. **Maintainer Review**: At least one maintainer reviews
|
|
201
|
+
3. **Feedback**: You may be asked to make changes
|
|
202
|
+
4. **Approval**: Once approved, PR is merged
|
|
203
|
+
5. **Release**: Changes included in next release
|
|
204
|
+
|
|
205
|
+
Typical timeline: 3-7 days for review
|
|
206
|
+
|
|
207
|
+
---
|
|
208
|
+
|
|
209
|
+
## 📖 Contribution Ideas
|
|
210
|
+
|
|
211
|
+
Looking for ways to contribute? Here are some ideas:
|
|
212
|
+
|
|
213
|
+
### Easy Wins
|
|
214
|
+
- Fix typos in documentation
|
|
215
|
+
- Add more examples to README
|
|
216
|
+
- Improve error messages
|
|
217
|
+
- Add unit tests for existing code
|
|
218
|
+
|
|
219
|
+
### Intermediate
|
|
220
|
+
- Add support for new documentation site formats
|
|
221
|
+
- Improve content extraction selectors
|
|
222
|
+
- Add progress indicators
|
|
223
|
+
- Enhance LLM.txt generation
|
|
224
|
+
|
|
225
|
+
### Advanced
|
|
226
|
+
- Add PDF export support
|
|
227
|
+
- Implement incremental updates
|
|
228
|
+
- Add authentication support for private docs
|
|
229
|
+
- Create plugin system for custom extractors
|
|
230
|
+
|
|
231
|
+
---
|
|
232
|
+
|
|
233
|
+
## 🤝 Community Guidelines
|
|
234
|
+
|
|
235
|
+
### Be Respectful
|
|
236
|
+
- Treat everyone with respect
|
|
237
|
+
- Welcome newcomers
|
|
238
|
+
- Provide constructive feedback
|
|
239
|
+
- Assume good intentions
|
|
240
|
+
|
|
241
|
+
### Communication
|
|
242
|
+
- Use clear, concise language
|
|
243
|
+
- Explain your reasoning
|
|
244
|
+
- Ask questions if unsure
|
|
245
|
+
- Respond to feedback promptly
|
|
246
|
+
|
|
247
|
+
### Collaboration
|
|
248
|
+
- Work with maintainers, not against them
|
|
249
|
+
- Be open to suggestions
|
|
250
|
+
- Help other contributors
|
|
251
|
+
- Share knowledge
|
|
252
|
+
|
|
253
|
+
---
|
|
254
|
+
|
|
255
|
+
## 📜 License
|
|
256
|
+
|
|
257
|
+
By contributing to DocFetch, you agree that your contributions will be licensed under the MIT License.
|
|
258
|
+
|
|
259
|
+
---
|
|
260
|
+
|
|
261
|
+
## ❓ Questions?
|
|
262
|
+
|
|
263
|
+
- **General questions**: Open a discussion on GitHub
|
|
264
|
+
- **Bug reports**: Create an issue
|
|
265
|
+
- **Feature requests**: Create an issue
|
|
266
|
+
- **Quick questions**: Check existing issues/discussions first
|
|
267
|
+
|
|
268
|
+
---
|
|
269
|
+
|
|
270
|
+
## 🙏 Thank You!
|
|
271
|
+
|
|
272
|
+
Your contributions make DocFetch better for everyone. Whether it's a typo fix, a new feature, or better documentation - we appreciate your time and effort!
|
|
273
|
+
|
|
274
|
+
Happy coding! 🚀
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Post-install script for doc-fetch-cli
|
|
4
|
+
* Checks if global bin directory is in PATH and provides helpful instructions
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
const { execSync } = require('child_process');
|
|
8
|
+
const path = require('path');
|
|
9
|
+
const os = require('os');
|
|
10
|
+
const fs = require('fs');
|
|
11
|
+
|
|
12
|
+
console.log('🎉 DocFetch CLI installed successfully!\n');
|
|
13
|
+
|
|
14
|
+
// Get npm global prefix
|
|
15
|
+
let globalPrefix;
|
|
16
|
+
try {
|
|
17
|
+
globalPrefix = execSync('npm config get prefix', { encoding: 'utf8' }).trim();
|
|
18
|
+
} catch (error) {
|
|
19
|
+
console.error('⚠️ Could not determine npm global prefix');
|
|
20
|
+
globalPrefix = null;
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
if (globalPrefix) {
|
|
24
|
+
const binDir = path.join(globalPrefix, 'bin');
|
|
25
|
+
const isWindows = os.platform() === 'win32';
|
|
26
|
+
|
|
27
|
+
console.log(`📦 Installed to: ${binDir}\n`);
|
|
28
|
+
|
|
29
|
+
// Check if bin directory is in PATH
|
|
30
|
+
const pathEnv = process.env.PATH || '';
|
|
31
|
+
const pathDirs = pathEnv.split(isWindows ? ';' : ':');
|
|
32
|
+
const isInPath = pathDirs.some(dir => path.resolve(dir) === path.resolve(binDir));
|
|
33
|
+
|
|
34
|
+
if (!isInPath) {
|
|
35
|
+
console.log('⚠️ WARNING: Global bin directory is not in your PATH!\n');
|
|
36
|
+
console.log('To use doc-fetch-cli, add this directory to your PATH:\n');
|
|
37
|
+
console.log(` ${binDir}\n`);
|
|
38
|
+
|
|
39
|
+
// Provide platform-specific instructions
|
|
40
|
+
const shell = process.env.SHELL || '/bin/bash';
|
|
41
|
+
const isZsh = shell.includes('zsh');
|
|
42
|
+
const isBash = shell.includes('bash');
|
|
43
|
+
|
|
44
|
+
console.log('Quick fix:\n');
|
|
45
|
+
|
|
46
|
+
if (isWindows) {
|
|
47
|
+
console.log('1. Open System Properties → Environment Variables');
|
|
48
|
+
console.log('2. Edit PATH variable');
|
|
49
|
+
console.log('3. Add this path:');
|
|
50
|
+
console.log(` ${binDir}`);
|
|
51
|
+
console.log('4. Restart your terminal\n');
|
|
52
|
+
} else if (isZsh) {
|
|
53
|
+
console.log('Add this to your ~/.zshrc:');
|
|
54
|
+
console.log(` export PATH="${binDir}:$PATH"\n`);
|
|
55
|
+
console.log('Then run: source ~/.zshrc\n');
|
|
56
|
+
} else if (isBash) {
|
|
57
|
+
console.log('Add this to your ~/.bashrc or ~/.bash_profile:');
|
|
58
|
+
console.log(` export PATH="${binDir}:$PATH"\n`);
|
|
59
|
+
console.log('Then run: source ~/.bashrc\n');
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
console.log('Alternative: Use npx without installing globally\n');
|
|
63
|
+
console.log(' npx doc-fetch-cli --url https://docs.example.com --output docs.md\n');
|
|
64
|
+
} else {
|
|
65
|
+
console.log('✅ Global bin directory is in your PATH\n');
|
|
66
|
+
console.log('You can now use doc-fetch-cli!\n');
|
|
67
|
+
console.log('Example usage:');
|
|
68
|
+
console.log(' doc-fetch --url https://docs.python.org/3 --output docs.md --llm-txt\n');
|
|
69
|
+
|
|
70
|
+
// Test if the command works
|
|
71
|
+
try {
|
|
72
|
+
execSync('doc-fetch --version', { encoding: 'utf8', stdio: 'pipe' });
|
|
73
|
+
console.log('✅ Command verified working!\n');
|
|
74
|
+
} catch (error) {
|
|
75
|
+
console.log('⚠️ Command not found in current shell session.\n');
|
|
76
|
+
console.log('Try running: hash -r (to clear command cache)\n');
|
|
77
|
+
console.log('Or restart your terminal.\n');
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
console.log('📚 Documentation: https://github.com/AlphaTechini/doc-fetch\n');
|
|
83
|
+
console.log('✨ Pro tip: Use --llm-txt flag to generate AI-friendly index files!\n');
|
package/cmd/docfetch/main.go
CHANGED
|
@@ -36,7 +36,8 @@ func main() {
|
|
|
36
36
|
log.Fatalf("Configuration error: %v", err)
|
|
37
37
|
}
|
|
38
38
|
|
|
39
|
-
|
|
39
|
+
// Use optimized high-performance fetcher
|
|
40
|
+
err := fetcher.RunOptimized(config)
|
|
40
41
|
if err != nil {
|
|
41
42
|
log.Fatalf("Failed to fetch documentation: %v", err)
|
|
42
43
|
}
|
|
Binary file
|
|
Binary file
|
package/doc-fetch_darwin_amd64
CHANGED
|
Binary file
|
package/doc-fetch_linux_amd64
CHANGED
|
Binary file
|
|
Binary file
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: doc-fetch
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.1.1
|
|
4
4
|
Summary: Dynamic documentation fetching CLI that converts entire documentation sites to single markdown files for AI/LLM consumption
|
|
5
5
|
Home-page: https://github.com/AlphaTechini/doc-fetch
|
|
6
6
|
Author: AlphaTechini
|
|
@@ -1,7 +1,18 @@
|
|
|
1
|
+
CONTRIBUTING.md
|
|
1
2
|
README.md
|
|
3
|
+
SECURITY.md
|
|
4
|
+
doc-fetch
|
|
5
|
+
doc-fetch_darwin_amd64
|
|
6
|
+
doc-fetch_linux_amd64
|
|
7
|
+
doc-fetch_windows_amd64.exe
|
|
2
8
|
go.mod
|
|
9
|
+
go.sum
|
|
10
|
+
package.json
|
|
3
11
|
pyproject.toml
|
|
4
12
|
setup.py
|
|
13
|
+
bin/doc-fetch.js
|
|
14
|
+
bin/install.js
|
|
15
|
+
bin/postinstall.js
|
|
5
16
|
cmd/docfetch/main.go
|
|
6
17
|
doc_fetch/__init__.py
|
|
7
18
|
doc_fetch/__main__.py
|
|
@@ -14,6 +25,12 @@ doc_fetch.egg-info/not-zip-safe
|
|
|
14
25
|
doc_fetch.egg-info/top_level.txt
|
|
15
26
|
docs/usage.md
|
|
16
27
|
examples/golang-example.sh
|
|
28
|
+
pkg/fetcher/classifier.go
|
|
29
|
+
pkg/fetcher/describer.go
|
|
30
|
+
pkg/fetcher/extract_nav.go
|
|
17
31
|
pkg/fetcher/fetcher.go
|
|
32
|
+
pkg/fetcher/fetcher_optimized.go
|
|
18
33
|
pkg/fetcher/html2md.go
|
|
34
|
+
pkg/fetcher/llmtxt.go
|
|
35
|
+
pkg/fetcher/validator.go
|
|
19
36
|
pkg/fetcher/writer.go
|
package/package.json
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "doc-fetch-cli",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.1.1",
|
|
4
4
|
"description": "Dynamic documentation fetching CLI that converts entire documentation sites to single markdown files for AI/LLM consumption",
|
|
5
5
|
"bin": {
|
|
6
6
|
"doc-fetch": "./bin/doc-fetch.js"
|
|
7
7
|
},
|
|
8
8
|
"scripts": {
|
|
9
|
-
"postinstall": "node ./bin/
|
|
9
|
+
"postinstall": "node ./bin/postinstall.js"
|
|
10
10
|
},
|
|
11
11
|
"repository": {
|
|
12
12
|
"type": "git",
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
package fetcher
|
|
2
|
+
|
|
3
|
+
import (
|
|
4
|
+
"fmt"
|
|
5
|
+
"strings"
|
|
6
|
+
|
|
7
|
+
"github.com/PuerkitoBio/goquery"
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
// ExtractNavigationStructure extracts nav elements with h2/h3, ul/li, and hrefs
|
|
11
|
+
func ExtractNavigationStructure(doc *goquery.Document) string {
|
|
12
|
+
var result strings.Builder
|
|
13
|
+
|
|
14
|
+
result.WriteString("# Navigation Structure\n\n")
|
|
15
|
+
|
|
16
|
+
// Find all nav elements
|
|
17
|
+
doc.Find("nav").Each(func(i int, nav *goquery.Selection) {
|
|
18
|
+
result.WriteString(fmt.Sprintf("## Navigation Block %d\n\n", i+1))
|
|
19
|
+
|
|
20
|
+
// Look for headings in nav
|
|
21
|
+
nav.Find("h1, h2, h3, h4, h5, h6").Each(func(j int, h *goquery.Selection) {
|
|
22
|
+
tagName := h.Get(0).Data
|
|
23
|
+
text := strings.TrimSpace(h.Text())
|
|
24
|
+
if text != "" {
|
|
25
|
+
result.WriteString(fmt.Sprintf("### %s: %s\n\n", tagName, text))
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
// Find ul under this heading
|
|
29
|
+
h.NextFiltered("ul").Each(func(k int, ul *goquery.Selection) {
|
|
30
|
+
result.WriteString(extractListWithLinks(ul, 1))
|
|
31
|
+
})
|
|
32
|
+
})
|
|
33
|
+
|
|
34
|
+
// Also find ul directly in nav
|
|
35
|
+
nav.ChildrenFiltered("ul").Each(func(k int, ul *goquery.Selection) {
|
|
36
|
+
result.WriteString(extractListWithLinks(ul, 1))
|
|
37
|
+
})
|
|
38
|
+
|
|
39
|
+
result.WriteString("---\n\n")
|
|
40
|
+
})
|
|
41
|
+
|
|
42
|
+
// Also look for elements with navigation-related classes/ids
|
|
43
|
+
navSelectors := []string{
|
|
44
|
+
"[class*='nav']",
|
|
45
|
+
"[id*='nav']",
|
|
46
|
+
"[class*='menu']",
|
|
47
|
+
"[id*='menu']",
|
|
48
|
+
"[role='navigation']",
|
|
49
|
+
".toc",
|
|
50
|
+
"#toc",
|
|
51
|
+
"[class*='toc']",
|
|
52
|
+
"[id*='toc']",
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
for _, selector := range navSelectors {
|
|
56
|
+
doc.Find(selector).Each(func(i int, s *goquery.Selection) {
|
|
57
|
+
// Skip if already processed as nav element
|
|
58
|
+
if s.Parent().Is("nav") {
|
|
59
|
+
return
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
result.WriteString(fmt.Sprintf("## Navigation Element (matched: %s)\n\n", selector))
|
|
63
|
+
|
|
64
|
+
// Extract headings
|
|
65
|
+
s.Find("h1, h2, h3, h4, h5, h6").Each(func(j int, h *goquery.Selection) {
|
|
66
|
+
tagName := h.Get(0).Data
|
|
67
|
+
text := strings.TrimSpace(h.Text())
|
|
68
|
+
if text != "" {
|
|
69
|
+
result.WriteString(fmt.Sprintf("### %s: %s\n\n", tagName, text))
|
|
70
|
+
}
|
|
71
|
+
})
|
|
72
|
+
|
|
73
|
+
// Extract lists with links
|
|
74
|
+
s.Find("ul, ol").Each(func(k int, list *goquery.Selection) {
|
|
75
|
+
result.WriteString(extractListWithLinks(list, 1))
|
|
76
|
+
})
|
|
77
|
+
|
|
78
|
+
result.WriteString("---\n\n")
|
|
79
|
+
})
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
return result.String()
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
// extractListWithLinks extracts list items with their href attributes
|
|
86
|
+
func extractListWithLinks(list *goquery.Selection, indentLevel int) string {
|
|
87
|
+
var result strings.Builder
|
|
88
|
+
|
|
89
|
+
indent := strings.Repeat(" ", indentLevel)
|
|
90
|
+
|
|
91
|
+
list.Find("> li").Each(func(i int, li *goquery.Selection) {
|
|
92
|
+
// Get the text
|
|
93
|
+
text := strings.TrimSpace(li.Text())
|
|
94
|
+
|
|
95
|
+
// Find any links in this li
|
|
96
|
+
li.Find("a[href]").Each(func(j int, a *goquery.Selection) {
|
|
97
|
+
href, exists := a.Attr("href")
|
|
98
|
+
linkText := strings.TrimSpace(a.Text())
|
|
99
|
+
if exists && href != "" {
|
|
100
|
+
result.WriteString(fmt.Sprintf("%s- [%s](%s)\n", indent, linkText, href))
|
|
101
|
+
}
|
|
102
|
+
})
|
|
103
|
+
|
|
104
|
+
// If no links found, just add the text
|
|
105
|
+
if li.Find("a[href]").Length() == 0 && text != "" {
|
|
106
|
+
result.WriteString(fmt.Sprintf("%s- %s\n", indent, text))
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
// Recursively process nested lists
|
|
110
|
+
li.ChildrenFiltered("ul, ol").Each(func(k int, nested *goquery.Selection) {
|
|
111
|
+
result.WriteString(extractListWithLinks(nested, indentLevel+1))
|
|
112
|
+
})
|
|
113
|
+
})
|
|
114
|
+
|
|
115
|
+
return result.String()
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
// ExtractAllLinks extracts all links from the page with context
|
|
119
|
+
func ExtractAllLinks(doc *goquery.Document, baseURL string) string {
|
|
120
|
+
var result strings.Builder
|
|
121
|
+
|
|
122
|
+
result.WriteString("# All Links Found\n\n")
|
|
123
|
+
|
|
124
|
+
linksFound := 0
|
|
125
|
+
|
|
126
|
+
// Group links by section
|
|
127
|
+
doc.Find("section, article, div[class*='content'], div[id*='content']").Each(func(i int, section *goquery.Selection) {
|
|
128
|
+
sectionLinks := 0
|
|
129
|
+
var sectionResult strings.Builder
|
|
130
|
+
|
|
131
|
+
// Get section title
|
|
132
|
+
title := ""
|
|
133
|
+
section.Find("h1, h2, h3").First().Each(func(j int, h *goquery.Selection) {
|
|
134
|
+
title = strings.TrimSpace(h.Text())
|
|
135
|
+
})
|
|
136
|
+
|
|
137
|
+
if title == "" {
|
|
138
|
+
title = fmt.Sprintf("Section %d", i+1)
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
sectionResult.WriteString(fmt.Sprintf("## %s\n\n", title))
|
|
142
|
+
|
|
143
|
+
// Find all links in this section
|
|
144
|
+
section.Find("a[href]").Each(func(j int, a *goquery.Selection) {
|
|
145
|
+
href, exists := a.Attr("href")
|
|
146
|
+
text := strings.TrimSpace(a.Text())
|
|
147
|
+
if exists && href != "" && text != "" {
|
|
148
|
+
sectionResult.WriteString(fmt.Sprintf("- [%s](%s)\n", text, href))
|
|
149
|
+
sectionLinks++
|
|
150
|
+
linksFound++
|
|
151
|
+
}
|
|
152
|
+
})
|
|
153
|
+
|
|
154
|
+
if sectionLinks > 0 {
|
|
155
|
+
result.WriteString(sectionResult.String())
|
|
156
|
+
result.WriteString("\n")
|
|
157
|
+
}
|
|
158
|
+
})
|
|
159
|
+
|
|
160
|
+
result.WriteString(fmt.Sprintf("\n**Total links found: %d**\n", linksFound))
|
|
161
|
+
|
|
162
|
+
return result.String()
|
|
163
|
+
}
|
package/pkg/fetcher/fetcher.go
CHANGED
|
@@ -195,53 +195,136 @@ func worker(config Config, pagesChan <-chan *Page, resultsChan chan<- string, mu
|
|
|
195
195
|
}
|
|
196
196
|
}
|
|
197
197
|
|
|
198
|
-
// cleanContent extracts and cleans the main documentation content
|
|
198
|
+
// cleanContent extracts and cleans the main documentation content using multiple strategies
|
|
199
199
|
func cleanContent(doc *goquery.Document) string {
|
|
200
|
-
//
|
|
201
|
-
|
|
200
|
+
// Strategy 1: Try semantic HTML5 elements (most reliable)
|
|
201
|
+
semanticSelectors := []string{
|
|
202
202
|
"main",
|
|
203
203
|
"article",
|
|
204
|
+
"[role='main']",
|
|
205
|
+
"[role='article']",
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
for _, selector := range semanticSelectors {
|
|
209
|
+
if el := doc.Find(selector); el.Length() > 0 {
|
|
210
|
+
content := extractTextContent(el)
|
|
211
|
+
if len(content) > 200 { // Minimum viable content
|
|
212
|
+
return content
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
// Strategy 2: Try common class/id patterns
|
|
218
|
+
classSelectors := []string{
|
|
204
219
|
".content",
|
|
205
|
-
".docs-content",
|
|
220
|
+
".docs-content",
|
|
206
221
|
"#main-content",
|
|
207
222
|
".documentation",
|
|
208
223
|
".post-content",
|
|
209
224
|
".markdown-body",
|
|
210
225
|
".content-wrapper",
|
|
211
226
|
".doc-content",
|
|
227
|
+
".document",
|
|
228
|
+
".entry-content",
|
|
229
|
+
".page-content",
|
|
230
|
+
".article-content",
|
|
231
|
+
"[class*='content']",
|
|
232
|
+
"[class*='docs']",
|
|
233
|
+
"[class*='document']",
|
|
234
|
+
"[id*='content']",
|
|
235
|
+
"[id*='main']",
|
|
212
236
|
}
|
|
213
237
|
|
|
214
|
-
|
|
215
|
-
for _, selector := range selectors {
|
|
238
|
+
for _, selector := range classSelectors {
|
|
216
239
|
if el := doc.Find(selector); el.Length() > 0 {
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
// Convert to HTML and then clean
|
|
221
|
-
htmlContent, err := el.Html()
|
|
222
|
-
if err != nil {
|
|
223
|
-
continue
|
|
240
|
+
content := extractTextContent(el)
|
|
241
|
+
if len(content) > 200 {
|
|
242
|
+
return content
|
|
224
243
|
}
|
|
244
|
+
}
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
// Strategy 3: Look for sections with high text density
|
|
248
|
+
var bestSection *goquery.Selection
|
|
249
|
+
maxTextLen := 0
|
|
250
|
+
|
|
251
|
+
doc.Find("section, div").Each(func(i int, s *goquery.Selection) {
|
|
252
|
+
text := strings.TrimSpace(s.Text())
|
|
253
|
+
if len(text) > maxTextLen {
|
|
254
|
+
// Check if this section has more text than child elements
|
|
255
|
+
childText := 0
|
|
256
|
+
s.Children().Each(func(j int, c *goquery.Selection) {
|
|
257
|
+
childText += len(strings.TrimSpace(c.Text()))
|
|
258
|
+
})
|
|
225
259
|
|
|
226
|
-
//
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
260
|
+
// If parent has significantly more text, it's likely the main content
|
|
261
|
+
if len(text) > childText + (childText/2) && len(text) > 500 {
|
|
262
|
+
maxTextLen = len(text)
|
|
263
|
+
bestSection = s
|
|
230
264
|
}
|
|
231
265
|
}
|
|
266
|
+
})
|
|
267
|
+
|
|
268
|
+
if bestSection != nil {
|
|
269
|
+
content := extractTextContent(bestSection)
|
|
270
|
+
if len(content) > 200 {
|
|
271
|
+
return content
|
|
272
|
+
}
|
|
232
273
|
}
|
|
233
274
|
|
|
234
|
-
//
|
|
275
|
+
// Strategy 4: Fallback to body with aggressive cleaning
|
|
235
276
|
body := doc.Find("body")
|
|
236
277
|
if body.Length() > 0 {
|
|
237
|
-
|
|
278
|
+
// Remove all non-content elements aggressively
|
|
279
|
+
body.Find("nav, header, footer, aside, script, style, form, iframe, .sidebar, .toc, .navigation, .menu, .ads, .advertisement, [class*='nav'], [class*='menu'], [class*='sidebar'], [class*='footer'], [class*='header']").Remove()
|
|
280
|
+
|
|
281
|
+
// Find the largest remaining container
|
|
282
|
+
var largest *goquery.Selection
|
|
283
|
+
largestSize := 0
|
|
284
|
+
|
|
285
|
+
body.Find("*").Each(func(i int, s *goquery.Selection) {
|
|
286
|
+
text := strings.TrimSpace(s.Text())
|
|
287
|
+
if len(text) > largestSize && s.Children().Length() < 50 {
|
|
288
|
+
largestSize = len(text)
|
|
289
|
+
largest = s
|
|
290
|
+
}
|
|
291
|
+
})
|
|
292
|
+
|
|
293
|
+
if largest != nil {
|
|
294
|
+
content := extractTextContent(largest)
|
|
295
|
+
if len(content) > 200 {
|
|
296
|
+
return content
|
|
297
|
+
}
|
|
298
|
+
}
|
|
299
|
+
|
|
300
|
+
// Last resort: entire body
|
|
238
301
|
htmlContent, _ := body.Html()
|
|
239
|
-
|
|
302
|
+
cleaned := cleanHTML(htmlContent)
|
|
303
|
+
if len(cleaned) > 200 {
|
|
304
|
+
return cleaned
|
|
305
|
+
}
|
|
240
306
|
}
|
|
241
307
|
|
|
242
308
|
return ""
|
|
243
309
|
}
|
|
244
310
|
|
|
311
|
+
// extractTextContent extracts and cleans text from a selection
|
|
312
|
+
func extractTextContent(sel *goquery.Selection) string {
|
|
313
|
+
// Clone the selection to avoid modifying original
|
|
314
|
+
clone := sel.Clone()
|
|
315
|
+
|
|
316
|
+
// Remove unwanted elements
|
|
317
|
+
clone.Find("nav, header, footer, aside, script, style, form, iframe, .sidebar, .toc, .navigation, .menu, .ads, .advertisement, button, [class*='nav'], [class*='menu'], [class*='sidebar'], [class*='footer'], [class*='header'], [class*='button'], [onclick], [role='navigation'], [role='banner'], [role='contentinfo']").Remove()
|
|
318
|
+
|
|
319
|
+
// Get HTML and convert to clean text
|
|
320
|
+
htmlContent, err := clone.Html()
|
|
321
|
+
if err != nil {
|
|
322
|
+
return ""
|
|
323
|
+
}
|
|
324
|
+
|
|
325
|
+
return cleanHTML(htmlContent)
|
|
326
|
+
}
|
|
327
|
+
|
|
245
328
|
// cleanHTML performs basic HTML cleaning
|
|
246
329
|
func cleanHTML(htmlStr string) string {
|
|
247
330
|
// Parse and extract text content while preserving structure
|
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
package fetcher
|
|
2
|
+
|
|
3
|
+
import (
|
|
4
|
+
"bufio"
|
|
5
|
+
"context"
|
|
6
|
+
"fmt"
|
|
7
|
+
"log"
|
|
8
|
+
"net"
|
|
9
|
+
"net/http"
|
|
10
|
+
"net/url"
|
|
11
|
+
"os"
|
|
12
|
+
"strings"
|
|
13
|
+
"sync"
|
|
14
|
+
"sync/atomic"
|
|
15
|
+
"time"
|
|
16
|
+
|
|
17
|
+
"github.com/PuerkitoBio/goquery"
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
// OptimizedFetcher uses advanced Go concurrency patterns for 10x speedup
|
|
21
|
+
type OptimizedFetcher struct {
|
|
22
|
+
config Config
|
|
23
|
+
httpClient *http.Client
|
|
24
|
+
urlQueue chan string
|
|
25
|
+
visited sync.Map // Concurrent map instead of mutex-protected map
|
|
26
|
+
resultsChan chan string
|
|
27
|
+
llmEntries []LLMTxtEntry
|
|
28
|
+
llmMutex sync.Mutex
|
|
29
|
+
pageCount int32
|
|
30
|
+
errorCount int32
|
|
31
|
+
ctx context.Context
|
|
32
|
+
cancel context.CancelFunc
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
// RunOptimized executes documentation fetching with maximum concurrency
|
|
36
|
+
func RunOptimized(config Config) error {
|
|
37
|
+
if err := validateConfig(&config); err != nil {
|
|
38
|
+
return fmt.Errorf("invalid configuration: %w", err)
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
log.Printf("🚀 Starting HIGH-PERFORMANCE documentation fetch from: %s", config.BaseURL)
|
|
42
|
+
log.Printf(" Workers: %d | Max Depth: %d | Concurrency: Enabled", config.Workers, config.MaxDepth)
|
|
43
|
+
|
|
44
|
+
fetcher := &OptimizedFetcher{
|
|
45
|
+
config: config,
|
|
46
|
+
urlQueue: make(chan string, config.Workers*100), // Large buffer for URLs
|
|
47
|
+
resultsChan: make(chan string, config.Workers*10), // Larger buffer
|
|
48
|
+
httpClient: createOptimizedHTTPClient(config.Workers),
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
fetcher.ctx, fetcher.cancel = context.WithTimeout(context.Background(), 10*time.Minute)
|
|
52
|
+
defer fetcher.cancel()
|
|
53
|
+
|
|
54
|
+
startTime := time.Now()
|
|
55
|
+
|
|
56
|
+
// Start result writer in background
|
|
57
|
+
var writeWg sync.WaitGroup
|
|
58
|
+
writeWg.Add(1)
|
|
59
|
+
go func() {
|
|
60
|
+
defer writeWg.Add(-1)
|
|
61
|
+
writeResultsOptimized(config.OutputPath, fetcher.resultsChan)
|
|
62
|
+
}()
|
|
63
|
+
|
|
64
|
+
// Start worker pool
|
|
65
|
+
var workerWg sync.WaitGroup
|
|
66
|
+
for i := 0; i < config.Workers; i++ {
|
|
67
|
+
workerWg.Add(1)
|
|
68
|
+
go fetcher.worker(i, &workerWg)
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
// Submit initial URL
|
|
72
|
+
fetcher.submitPage(config.BaseURL, 0)
|
|
73
|
+
|
|
74
|
+
// Close URL queue when all pages are processed
|
|
75
|
+
go func() {
|
|
76
|
+
workerWg.Wait()
|
|
77
|
+
close(fetcher.urlQueue)
|
|
78
|
+
}()
|
|
79
|
+
|
|
80
|
+
// Wait for all workers to complete
|
|
81
|
+
workerWg.Wait()
|
|
82
|
+
close(fetcher.resultsChan)
|
|
83
|
+
|
|
84
|
+
// Wait for results to be written
|
|
85
|
+
writeWg.Wait()
|
|
86
|
+
|
|
87
|
+
elapsed := time.Since(startTime)
|
|
88
|
+
pagesFetched := atomic.LoadInt32(&fetcher.pageCount)
|
|
89
|
+
errors := atomic.LoadInt32(&fetcher.errorCount)
|
|
90
|
+
|
|
91
|
+
log.Printf("✅ Fetch completed!")
|
|
92
|
+
log.Printf(" 📊 Pages fetched: %d", pagesFetched)
|
|
93
|
+
log.Printf(" ⏱️ Time elapsed: %v", elapsed)
|
|
94
|
+
log.Printf(" 📈 Speed: %.2f pages/second", float64(pagesFetched)/elapsed.Seconds())
|
|
95
|
+
log.Printf(" ❌ Errors: %d", errors)
|
|
96
|
+
|
|
97
|
+
// Generate LLM.txt if requested
|
|
98
|
+
if config.GenerateLLMTxt && len(fetcher.llmEntries) > 0 {
|
|
99
|
+
llmTxtPath := strings.TrimSuffix(config.OutputPath, ".md") + ".llm.txt"
|
|
100
|
+
if err := GenerateLLMTxt(fetcher.llmEntries, llmTxtPath); err != nil {
|
|
101
|
+
log.Printf("⚠️ Warning: Failed to generate llm.txt: %v", err)
|
|
102
|
+
} else {
|
|
103
|
+
log.Printf("📝 LLM.txt generated: %s (%d entries)", llmTxtPath, len(fetcher.llmEntries))
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
return nil
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
// createOptimizedHTTPClient creates a high-performance HTTP client with connection pooling
|
|
111
|
+
func createOptimizedHTTPClient(workers int) *http.Client {
|
|
112
|
+
return &http.Client{
|
|
113
|
+
Timeout: 30 * time.Second,
|
|
114
|
+
Transport: &http.Transport{
|
|
115
|
+
MaxIdleConns: workers * 2,
|
|
116
|
+
MaxIdleConnsPerHost: workers,
|
|
117
|
+
IdleConnTimeout: 90 * time.Second,
|
|
118
|
+
DisableCompression: false,
|
|
119
|
+
DisableKeepAlives: false,
|
|
120
|
+
DialContext: (&net.Dialer{
|
|
121
|
+
Timeout: 10 * time.Second,
|
|
122
|
+
KeepAlive: 30 * time.Second,
|
|
123
|
+
}).DialContext,
|
|
124
|
+
TLSHandshakeTimeout: 10 * time.Second,
|
|
125
|
+
},
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
// worker processes URLs from the submission queue
|
|
130
|
+
func (f *OptimizedFetcher) worker(id int, wg *sync.WaitGroup) {
|
|
131
|
+
defer wg.Done()
|
|
132
|
+
|
|
133
|
+
for url := range f.urlQueue {
|
|
134
|
+
select {
|
|
135
|
+
case <-f.ctx.Done():
|
|
136
|
+
return
|
|
137
|
+
default:
|
|
138
|
+
f.processURL(url, 0)
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
// submitPage adds a URL to be fetched (with depth tracking)
|
|
144
|
+
func (f *OptimizedFetcher) submitPage(pageURL string, depth int) {
|
|
145
|
+
if depth > f.config.MaxDepth {
|
|
146
|
+
return
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
// Check if already visited using atomic operation
|
|
150
|
+
if _, loaded := f.visited.LoadOrStore(pageURL, true); loaded {
|
|
151
|
+
return
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
select {
|
|
155
|
+
case f.urlQueue <- pageURL:
|
|
156
|
+
// Successfully queued
|
|
157
|
+
default:
|
|
158
|
+
// Queue full, skip this URL
|
|
159
|
+
log.Printf("⚠️ Queue full, skipping: %s", pageURL)
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
// processURL fetches and processes a single URL
|
|
164
|
+
func (f *OptimizedFetcher) processURL(pageURL string, depth int) {
|
|
165
|
+
atomic.AddInt32(&f.pageCount, 1)
|
|
166
|
+
|
|
167
|
+
startTime := time.Now()
|
|
168
|
+
|
|
169
|
+
// Validate URL
|
|
170
|
+
if err := isValidURL(pageURL); err != nil {
|
|
171
|
+
atomic.AddInt32(&f.errorCount, 1)
|
|
172
|
+
log.Printf("❌ Invalid URL %s: %v", pageURL, err)
|
|
173
|
+
return
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
// Fetch the page
|
|
177
|
+
resp, err := f.httpClient.Get(pageURL)
|
|
178
|
+
if err != nil {
|
|
179
|
+
atomic.AddInt32(&f.errorCount, 1)
|
|
180
|
+
log.Printf("❌ Error fetching %s: %v", pageURL, err)
|
|
181
|
+
return
|
|
182
|
+
}
|
|
183
|
+
defer resp.Body.Close()
|
|
184
|
+
|
|
185
|
+
if resp.StatusCode != 200 {
|
|
186
|
+
atomic.AddInt32(&f.errorCount, 1)
|
|
187
|
+
log.Printf("❌ Non-200 status %d for %s", resp.StatusCode, pageURL)
|
|
188
|
+
return
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
// Parse HTML concurrently
|
|
192
|
+
doc, err := goquery.NewDocumentFromReader(resp.Body)
|
|
193
|
+
if err != nil {
|
|
194
|
+
atomic.AddInt32(&f.errorCount, 1)
|
|
195
|
+
log.Printf("❌ Error parsing HTML for %s: %v", pageURL, err)
|
|
196
|
+
return
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
// Extract content
|
|
200
|
+
content := cleanContent(doc)
|
|
201
|
+
if content == "" {
|
|
202
|
+
atomic.AddInt32(&f.errorCount, 1)
|
|
203
|
+
log.Printf("⚠️ No content found for %s", pageURL)
|
|
204
|
+
return
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
// Extract title
|
|
208
|
+
title := doc.Find("title").Text()
|
|
209
|
+
if title == "" {
|
|
210
|
+
title = pageURL
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
// Send result
|
|
214
|
+
f.resultsChan <- fmt.Sprintf("## %s\n\n%s\n\n---\n\n", title, content)
|
|
215
|
+
|
|
216
|
+
// Generate LLM.txt entry if requested
|
|
217
|
+
if f.config.GenerateLLMTxt {
|
|
218
|
+
cleanTitle := CleanTitle(title)
|
|
219
|
+
entryType := ClassifyPage(pageURL, cleanTitle)
|
|
220
|
+
description := ExtractDescription(content)
|
|
221
|
+
|
|
222
|
+
entry := LLMTxtEntry{
|
|
223
|
+
Type: entryType,
|
|
224
|
+
Title: cleanTitle,
|
|
225
|
+
URL: pageURL,
|
|
226
|
+
Description: description,
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
f.llmMutex.Lock()
|
|
230
|
+
f.llmEntries = append(f.llmEntries, entry)
|
|
231
|
+
f.llmMutex.Unlock()
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
// Extract links for crawling (if depth allows)
|
|
235
|
+
if depth < f.config.MaxDepth {
|
|
236
|
+
f.extractAndSubmitLinks(doc, pageURL, depth+1)
|
|
237
|
+
}
|
|
238
|
+
|
|
239
|
+
elapsed := time.Since(startTime)
|
|
240
|
+
log.Printf("✅ Fetched %s (%.2fs)", pageURL, elapsed.Seconds())
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
// extractAndSubmitLinks finds and queues all internal links
|
|
244
|
+
func (f *OptimizedFetcher) extractAndSubmitLinks(doc *goquery.Document, baseURL string, depth int) {
|
|
245
|
+
base, err := url.Parse(baseURL)
|
|
246
|
+
if err != nil {
|
|
247
|
+
return
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
|
|
251
|
+
href, exists := s.Attr("href")
|
|
252
|
+
if !exists {
|
|
253
|
+
return
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
// Resolve relative URLs
|
|
257
|
+
resolvedURL, err := base.Parse(href)
|
|
258
|
+
if err != nil {
|
|
259
|
+
return
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
// Only follow same-domain links
|
|
263
|
+
if resolvedURL.Host != "" && resolvedURL.Host != base.Host {
|
|
264
|
+
return
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
// Skip non-HTML resources
|
|
268
|
+
if isNonHTMLResource(resolvedURL.Path) {
|
|
269
|
+
return
|
|
270
|
+
}
|
|
271
|
+
|
|
272
|
+
f.submitPage(resolvedURL.String(), depth)
|
|
273
|
+
})
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
// isNonHTMLResource reports whether path points at a static/binary asset
// rather than an HTML page, by case-insensitive extension match.
//
// The previous list covered only archives and installers; common web assets
// (images, stylesheets, scripts, fonts, media) were fetched and parsed as
// HTML. Those extensions are now filtered too.
func isNonHTMLResource(path string) bool {
	nonHTMLExtensions := []string{
		// Archives and installers (original set).
		".pdf", ".zip", ".tar", ".gz", ".exe", ".dmg", ".pkg", ".deb", ".rpm",
		// Images.
		".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico", ".webp",
		// Stylesheets, scripts, data.
		".css", ".js", ".json", ".xml",
		// Media and fonts.
		".mp3", ".mp4", ".avi", ".mov", ".woff", ".woff2", ".ttf",
	}

	pathLower := strings.ToLower(path)
	for _, ext := range nonHTMLExtensions {
		if strings.HasSuffix(pathLower, ext) {
			return true
		}
	}
	return false
}
|
|
288
|
+
|
|
289
|
+
// writeResultsOptimized writes results to file efficiently
|
|
290
|
+
func writeResultsOptimized(outputPath string, resultsChan <-chan string) error {
|
|
291
|
+
file, err := os.Create(outputPath)
|
|
292
|
+
if err != nil {
|
|
293
|
+
return err
|
|
294
|
+
}
|
|
295
|
+
defer file.Close()
|
|
296
|
+
|
|
297
|
+
writer := bufio.NewWriterSize(file, 32*1024) // 32KB buffer for better I/O
|
|
298
|
+
defer writer.Flush()
|
|
299
|
+
|
|
300
|
+
// Write header
|
|
301
|
+
header := "# Documentation\n\nThis file contains documentation fetched by DocFetch.\n\n---\n\n"
|
|
302
|
+
writer.WriteString(header)
|
|
303
|
+
|
|
304
|
+
count := 0
|
|
305
|
+
for result := range resultsChan {
|
|
306
|
+
if strings.TrimSpace(result) != "" {
|
|
307
|
+
writer.WriteString(result)
|
|
308
|
+
count++
|
|
309
|
+
|
|
310
|
+
// Flush periodically to avoid memory buildup
|
|
311
|
+
if count%10 == 0 {
|
|
312
|
+
writer.Flush()
|
|
313
|
+
}
|
|
314
|
+
}
|
|
315
|
+
}
|
|
316
|
+
|
|
317
|
+
return nil
|
|
318
|
+
}
|
package/pyproject.toml
CHANGED
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "doc-fetch"
|
|
7
|
-
version = "1.0.2"
|
|
7
|
+
version = "1.1.1"
|
|
8
8
|
description = "Dynamic documentation fetching CLI that converts entire documentation sites to single markdown files for AI/LLM consumption"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
authors = [{name = "AlphaTechini", email = "rehobothokoibu@gmail.com"}]
|
package/setup.py
CHANGED
|
@@ -118,7 +118,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
|
|
|
118
118
|
|
|
119
119
|
setup(
|
|
120
120
|
name="doc-fetch",
|
|
121
|
-
version="1.0.2",
|
|
121
|
+
version="1.1.1",
|
|
122
122
|
author="AlphaTechini",
|
|
123
123
|
author_email="rehobothokoibu@gmail.com",
|
|
124
124
|
description="Dynamic documentation fetching CLI that converts entire documentation sites to single markdown files for AI/LLM consumption",
|
|
Binary file
|