doc-fetch-cli 1.0.2 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,274 @@
1
+ # Contributing to DocFetch
2
+
3
+ Thank you for your interest in contributing to DocFetch! 🎉 This guide will help you get started.
4
+
5
+ ## 📋 Quick Overview
6
+
7
+ 1. **Create an issue first** - Always start by opening an issue to discuss your change
8
+ 2. **Wait for feedback** - Maintainers will respond and provide guidance
9
+ 3. **Fork and develop** - Once approved, fork the repo and make your changes
10
+ 4. **Submit a PR** - Open a pull request referencing the issue
11
+ 5. **Review process** - Maintainers will review and provide feedback
12
+ 6. **Merge** - Once approved, your contribution will be merged!
13
+
14
+ ---
15
+
16
+ ## 🐛 Before You Start: Create an Issue
17
+
18
+ **⚠️ IMPORTANT: Always create an issue before submitting a PR!**
19
+
20
+ This helps us:
21
+ - Avoid duplicate work
22
+ - Discuss the best approach
23
+ - Ensure your contribution aligns with project goals
24
+ - Get early feedback from maintainers
25
+
26
+ ### Types of Issues
27
+
28
+ #### 🐞 Bug Reports
29
+ Include:
30
+ - Clear description of the bug
31
+ - Steps to reproduce
32
+ - Expected vs actual behavior
33
+ - Environment details (OS, Go version, DocFetch version)
34
+ - Sample command that triggers the bug
35
+ - Error messages or logs
36
+
37
+ #### ✨ Feature Requests
38
+ Include:
39
+ - Clear description of the feature
40
+ - Use case / problem it solves
41
+ - Example usage
42
+ - Any relevant links or references
43
+
44
+ #### 📝 Documentation Improvements
45
+ Include:
46
+ - What needs improvement
47
+ - Why it's needed
48
+ - Suggested changes
49
+
50
+ #### ⚡ Performance Improvements
51
+ Include:
52
+ - Current performance metrics
53
+ - Proposed improvements
54
+ - Benchmark results (if available)
55
+
56
+ ---
57
+
58
+ ## 🚀 Development Setup
59
+
60
+ ### Prerequisites
61
+
62
+ - Go 1.21 or later
63
+ - Git
64
+ - Make (optional; every command in this guide uses the Go toolchain directly)
65
+
66
+ ### Fork and Clone
67
+
68
+ ```bash
69
+ # Fork the repository on GitHub, then:
70
+ git clone https://github.com/YOUR_USERNAME/doc-fetch.git
71
+ cd doc-fetch
72
+
73
+ # Add upstream remote
74
+ git remote add upstream https://github.com/AlphaTechini/doc-fetch.git
75
+ ```
76
+
77
+ ### Build from Source
78
+
79
+ ```bash
80
+ # Build the binary
81
+ go build -o doc-fetch ./cmd/docfetch
82
+
83
+ # Test it works
84
+ ./doc-fetch --help
85
+ ```
86
+
87
+ ### Run Tests
88
+
89
+ ```bash
90
+ # Run all tests
91
+ go test ./...
92
+
93
+ # Run tests with coverage
94
+ go test -cover ./...
95
+
96
+ # Run specific package tests
97
+ go test ./pkg/fetcher/...
98
+ ```
99
+
100
+ ---
101
+
102
+ ## 💻 Making Changes
103
+
104
+ ### Branch Naming
105
+
106
+ Use descriptive branch names:
107
+ - `fix/content-extraction-bug`
108
+ - `feat/add-pdf-support`
109
+ - `docs/update-readme-examples`
110
+ - `perf/improve-concurrent-fetching`
111
+
112
+ ### Code Style
113
+
114
+ Follow Go best practices:
115
+ - Run `go fmt` before committing
116
+ - Run `go vet` to catch issues
117
+ - Write clear, concise comments
118
+ - Keep functions small and focused
119
+ - Use meaningful variable names
120
+
121
+ ### Testing Requirements
122
+
123
+ - Add tests for new features
124
+ - Ensure existing tests pass
125
+ - Include edge cases
126
+ - Test with real documentation sites
127
+
128
+ Example test:
129
+ ```go
130
+ func TestContentExtraction(t *testing.T) {
131
+ doc := createTestDocument()
132
+ content := cleanContent(doc)
133
+
134
+ if len(content) == 0 {
135
+ t.Error("Expected content to be extracted")
136
+ }
137
+
138
+ if !strings.Contains(content, "expected text") {
139
+ t.Error("Expected content to contain specific text")
140
+ }
141
+ }
142
+ ```
143
+
144
+ ---
145
+
146
+ ## 📤 Submitting a Pull Request
147
+
148
+ ### PR Checklist
149
+
150
+ Before submitting your PR, ensure:
151
+
152
+ - [ ] You created an issue first and referenced it in the PR
153
+ - [ ] Your code follows Go style guidelines
154
+ - [ ] All tests pass (`go test ./...`)
155
+ - [ ] You've added tests for new functionality
156
+ - [ ] You've updated documentation if needed
157
+ - [ ] Your commit messages are clear and descriptive
158
+ - [ ] You've rebased your branch onto the latest upstream `main` branch
159
+
160
+ ### PR Template
161
+
162
+ When creating your PR, include:
163
+
164
+ ```markdown
165
+ ## Description
166
+ Brief description of changes
167
+
168
+ ## Related Issue
169
+ Fixes #123 (or "Related to #123")
170
+
171
+ ## Type of Change
172
+ - [ ] Bug fix
173
+ - [ ] New feature
174
+ - [ ] Breaking change
175
+ - [ ] Documentation update
176
+ - [ ] Performance improvement
177
+ - [ ] Refactoring
178
+
179
+ ## Testing
180
+ Describe how you tested this:
181
+ - [ ] Unit tests added/updated
182
+ - [ ] Manual testing with real docs
183
+ - [ ] Tested on: [list platforms]
184
+
185
+ ## Example Usage
186
+ Show example command and output if applicable
187
+
188
+ ## Checklist
189
+ - [ ] Code follows project guidelines
190
+ - [ ] Self-review completed
191
+ - [ ] Comments added where needed
192
+ - [ ] Tests pass locally
193
+ ```
194
+
195
+ ---
196
+
197
+ ## 🔍 Review Process
198
+
199
+ 1. **Automated Checks**: CI runs tests and linting
200
+ 2. **Maintainer Review**: At least one maintainer reviews
201
+ 3. **Feedback**: You may be asked to make changes
202
+ 4. **Approval**: Once approved, PR is merged
203
+ 5. **Release**: Changes included in next release
204
+
205
+ Typical timeline: 3-7 days for review
206
+
207
+ ---
208
+
209
+ ## 📖 Contribution Ideas
210
+
211
+ Looking for ways to contribute? Here are some ideas:
212
+
213
+ ### Easy Wins
214
+ - Fix typos in documentation
215
+ - Add more examples to README
216
+ - Improve error messages
217
+ - Add unit tests for existing code
218
+
219
+ ### Intermediate
220
+ - Add support for new documentation site formats
221
+ - Improve content extraction selectors
222
+ - Add progress indicators
223
+ - Enhance LLM.txt generation
224
+
225
+ ### Advanced
226
+ - Add PDF export support
227
+ - Implement incremental updates
228
+ - Add authentication support for private docs
229
+ - Create plugin system for custom extractors
230
+
231
+ ---
232
+
233
+ ## 🤝 Community Guidelines
234
+
235
+ ### Be Respectful
236
+ - Treat everyone with respect
237
+ - Welcome newcomers
238
+ - Provide constructive feedback
239
+ - Assume good intentions
240
+
241
+ ### Communication
242
+ - Use clear, concise language
243
+ - Explain your reasoning
244
+ - Ask questions if unsure
245
+ - Respond to feedback promptly
246
+
247
+ ### Collaboration
248
+ - Work with maintainers, not against them
249
+ - Be open to suggestions
250
+ - Help other contributors
251
+ - Share knowledge
252
+
253
+ ---
254
+
255
+ ## 📜 License
256
+
257
+ By contributing to DocFetch, you agree that your contributions will be licensed under the MIT License.
258
+
259
+ ---
260
+
261
+ ## ❓ Questions?
262
+
263
+ - **General questions**: Open a discussion on GitHub
264
+ - **Bug reports**: Create an issue
265
+ - **Feature requests**: Create an issue
266
+ - **Quick questions**: Check existing issues/discussions first
267
+
268
+ ---
269
+
270
+ ## 🙏 Thank You!
271
+
272
+ Your contributions make DocFetch better for everyone. Whether it's a typo fix, a new feature, or better documentation - we appreciate your time and effort!
273
+
274
+ Happy coding! 🚀
@@ -0,0 +1,83 @@
1
#!/usr/bin/env node
/**
 * Post-install script for doc-fetch-cli.
 *
 * Works out where npm links global executables, checks whether that
 * directory is on PATH, and prints instructions for fixing PATH when it is
 * not. The script only prints messages; it never modifies the environment.
 */

const { execSync } = require('child_process');
const path = require('path');
const os = require('os');

console.log('🎉 DocFetch CLI installed successfully!\n');

// Get npm global prefix
let globalPrefix = null;
try {
  globalPrefix = execSync('npm config get prefix', { encoding: 'utf8' }).trim();
} catch (error) {
  console.error('⚠️ Could not determine npm global prefix');
}

if (globalPrefix) {
  const isWindows = os.platform() === 'win32';

  // npm links global executables into <prefix>/bin on POSIX systems, but
  // directly into <prefix> on Windows.
  const binDir = isWindows ? globalPrefix : path.join(globalPrefix, 'bin');

  console.log(`📦 Installed to: ${binDir}\n`);

  // Windows paths are case-insensitive, so compare them lower-cased.
  const normalize = dir => {
    const resolved = path.resolve(dir);
    return isWindows ? resolved.toLowerCase() : resolved;
  };

  // Check if bin directory is in PATH. Empty entries are dropped because
  // path.resolve('') would turn them into the current working directory.
  const pathDirs = (process.env.PATH || '').split(path.delimiter).filter(Boolean);
  const target = normalize(binDir);
  const isInPath = pathDirs.some(dir => normalize(dir) === target);

  if (!isInPath) {
    console.log('⚠️ WARNING: Global bin directory is not in your PATH!\n');
    console.log('To use doc-fetch-cli, add this directory to your PATH:\n');
    console.log(` ${binDir}\n`);

    // Provide platform-specific instructions
    const shell = process.env.SHELL || '/bin/bash';
    const isZsh = shell.includes('zsh');
    const isBash = shell.includes('bash');

    console.log('Quick fix:\n');

    if (isWindows) {
      console.log('1. Open System Properties → Environment Variables');
      console.log('2. Edit PATH variable');
      console.log('3. Add this path:');
      console.log(` ${binDir}`);
      console.log('4. Restart your terminal\n');
    } else if (isZsh) {
      console.log('Add this to your ~/.zshrc:');
      console.log(` export PATH="${binDir}:$PATH"\n`);
      console.log('Then run: source ~/.zshrc\n');
    } else if (isBash) {
      console.log('Add this to your ~/.bashrc or ~/.bash_profile:');
      console.log(` export PATH="${binDir}:$PATH"\n`);
      console.log('Then run: source ~/.bashrc\n');
    } else {
      // Any other shell: still give the user something actionable.
      console.log("Add this directory to your shell's PATH. For POSIX-compatible shells:");
      console.log(` export PATH="${binDir}:$PATH"\n`);
    }

    console.log('Alternative: Use npx without installing globally\n');
    console.log(' npx doc-fetch-cli --url https://docs.example.com --output docs.md\n');
  } else {
    console.log('✅ Global bin directory is in your PATH\n');
    console.log('You can now use doc-fetch-cli!\n');
    console.log('Example usage:');
    console.log(' doc-fetch --url https://docs.python.org/3 --output docs.md --llm-txt\n');

    // Test if the command works. The timeout keeps a misbehaving binary from
    // stalling the whole `npm install`.
    try {
      execSync('doc-fetch --version', { encoding: 'utf8', stdio: 'pipe', timeout: 5000 });
      console.log('✅ Command verified working!\n');
    } catch (error) {
      console.log('⚠️ Command not found in current shell session.\n');
      console.log('Try running: hash -r (to clear command cache)\n');
      console.log('Or restart your terminal.\n');
    }
  }
}

console.log('📚 Documentation: https://github.com/AlphaTechini/doc-fetch\n');
console.log('✨ Pro tip: Use --llm-txt flag to generate AI-friendly index files!\n');
@@ -36,7 +36,8 @@ func main() {
36
36
  log.Fatalf("Configuration error: %v", err)
37
37
  }
38
38
 
39
- err := fetcher.Run(config)
39
+ // Use optimized high-performance fetcher
40
+ err := fetcher.RunOptimized(config)
40
41
  if err != nil {
41
42
  log.Fatalf("Failed to fetch documentation: %v", err)
42
43
  }
Binary file
Binary file
Binary file
Binary file
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: doc-fetch
3
- Version: 1.0.1
3
+ Version: 1.1.1
4
4
  Summary: Dynamic documentation fetching CLI that converts entire documentation sites to single markdown files for AI/LLM consumption
5
5
  Home-page: https://github.com/AlphaTechini/doc-fetch
6
6
  Author: AlphaTechini
@@ -1,7 +1,18 @@
1
+ CONTRIBUTING.md
1
2
  README.md
3
+ SECURITY.md
4
+ doc-fetch
5
+ doc-fetch_darwin_amd64
6
+ doc-fetch_linux_amd64
7
+ doc-fetch_windows_amd64.exe
2
8
  go.mod
9
+ go.sum
10
+ package.json
3
11
  pyproject.toml
4
12
  setup.py
13
+ bin/doc-fetch.js
14
+ bin/install.js
15
+ bin/postinstall.js
5
16
  cmd/docfetch/main.go
6
17
  doc_fetch/__init__.py
7
18
  doc_fetch/__main__.py
@@ -14,6 +25,12 @@ doc_fetch.egg-info/not-zip-safe
14
25
  doc_fetch.egg-info/top_level.txt
15
26
  docs/usage.md
16
27
  examples/golang-example.sh
28
+ pkg/fetcher/classifier.go
29
+ pkg/fetcher/describer.go
30
+ pkg/fetcher/extract_nav.go
17
31
  pkg/fetcher/fetcher.go
32
+ pkg/fetcher/fetcher_optimized.go
18
33
  pkg/fetcher/html2md.go
34
+ pkg/fetcher/llmtxt.go
35
+ pkg/fetcher/validator.go
19
36
  pkg/fetcher/writer.go
package/package.json CHANGED
@@ -1,12 +1,12 @@
1
1
  {
2
2
  "name": "doc-fetch-cli",
3
- "version": "1.0.2",
3
+ "version": "1.1.1",
4
4
  "description": "Dynamic documentation fetching CLI that converts entire documentation sites to single markdown files for AI/LLM consumption",
5
5
  "bin": {
6
6
  "doc-fetch": "./bin/doc-fetch.js"
7
7
  },
8
8
  "scripts": {
9
- "postinstall": "node ./bin/install.js"
9
+ "postinstall": "node ./bin/postinstall.js"
10
10
  },
11
11
  "repository": {
12
12
  "type": "git",
@@ -0,0 +1,163 @@
1
+ package fetcher
2
+
3
+ import (
4
+ "fmt"
5
+ "strings"
6
+
7
+ "github.com/PuerkitoBio/goquery"
8
+ )
9
+
10
+ // ExtractNavigationStructure extracts nav elements with h2/h3, ul/li, and hrefs
11
+ func ExtractNavigationStructure(doc *goquery.Document) string {
12
+ var result strings.Builder
13
+
14
+ result.WriteString("# Navigation Structure\n\n")
15
+
16
+ // Find all nav elements
17
+ doc.Find("nav").Each(func(i int, nav *goquery.Selection) {
18
+ result.WriteString(fmt.Sprintf("## Navigation Block %d\n\n", i+1))
19
+
20
+ // Look for headings in nav
21
+ nav.Find("h1, h2, h3, h4, h5, h6").Each(func(j int, h *goquery.Selection) {
22
+ tagName := h.Get(0).Data
23
+ text := strings.TrimSpace(h.Text())
24
+ if text != "" {
25
+ result.WriteString(fmt.Sprintf("### %s: %s\n\n", tagName, text))
26
+ }
27
+
28
+ // Find ul under this heading
29
+ h.NextFiltered("ul").Each(func(k int, ul *goquery.Selection) {
30
+ result.WriteString(extractListWithLinks(ul, 1))
31
+ })
32
+ })
33
+
34
+ // Also find ul directly in nav
35
+ nav.ChildrenFiltered("ul").Each(func(k int, ul *goquery.Selection) {
36
+ result.WriteString(extractListWithLinks(ul, 1))
37
+ })
38
+
39
+ result.WriteString("---\n\n")
40
+ })
41
+
42
+ // Also look for elements with navigation-related classes/ids
43
+ navSelectors := []string{
44
+ "[class*='nav']",
45
+ "[id*='nav']",
46
+ "[class*='menu']",
47
+ "[id*='menu']",
48
+ "[role='navigation']",
49
+ ".toc",
50
+ "#toc",
51
+ "[class*='toc']",
52
+ "[id*='toc']",
53
+ }
54
+
55
+ for _, selector := range navSelectors {
56
+ doc.Find(selector).Each(func(i int, s *goquery.Selection) {
57
+ // Skip if already processed as nav element
58
+ if s.Parent().Is("nav") {
59
+ return
60
+ }
61
+
62
+ result.WriteString(fmt.Sprintf("## Navigation Element (matched: %s)\n\n", selector))
63
+
64
+ // Extract headings
65
+ s.Find("h1, h2, h3, h4, h5, h6").Each(func(j int, h *goquery.Selection) {
66
+ tagName := h.Get(0).Data
67
+ text := strings.TrimSpace(h.Text())
68
+ if text != "" {
69
+ result.WriteString(fmt.Sprintf("### %s: %s\n\n", tagName, text))
70
+ }
71
+ })
72
+
73
+ // Extract lists with links
74
+ s.Find("ul, ol").Each(func(k int, list *goquery.Selection) {
75
+ result.WriteString(extractListWithLinks(list, 1))
76
+ })
77
+
78
+ result.WriteString("---\n\n")
79
+ })
80
+ }
81
+
82
+ return result.String()
83
+ }
84
+
85
+ // extractListWithLinks extracts list items with their href attributes
86
+ func extractListWithLinks(list *goquery.Selection, indentLevel int) string {
87
+ var result strings.Builder
88
+
89
+ indent := strings.Repeat(" ", indentLevel)
90
+
91
+ list.Find("> li").Each(func(i int, li *goquery.Selection) {
92
+ // Get the text
93
+ text := strings.TrimSpace(li.Text())
94
+
95
+ // Find any links in this li
96
+ li.Find("a[href]").Each(func(j int, a *goquery.Selection) {
97
+ href, exists := a.Attr("href")
98
+ linkText := strings.TrimSpace(a.Text())
99
+ if exists && href != "" {
100
+ result.WriteString(fmt.Sprintf("%s- [%s](%s)\n", indent, linkText, href))
101
+ }
102
+ })
103
+
104
+ // If no links found, just add the text
105
+ if li.Find("a[href]").Length() == 0 && text != "" {
106
+ result.WriteString(fmt.Sprintf("%s- %s\n", indent, text))
107
+ }
108
+
109
+ // Recursively process nested lists
110
+ li.ChildrenFiltered("ul, ol").Each(func(k int, nested *goquery.Selection) {
111
+ result.WriteString(extractListWithLinks(nested, indentLevel+1))
112
+ })
113
+ })
114
+
115
+ return result.String()
116
+ }
117
+
118
+ // ExtractAllLinks extracts all links from the page with context
119
+ func ExtractAllLinks(doc *goquery.Document, baseURL string) string {
120
+ var result strings.Builder
121
+
122
+ result.WriteString("# All Links Found\n\n")
123
+
124
+ linksFound := 0
125
+
126
+ // Group links by section
127
+ doc.Find("section, article, div[class*='content'], div[id*='content']").Each(func(i int, section *goquery.Selection) {
128
+ sectionLinks := 0
129
+ var sectionResult strings.Builder
130
+
131
+ // Get section title
132
+ title := ""
133
+ section.Find("h1, h2, h3").First().Each(func(j int, h *goquery.Selection) {
134
+ title = strings.TrimSpace(h.Text())
135
+ })
136
+
137
+ if title == "" {
138
+ title = fmt.Sprintf("Section %d", i+1)
139
+ }
140
+
141
+ sectionResult.WriteString(fmt.Sprintf("## %s\n\n", title))
142
+
143
+ // Find all links in this section
144
+ section.Find("a[href]").Each(func(j int, a *goquery.Selection) {
145
+ href, exists := a.Attr("href")
146
+ text := strings.TrimSpace(a.Text())
147
+ if exists && href != "" && text != "" {
148
+ sectionResult.WriteString(fmt.Sprintf("- [%s](%s)\n", text, href))
149
+ sectionLinks++
150
+ linksFound++
151
+ }
152
+ })
153
+
154
+ if sectionLinks > 0 {
155
+ result.WriteString(sectionResult.String())
156
+ result.WriteString("\n")
157
+ }
158
+ })
159
+
160
+ result.WriteString(fmt.Sprintf("\n**Total links found: %d**\n", linksFound))
161
+
162
+ return result.String()
163
+ }
@@ -195,53 +195,136 @@ func worker(config Config, pagesChan <-chan *Page, resultsChan chan<- string, mu
195
195
  }
196
196
  }
197
197
 
198
- // cleanContent extracts and cleans the main documentation content
198
+ // cleanContent extracts and cleans the main documentation content using multiple strategies
199
199
  func cleanContent(doc *goquery.Document) string {
200
- // Common selectors for documentation content
201
- selectors := []string{
200
+ // Strategy 1: Try semantic HTML5 elements (most reliable)
201
+ semanticSelectors := []string{
202
202
  "main",
203
203
  "article",
204
+ "[role='main']",
205
+ "[role='article']",
206
+ }
207
+
208
+ for _, selector := range semanticSelectors {
209
+ if el := doc.Find(selector); el.Length() > 0 {
210
+ content := extractTextContent(el)
211
+ if len(content) > 200 { // Minimum viable content
212
+ return content
213
+ }
214
+ }
215
+ }
216
+
217
+ // Strategy 2: Try common class/id patterns
218
+ classSelectors := []string{
204
219
  ".content",
205
- ".docs-content",
220
+ ".docs-content",
206
221
  "#main-content",
207
222
  ".documentation",
208
223
  ".post-content",
209
224
  ".markdown-body",
210
225
  ".content-wrapper",
211
226
  ".doc-content",
227
+ ".document",
228
+ ".entry-content",
229
+ ".page-content",
230
+ ".article-content",
231
+ "[class*='content']",
232
+ "[class*='docs']",
233
+ "[class*='document']",
234
+ "[id*='content']",
235
+ "[id*='main']",
212
236
  }
213
237
 
214
- // Try each selector
215
- for _, selector := range selectors {
238
+ for _, selector := range classSelectors {
216
239
  if el := doc.Find(selector); el.Length() > 0 {
217
- // Remove unwanted elements
218
- el.Find("nav, header, footer, .sidebar, .toc, .navigation, script, style, .ad, .advertisement").Remove()
219
-
220
- // Convert to HTML and then clean
221
- htmlContent, err := el.Html()
222
- if err != nil {
223
- continue
240
+ content := extractTextContent(el)
241
+ if len(content) > 200 {
242
+ return content
224
243
  }
244
+ }
245
+ }
246
+
247
+ // Strategy 3: Look for sections with high text density
248
+ var bestSection *goquery.Selection
249
+ maxTextLen := 0
250
+
251
+ doc.Find("section, div").Each(func(i int, s *goquery.Selection) {
252
+ text := strings.TrimSpace(s.Text())
253
+ if len(text) > maxTextLen {
254
+ // Check if this section has more text than child elements
255
+ childText := 0
256
+ s.Children().Each(func(j int, c *goquery.Selection) {
257
+ childText += len(strings.TrimSpace(c.Text()))
258
+ })
225
259
 
226
- // Basic HTML cleaning
227
- cleaned := cleanHTML(htmlContent)
228
- if cleaned != "" {
229
- return cleaned
260
+ // If parent has significantly more text, it's likely the main content
261
+ if len(text) > childText + (childText/2) && len(text) > 500 {
262
+ maxTextLen = len(text)
263
+ bestSection = s
230
264
  }
231
265
  }
266
+ })
267
+
268
+ if bestSection != nil {
269
+ content := extractTextContent(bestSection)
270
+ if len(content) > 200 {
271
+ return content
272
+ }
232
273
  }
233
274
 
234
- // Fallback: try to get body content
275
+ // Strategy 4: Fallback to body with aggressive cleaning
235
276
  body := doc.Find("body")
236
277
  if body.Length() > 0 {
237
- body.Find("nav, header, footer, .sidebar, .toc, .navigation, script, style, .ad, .advertisement").Remove()
278
+ // Remove all non-content elements aggressively
279
+ body.Find("nav, header, footer, aside, script, style, form, iframe, .sidebar, .toc, .navigation, .menu, .ads, .advertisement, [class*='nav'], [class*='menu'], [class*='sidebar'], [class*='footer'], [class*='header']").Remove()
280
+
281
+ // Find the largest remaining container
282
+ var largest *goquery.Selection
283
+ largestSize := 0
284
+
285
+ body.Find("*").Each(func(i int, s *goquery.Selection) {
286
+ text := strings.TrimSpace(s.Text())
287
+ if len(text) > largestSize && s.Children().Length() < 50 {
288
+ largestSize = len(text)
289
+ largest = s
290
+ }
291
+ })
292
+
293
+ if largest != nil {
294
+ content := extractTextContent(largest)
295
+ if len(content) > 200 {
296
+ return content
297
+ }
298
+ }
299
+
300
+ // Last resort: entire body
238
301
  htmlContent, _ := body.Html()
239
- return cleanHTML(htmlContent)
302
+ cleaned := cleanHTML(htmlContent)
303
+ if len(cleaned) > 200 {
304
+ return cleaned
305
+ }
240
306
  }
241
307
 
242
308
  return ""
243
309
  }
244
310
 
311
+ // extractTextContent extracts and cleans text from a selection
312
+ func extractTextContent(sel *goquery.Selection) string {
313
+ // Clone the selection to avoid modifying original
314
+ clone := sel.Clone()
315
+
316
+ // Remove unwanted elements
317
+ clone.Find("nav, header, footer, aside, script, style, form, iframe, .sidebar, .toc, .navigation, .menu, .ads, .advertisement, button, [class*='nav'], [class*='menu'], [class*='sidebar'], [class*='footer'], [class*='header'], [class*='button'], [onclick], [role='navigation'], [role='banner'], [role='contentinfo']").Remove()
318
+
319
+ // Get HTML and convert to clean text
320
+ htmlContent, err := clone.Html()
321
+ if err != nil {
322
+ return ""
323
+ }
324
+
325
+ return cleanHTML(htmlContent)
326
+ }
327
+
245
328
  // cleanHTML performs basic HTML cleaning
246
329
  func cleanHTML(htmlStr string) string {
247
330
  // Parse and extract text content while preserving structure
@@ -0,0 +1,318 @@
1
+ package fetcher
2
+
3
+ import (
4
+ "bufio"
5
+ "context"
6
+ "fmt"
7
+ "log"
8
+ "net"
9
+ "net/http"
10
+ "net/url"
11
+ "os"
12
+ "strings"
13
+ "sync"
14
+ "sync/atomic"
15
+ "time"
16
+
17
+ "github.com/PuerkitoBio/goquery"
18
+ )
19
+
20
+ // OptimizedFetcher uses advanced Go concurrency patterns for 10x speedup
21
+ type OptimizedFetcher struct {
22
+ config Config
23
+ httpClient *http.Client
24
+ urlQueue chan string
25
+ visited sync.Map // Concurrent map instead of mutex-protected map
26
+ resultsChan chan string
27
+ llmEntries []LLMTxtEntry
28
+ llmMutex sync.Mutex
29
+ pageCount int32
30
+ errorCount int32
31
+ ctx context.Context
32
+ cancel context.CancelFunc
33
+ }
34
+
35
+ // RunOptimized executes documentation fetching with maximum concurrency
36
+ func RunOptimized(config Config) error {
37
+ if err := validateConfig(&config); err != nil {
38
+ return fmt.Errorf("invalid configuration: %w", err)
39
+ }
40
+
41
+ log.Printf("🚀 Starting HIGH-PERFORMANCE documentation fetch from: %s", config.BaseURL)
42
+ log.Printf(" Workers: %d | Max Depth: %d | Concurrency: Enabled", config.Workers, config.MaxDepth)
43
+
44
+ fetcher := &OptimizedFetcher{
45
+ config: config,
46
+ urlQueue: make(chan string, config.Workers*100), // Large buffer for URLs
47
+ resultsChan: make(chan string, config.Workers*10), // Larger buffer
48
+ httpClient: createOptimizedHTTPClient(config.Workers),
49
+ }
50
+
51
+ fetcher.ctx, fetcher.cancel = context.WithTimeout(context.Background(), 10*time.Minute)
52
+ defer fetcher.cancel()
53
+
54
+ startTime := time.Now()
55
+
56
+ // Start result writer in background
57
+ var writeWg sync.WaitGroup
58
+ writeWg.Add(1)
59
+ go func() {
60
+ defer writeWg.Add(-1)
61
+ writeResultsOptimized(config.OutputPath, fetcher.resultsChan)
62
+ }()
63
+
64
+ // Start worker pool
65
+ var workerWg sync.WaitGroup
66
+ for i := 0; i < config.Workers; i++ {
67
+ workerWg.Add(1)
68
+ go fetcher.worker(i, &workerWg)
69
+ }
70
+
71
+ // Submit initial URL
72
+ fetcher.submitPage(config.BaseURL, 0)
73
+
74
+ // Close URL queue when all pages are processed
75
+ go func() {
76
+ workerWg.Wait()
77
+ close(fetcher.urlQueue)
78
+ }()
79
+
80
+ // Wait for all workers to complete
81
+ workerWg.Wait()
82
+ close(fetcher.resultsChan)
83
+
84
+ // Wait for results to be written
85
+ writeWg.Wait()
86
+
87
+ elapsed := time.Since(startTime)
88
+ pagesFetched := atomic.LoadInt32(&fetcher.pageCount)
89
+ errors := atomic.LoadInt32(&fetcher.errorCount)
90
+
91
+ log.Printf("✅ Fetch completed!")
92
+ log.Printf(" 📊 Pages fetched: %d", pagesFetched)
93
+ log.Printf(" ⏱️ Time elapsed: %v", elapsed)
94
+ log.Printf(" 📈 Speed: %.2f pages/second", float64(pagesFetched)/elapsed.Seconds())
95
+ log.Printf(" ❌ Errors: %d", errors)
96
+
97
+ // Generate LLM.txt if requested
98
+ if config.GenerateLLMTxt && len(fetcher.llmEntries) > 0 {
99
+ llmTxtPath := strings.TrimSuffix(config.OutputPath, ".md") + ".llm.txt"
100
+ if err := GenerateLLMTxt(fetcher.llmEntries, llmTxtPath); err != nil {
101
+ log.Printf("⚠️ Warning: Failed to generate llm.txt: %v", err)
102
+ } else {
103
+ log.Printf("📝 LLM.txt generated: %s (%d entries)", llmTxtPath, len(fetcher.llmEntries))
104
+ }
105
+ }
106
+
107
+ return nil
108
+ }
109
+
110
+ // createOptimizedHTTPClient creates a high-performance HTTP client with connection pooling
111
+ func createOptimizedHTTPClient(workers int) *http.Client {
112
+ return &http.Client{
113
+ Timeout: 30 * time.Second,
114
+ Transport: &http.Transport{
115
+ MaxIdleConns: workers * 2,
116
+ MaxIdleConnsPerHost: workers,
117
+ IdleConnTimeout: 90 * time.Second,
118
+ DisableCompression: false,
119
+ DisableKeepAlives: false,
120
+ DialContext: (&net.Dialer{
121
+ Timeout: 10 * time.Second,
122
+ KeepAlive: 30 * time.Second,
123
+ }).DialContext,
124
+ TLSHandshakeTimeout: 10 * time.Second,
125
+ },
126
+ }
127
+ }
128
+
129
+ // worker processes URLs from the submission queue
130
+ func (f *OptimizedFetcher) worker(id int, wg *sync.WaitGroup) {
131
+ defer wg.Done()
132
+
133
+ for url := range f.urlQueue {
134
+ select {
135
+ case <-f.ctx.Done():
136
+ return
137
+ default:
138
+ f.processURL(url, 0)
139
+ }
140
+ }
141
+ }
142
+
143
+ // submitPage adds a URL to be fetched (with depth tracking)
144
+ func (f *OptimizedFetcher) submitPage(pageURL string, depth int) {
145
+ if depth > f.config.MaxDepth {
146
+ return
147
+ }
148
+
149
+ // Check if already visited using atomic operation
150
+ if _, loaded := f.visited.LoadOrStore(pageURL, true); loaded {
151
+ return
152
+ }
153
+
154
+ select {
155
+ case f.urlQueue <- pageURL:
156
+ // Successfully queued
157
+ default:
158
+ // Queue full, skip this URL
159
+ log.Printf("⚠️ Queue full, skipping: %s", pageURL)
160
+ }
161
+ }
162
+
163
+ // processURL fetches and processes a single URL
164
+ func (f *OptimizedFetcher) processURL(pageURL string, depth int) {
165
+ atomic.AddInt32(&f.pageCount, 1)
166
+
167
+ startTime := time.Now()
168
+
169
+ // Validate URL
170
+ if err := isValidURL(pageURL); err != nil {
171
+ atomic.AddInt32(&f.errorCount, 1)
172
+ log.Printf("❌ Invalid URL %s: %v", pageURL, err)
173
+ return
174
+ }
175
+
176
+ // Fetch the page
177
+ resp, err := f.httpClient.Get(pageURL)
178
+ if err != nil {
179
+ atomic.AddInt32(&f.errorCount, 1)
180
+ log.Printf("❌ Error fetching %s: %v", pageURL, err)
181
+ return
182
+ }
183
+ defer resp.Body.Close()
184
+
185
+ if resp.StatusCode != 200 {
186
+ atomic.AddInt32(&f.errorCount, 1)
187
+ log.Printf("❌ Non-200 status %d for %s", resp.StatusCode, pageURL)
188
+ return
189
+ }
190
+
191
+ // Parse HTML concurrently
192
+ doc, err := goquery.NewDocumentFromReader(resp.Body)
193
+ if err != nil {
194
+ atomic.AddInt32(&f.errorCount, 1)
195
+ log.Printf("❌ Error parsing HTML for %s: %v", pageURL, err)
196
+ return
197
+ }
198
+
199
+ // Extract content
200
+ content := cleanContent(doc)
201
+ if content == "" {
202
+ atomic.AddInt32(&f.errorCount, 1)
203
+ log.Printf("⚠️ No content found for %s", pageURL)
204
+ return
205
+ }
206
+
207
+ // Extract title
208
+ title := doc.Find("title").Text()
209
+ if title == "" {
210
+ title = pageURL
211
+ }
212
+
213
+ // Send result
214
+ f.resultsChan <- fmt.Sprintf("## %s\n\n%s\n\n---\n\n", title, content)
215
+
216
+ // Generate LLM.txt entry if requested
217
+ if f.config.GenerateLLMTxt {
218
+ cleanTitle := CleanTitle(title)
219
+ entryType := ClassifyPage(pageURL, cleanTitle)
220
+ description := ExtractDescription(content)
221
+
222
+ entry := LLMTxtEntry{
223
+ Type: entryType,
224
+ Title: cleanTitle,
225
+ URL: pageURL,
226
+ Description: description,
227
+ }
228
+
229
+ f.llmMutex.Lock()
230
+ f.llmEntries = append(f.llmEntries, entry)
231
+ f.llmMutex.Unlock()
232
+ }
233
+
234
+ // Extract links for crawling (if depth allows)
235
+ if depth < f.config.MaxDepth {
236
+ f.extractAndSubmitLinks(doc, pageURL, depth+1)
237
+ }
238
+
239
+ elapsed := time.Since(startTime)
240
+ log.Printf("✅ Fetched %s (%.2fs)", pageURL, elapsed.Seconds())
241
+ }
242
+
243
+ // extractAndSubmitLinks finds and queues all internal links
244
+ func (f *OptimizedFetcher) extractAndSubmitLinks(doc *goquery.Document, baseURL string, depth int) {
245
+ base, err := url.Parse(baseURL)
246
+ if err != nil {
247
+ return
248
+ }
249
+
250
+ doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
251
+ href, exists := s.Attr("href")
252
+ if !exists {
253
+ return
254
+ }
255
+
256
+ // Resolve relative URLs
257
+ resolvedURL, err := base.Parse(href)
258
+ if err != nil {
259
+ return
260
+ }
261
+
262
+ // Only follow same-domain links
263
+ if resolvedURL.Host != "" && resolvedURL.Host != base.Host {
264
+ return
265
+ }
266
+
267
+ // Skip non-HTML resources
268
+ if isNonHTMLResource(resolvedURL.Path) {
269
+ return
270
+ }
271
+
272
+ f.submitPage(resolvedURL.String(), depth)
273
+ })
274
+ }
275
+
276
// isNonHTMLResource reports whether path ends in the extension of a
// downloadable file (archive, installer or PDF) rather than an HTML page.
// The comparison ignores case.
func isNonHTMLResource(path string) bool {
	// Every extension of interest starts with its only dot, so the text from
	// the last dot onwards is all that needs to be examined.
	dot := strings.LastIndex(path, ".")
	if dot < 0 {
		return false
	}

	switch strings.ToLower(path[dot:]) {
	case ".pdf", ".zip", ".tar", ".gz", ".exe", ".dmg", ".pkg", ".deb", ".rpm":
		return true
	default:
		return false
	}
}
288
+
289
// writeResultsOptimized creates (or truncates) outputPath, writes the
// document header and then every non-blank result received from resultsChan
// until the channel is closed.
//
// It returns the error from creating the file, or the first error hit while
// writing, flushing or closing it. Once the file has been created the
// channel is always drained to the end, even after a write error, so
// producers are never left blocked.
func writeResultsOptimized(outputPath string, resultsChan <-chan string) (err error) {
	file, err := os.Create(outputPath)
	if err != nil {
		return err
	}

	writer := bufio.NewWriterSize(file, 32*1024) // 32KB buffer for better I/O

	defer func() {
		// bufio.Writer remembers its first error: after a failure every
		// later write and Flush returns that error. Checking this final
		// Flush therefore covers all the writes above.
		if flushErr := writer.Flush(); err == nil {
			err = flushErr
		}
		// A failing Close can mean data never reached the disk.
		if closeErr := file.Close(); err == nil {
			err = closeErr
		}
	}()

	// Write header
	header := "# Documentation\n\nThis file contains documentation fetched by DocFetch.\n\n---\n\n"
	writer.WriteString(header)

	count := 0
	for result := range resultsChan {
		if strings.TrimSpace(result) == "" {
			continue
		}

		writer.WriteString(result)
		count++

		// Flush periodically to avoid memory buildup
		if count%10 == 0 {
			writer.Flush()
		}
	}

	return nil
}
package/pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "doc-fetch"
7
- version = "1.0.1"
7
+ version = "1.1.1"
8
8
  description = "Dynamic documentation fetching CLI that converts entire documentation sites to single markdown files for AI/LLM consumption"
9
9
  readme = "README.md"
10
10
  authors = [{name = "AlphaTechini", email = "rehobothokoibu@gmail.com"}]
package/setup.py CHANGED
@@ -118,7 +118,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
118
118
 
119
119
  setup(
120
120
  name="doc-fetch",
121
- version="1.0.1",
121
+ version="1.1.1",
122
122
  author="AlphaTechini",
123
123
  author_email="rehobothokoibu@gmail.com",
124
124
  description="Dynamic documentation fetching CLI that converts entire documentation sites to single markdown files for AI/LLM consumption",
Binary file