doc-fetch-cli 1.1.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,274 @@
1
+ # Contributing to DocFetch
2
+
3
+ Thank you for your interest in contributing to DocFetch! 🎉 This guide will help you get started.
4
+
5
+ ## 📋 Quick Overview
6
+
7
+ 1. **Create an issue first** - Always start by opening an issue to discuss your change
8
+ 2. **Wait for feedback** - Maintainers will respond and provide guidance
9
+ 3. **Fork and develop** - Once approved, fork the repo and make your changes
10
+ 4. **Submit a PR** - Open a pull request referencing the issue
11
+ 5. **Review process** - Maintainers will review and provide feedback
12
+ 6. **Merge** - Once approved, your contribution will be merged!
13
+
14
+ ---
15
+
16
+ ## 🐛 Before You Start: Create an Issue
17
+
18
+ **⚠️ IMPORTANT: Always create an issue before submitting a PR!**
19
+
20
+ This helps us:
21
+ - Avoid duplicate work
22
+ - Discuss the best approach
23
+ - Ensure your contribution aligns with project goals
24
+ - Get early feedback from maintainers
25
+
26
+ ### Types of Issues
27
+
28
+ #### 🐞 Bug Reports
29
+ Include:
30
+ - Clear description of the bug
31
+ - Steps to reproduce
32
+ - Expected vs actual behavior
33
+ - Environment details (OS, Go version, DocFetch version)
34
+ - Sample command that triggers the bug
35
+ - Error messages or logs
36
+
37
+ #### ✨ Feature Requests
38
+ Include:
39
+ - Clear description of the feature
40
+ - Use case / problem it solves
41
+ - Example usage
42
+ - Any relevant links or references
43
+
44
+ #### 📝 Documentation Improvements
45
+ Include:
46
+ - What needs improvement
47
+ - Why it's needed
48
+ - Suggested changes
49
+
50
+ #### ⚡ Performance Improvements
51
+ Include:
52
+ - Current performance metrics
53
+ - Proposed improvements
54
+ - Benchmark results (if available)
55
+
56
+ ---
57
+
58
+ ## 🚀 Development Setup
59
+
60
+ ### Prerequisites
61
+
62
+ - Go 1.21 or later
63
+ - Git
64
+ - Make (optional, for running tests)
65
+
66
+ ### Fork and Clone
67
+
68
+ ```bash
69
+ # Fork the repository on GitHub, then:
70
+ git clone https://github.com/YOUR_USERNAME/doc-fetch.git
71
+ cd doc-fetch
72
+
73
+ # Add upstream remote
74
+ git remote add upstream https://github.com/AlphaTechini/doc-fetch.git
75
+ ```
76
+
77
+ ### Build from Source
78
+
79
+ ```bash
80
+ # Build the binary
81
+ go build -o doc-fetch ./cmd/docfetch
82
+
83
+ # Test it works
84
+ ./doc-fetch --help
85
+ ```
86
+
87
+ ### Run Tests
88
+
89
+ ```bash
90
+ # Run all tests
91
+ go test ./...
92
+
93
+ # Run tests with coverage
94
+ go test -cover ./...
95
+
96
+ # Run specific package tests
97
+ go test ./pkg/fetcher/...
98
+ ```
99
+
100
+ ---
101
+
102
+ ## 💻 Making Changes
103
+
104
+ ### Branch Naming
105
+
106
+ Use descriptive branch names:
107
+ - `fix/content-extraction-bug`
108
+ - `feat/add-pdf-support`
109
+ - `docs/update-readme-examples`
110
+ - `perf/improve-concurrent-fetching`
111
+
112
+ ### Code Style
113
+
114
+ Follow Go best practices:
115
+ - Run `go fmt ./...` before committing
116
+ - Run `go vet ./...` to catch issues
117
+ - Write clear, concise comments
118
+ - Keep functions small and focused
119
+ - Use meaningful variable names
120
+
121
+ ### Testing Requirements
122
+
123
+ - Add tests for new features
124
+ - Ensure existing tests pass
125
+ - Include edge cases
126
+ - Test with real documentation sites
127
+
128
+ Example test:
129
+ ```go
130
+ func TestContentExtraction(t *testing.T) {
131
+ doc := createTestDocument()
132
+ content := cleanContent(doc)
133
+
134
+ if len(content) == 0 {
135
+ t.Error("Expected content to be extracted")
136
+ }
137
+
138
+ if !strings.Contains(content, "expected text") {
139
+ t.Error("Expected content to contain specific text")
140
+ }
141
+ }
142
+ ```
143
+
144
+ ---
145
+
146
+ ## 📤 Submitting a Pull Request
147
+
148
+ ### PR Checklist
149
+
150
+ Before submitting your PR, ensure:
151
+
152
+ - [ ] You created an issue first and referenced it in the PR
153
+ - [ ] Your code follows Go style guidelines
154
+ - [ ] All tests pass (`go test ./...`)
155
+ - [ ] You've added tests for new functionality
156
+ - [ ] You've updated documentation if needed
157
+ - [ ] Your commit messages are clear and descriptive
158
+ - [ ] You've rebased on the latest main branch
159
+
160
+ ### PR Template
161
+
162
+ When creating your PR, include:
163
+
164
+ ```markdown
165
+ ## Description
166
+ Brief description of changes
167
+
168
+ ## Related Issue
169
+ Fixes #123 (or "Related to #123")
170
+
171
+ ## Type of Change
172
+ - [ ] Bug fix
173
+ - [ ] New feature
174
+ - [ ] Breaking change
175
+ - [ ] Documentation update
176
+ - [ ] Performance improvement
177
+ - [ ] Refactoring
178
+
179
+ ## Testing
180
+ Describe how you tested this:
181
+ - [ ] Unit tests added/updated
182
+ - [ ] Manual testing with real docs
183
+ - [ ] Tested on: [list platforms]
184
+
185
+ ## Example Usage
186
+ Show example command and output if applicable
187
+
188
+ ## Checklist
189
+ - [ ] Code follows project guidelines
190
+ - [ ] Self-review completed
191
+ - [ ] Comments added where needed
192
+ - [ ] Tests pass locally
193
+ ```
194
+
195
+ ---
196
+
197
+ ## 🔍 Review Process
198
+
199
+ 1. **Automated Checks**: CI runs tests and linting
200
+ 2. **Maintainer Review**: At least one maintainer reviews
201
+ 3. **Feedback**: You may be asked to make changes
202
+ 4. **Approval**: Once approved, PR is merged
203
+ 5. **Release**: Changes included in next release
204
+
205
+ Typical timeline: 3-7 days for review
206
+
207
+ ---
208
+
209
+ ## 📖 Contribution Ideas
210
+
211
+ Looking for ways to contribute? Here are some ideas:
212
+
213
+ ### Easy Wins
214
+ - Fix typos in documentation
215
+ - Add more examples to README
216
+ - Improve error messages
217
+ - Add unit tests for existing code
218
+
219
+ ### Intermediate
220
+ - Add support for new documentation site formats
221
+ - Improve content extraction selectors
222
+ - Add progress indicators
223
+ - Enhance LLM.txt generation
224
+
225
+ ### Advanced
226
+ - Add PDF export support
227
+ - Implement incremental updates
228
+ - Add authentication support for private docs
229
+ - Create plugin system for custom extractors
230
+
231
+ ---
232
+
233
+ ## 🤝 Community Guidelines
234
+
235
+ ### Be Respectful
236
+ - Treat everyone with respect
237
+ - Welcome newcomers
238
+ - Provide constructive feedback
239
+ - Assume good intentions
240
+
241
+ ### Communication
242
+ - Use clear, concise language
243
+ - Explain your reasoning
244
+ - Ask questions if unsure
245
+ - Respond to feedback promptly
246
+
247
+ ### Collaboration
248
+ - Work with maintainers, not against them
249
+ - Be open to suggestions
250
+ - Help other contributors
251
+ - Share knowledge
252
+
253
+ ---
254
+
255
+ ## 📜 License
256
+
257
+ By contributing to DocFetch, you agree that your contributions will be licensed under the MIT License.
258
+
259
+ ---
260
+
261
+ ## ❓ Questions?
262
+
263
+ - **General questions**: Open a discussion on GitHub
264
+ - **Bug reports**: Create an issue
265
+ - **Feature requests**: Create an issue
266
+ - **Quick questions**: Check existing issues/discussions first
267
+
268
+ ---
269
+
270
+ ## 🙏 Thank You!
271
+
272
+ Your contributions make DocFetch better for everyone. Whether it's a typo fix, a new feature, or better documentation - we appreciate your time and effort!
273
+
274
+ Happy coding! 🚀
@@ -0,0 +1,83 @@
1
#!/usr/bin/env node
/**
 * Post-install script for doc-fetch-cli.
 *
 * Prints a welcome message, asks npm for its global prefix, derives the
 * directory where npm links global executables, and checks whether that
 * directory is on PATH. When it is missing, platform/shell specific
 * instructions are printed; when it is present, `doc-fetch --version` is
 * run once as a smoke test. Failures of the helper commands are caught and
 * reported as hints.
 */

const { execSync } = require('child_process');
const path = require('path');
const os = require('os');
const fs = require('fs');

console.log('🎉 DocFetch CLI installed successfully!\n');

// Get npm global prefix
let globalPrefix;
try {
  globalPrefix = execSync('npm config get prefix', { encoding: 'utf8' }).trim();
} catch (error) {
  console.error('⚠️ Could not determine npm global prefix');
  globalPrefix = null;
}

if (globalPrefix) {
  const isWindows = os.platform() === 'win32';

  // npm links global executables into <prefix>/bin on Unix-like systems,
  // but directly into <prefix> on Windows.
  const binDir = isWindows ? globalPrefix : path.join(globalPrefix, 'bin');

  console.log(`📦 Installed to: ${binDir}\n`);

  // Windows paths compare case-insensitively; everything else is exact.
  const normalize = (dir) => {
    const resolved = path.resolve(dir);
    return isWindows ? resolved.toLowerCase() : resolved;
  };

  // Check if bin directory is in PATH. Empty entries are dropped because
  // path.resolve('') would silently turn them into the current directory.
  const pathDirs = (process.env.PATH || '').split(path.delimiter).filter(Boolean);
  const wantedDir = normalize(binDir);
  const isInPath = pathDirs.some((dir) => normalize(dir) === wantedDir);

  if (!isInPath) {
    console.log('⚠️ WARNING: Global bin directory is not in your PATH!\n');
    console.log('To use doc-fetch-cli, add this directory to your PATH:\n');
    console.log(` ${binDir}\n`);

    // Provide platform-specific instructions
    const shell = process.env.SHELL || '/bin/bash';
    const isZsh = shell.includes('zsh');
    const isBash = shell.includes('bash');

    console.log('Quick fix:\n');

    if (isWindows) {
      console.log('1. Open System Properties → Environment Variables');
      console.log('2. Edit PATH variable');
      console.log('3. Add this path:');
      console.log(` ${binDir}`);
      console.log('4. Restart your terminal\n');
    } else if (isZsh) {
      console.log('Add this to your ~/.zshrc:');
      console.log(` export PATH="${binDir}:$PATH"\n`);
      console.log('Then run: source ~/.zshrc\n');
    } else if (isBash) {
      console.log('Add this to your ~/.bashrc or ~/.bash_profile:');
      console.log(` export PATH="${binDir}:$PATH"\n`);
      console.log('Then run: source ~/.bashrc\n');
    } else {
      // Other shells (fish, ksh, ...) use their own syntax, so only name the
      // directory instead of printing a command that may not work there.
      console.log("Add this directory to PATH in your shell's startup file:");
      console.log(` ${binDir}\n`);
      console.log('Then restart your terminal.\n');
    }

    console.log('Alternative: Use npx without installing globally\n');
    console.log(' npx doc-fetch-cli --url https://docs.example.com --output docs.md\n');
  } else {
    console.log('✅ Global bin directory is in your PATH\n');
    console.log('You can now use doc-fetch-cli!\n');
    console.log('Example usage:');
    console.log(' doc-fetch --url https://docs.python.org/3 --output docs.md --llm-txt\n');

    // Test if the command works. The timeout keeps a hanging binary from
    // blocking the whole `npm install`.
    try {
      execSync('doc-fetch --version', { encoding: 'utf8', stdio: 'pipe', timeout: 10000 });
      console.log('✅ Command verified working!\n');
    } catch (error) {
      console.log('⚠️ Command not found in current shell session.\n');
      console.log('Try running: hash -r (to clear command cache)\n');
      console.log('Or restart your terminal.\n');
    }
  }
}

console.log('📚 Documentation: https://github.com/AlphaTechini/doc-fetch\n');
console.log('✨ Pro tip: Use --llm-txt flag to generate AI-friendly index files!\n');
Binary file
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: doc-fetch
3
- Version: 1.1.0
3
+ Version: 1.1.1
4
4
  Summary: Dynamic documentation fetching CLI that converts entire documentation sites to single markdown files for AI/LLM consumption
5
5
  Home-page: https://github.com/AlphaTechini/doc-fetch
6
6
  Author: AlphaTechini
@@ -1,7 +1,18 @@
1
+ CONTRIBUTING.md
1
2
  README.md
3
+ SECURITY.md
4
+ doc-fetch
5
+ doc-fetch_darwin_amd64
6
+ doc-fetch_linux_amd64
7
+ doc-fetch_windows_amd64.exe
2
8
  go.mod
9
+ go.sum
10
+ package.json
3
11
  pyproject.toml
4
12
  setup.py
13
+ bin/doc-fetch.js
14
+ bin/install.js
15
+ bin/postinstall.js
5
16
  cmd/docfetch/main.go
6
17
  doc_fetch/__init__.py
7
18
  doc_fetch/__main__.py
@@ -14,6 +25,12 @@ doc_fetch.egg-info/not-zip-safe
14
25
  doc_fetch.egg-info/top_level.txt
15
26
  docs/usage.md
16
27
  examples/golang-example.sh
28
+ pkg/fetcher/classifier.go
29
+ pkg/fetcher/describer.go
30
+ pkg/fetcher/extract_nav.go
17
31
  pkg/fetcher/fetcher.go
32
+ pkg/fetcher/fetcher_optimized.go
18
33
  pkg/fetcher/html2md.go
34
+ pkg/fetcher/llmtxt.go
35
+ pkg/fetcher/validator.go
19
36
  pkg/fetcher/writer.go
package/package.json CHANGED
@@ -1,12 +1,12 @@
1
1
  {
2
2
  "name": "doc-fetch-cli",
3
- "version": "1.1.0",
3
+ "version": "1.1.1",
4
4
  "description": "Dynamic documentation fetching CLI that converts entire documentation sites to single markdown files for AI/LLM consumption",
5
5
  "bin": {
6
6
  "doc-fetch": "./bin/doc-fetch.js"
7
7
  },
8
8
  "scripts": {
9
- "postinstall": "node ./bin/install.js"
9
+ "postinstall": "node ./bin/postinstall.js"
10
10
  },
11
11
  "repository": {
12
12
  "type": "git",
@@ -0,0 +1,163 @@
1
+ package fetcher
2
+
3
import (
	"fmt"
	"net/url"
	"strings"

	"github.com/PuerkitoBio/goquery"
)
9
+
10
+ // ExtractNavigationStructure extracts nav elements with h2/h3, ul/li, and hrefs
11
+ func ExtractNavigationStructure(doc *goquery.Document) string {
12
+ var result strings.Builder
13
+
14
+ result.WriteString("# Navigation Structure\n\n")
15
+
16
+ // Find all nav elements
17
+ doc.Find("nav").Each(func(i int, nav *goquery.Selection) {
18
+ result.WriteString(fmt.Sprintf("## Navigation Block %d\n\n", i+1))
19
+
20
+ // Look for headings in nav
21
+ nav.Find("h1, h2, h3, h4, h5, h6").Each(func(j int, h *goquery.Selection) {
22
+ tagName := h.Get(0).Data
23
+ text := strings.TrimSpace(h.Text())
24
+ if text != "" {
25
+ result.WriteString(fmt.Sprintf("### %s: %s\n\n", tagName, text))
26
+ }
27
+
28
+ // Find ul under this heading
29
+ h.NextFiltered("ul").Each(func(k int, ul *goquery.Selection) {
30
+ result.WriteString(extractListWithLinks(ul, 1))
31
+ })
32
+ })
33
+
34
+ // Also find ul directly in nav
35
+ nav.ChildrenFiltered("ul").Each(func(k int, ul *goquery.Selection) {
36
+ result.WriteString(extractListWithLinks(ul, 1))
37
+ })
38
+
39
+ result.WriteString("---\n\n")
40
+ })
41
+
42
+ // Also look for elements with navigation-related classes/ids
43
+ navSelectors := []string{
44
+ "[class*='nav']",
45
+ "[id*='nav']",
46
+ "[class*='menu']",
47
+ "[id*='menu']",
48
+ "[role='navigation']",
49
+ ".toc",
50
+ "#toc",
51
+ "[class*='toc']",
52
+ "[id*='toc']",
53
+ }
54
+
55
+ for _, selector := range navSelectors {
56
+ doc.Find(selector).Each(func(i int, s *goquery.Selection) {
57
+ // Skip if already processed as nav element
58
+ if s.Parent().Is("nav") {
59
+ return
60
+ }
61
+
62
+ result.WriteString(fmt.Sprintf("## Navigation Element (matched: %s)\n\n", selector))
63
+
64
+ // Extract headings
65
+ s.Find("h1, h2, h3, h4, h5, h6").Each(func(j int, h *goquery.Selection) {
66
+ tagName := h.Get(0).Data
67
+ text := strings.TrimSpace(h.Text())
68
+ if text != "" {
69
+ result.WriteString(fmt.Sprintf("### %s: %s\n\n", tagName, text))
70
+ }
71
+ })
72
+
73
+ // Extract lists with links
74
+ s.Find("ul, ol").Each(func(k int, list *goquery.Selection) {
75
+ result.WriteString(extractListWithLinks(list, 1))
76
+ })
77
+
78
+ result.WriteString("---\n\n")
79
+ })
80
+ }
81
+
82
+ return result.String()
83
+ }
84
+
85
+ // extractListWithLinks extracts list items with their href attributes
86
+ func extractListWithLinks(list *goquery.Selection, indentLevel int) string {
87
+ var result strings.Builder
88
+
89
+ indent := strings.Repeat(" ", indentLevel)
90
+
91
+ list.Find("> li").Each(func(i int, li *goquery.Selection) {
92
+ // Get the text
93
+ text := strings.TrimSpace(li.Text())
94
+
95
+ // Find any links in this li
96
+ li.Find("a[href]").Each(func(j int, a *goquery.Selection) {
97
+ href, exists := a.Attr("href")
98
+ linkText := strings.TrimSpace(a.Text())
99
+ if exists && href != "" {
100
+ result.WriteString(fmt.Sprintf("%s- [%s](%s)\n", indent, linkText, href))
101
+ }
102
+ })
103
+
104
+ // If no links found, just add the text
105
+ if li.Find("a[href]").Length() == 0 && text != "" {
106
+ result.WriteString(fmt.Sprintf("%s- %s\n", indent, text))
107
+ }
108
+
109
+ // Recursively process nested lists
110
+ li.ChildrenFiltered("ul, ol").Each(func(k int, nested *goquery.Selection) {
111
+ result.WriteString(extractListWithLinks(nested, indentLevel+1))
112
+ })
113
+ })
114
+
115
+ return result.String()
116
+ }
117
+
118
+ // ExtractAllLinks extracts all links from the page with context
119
+ func ExtractAllLinks(doc *goquery.Document, baseURL string) string {
120
+ var result strings.Builder
121
+
122
+ result.WriteString("# All Links Found\n\n")
123
+
124
+ linksFound := 0
125
+
126
+ // Group links by section
127
+ doc.Find("section, article, div[class*='content'], div[id*='content']").Each(func(i int, section *goquery.Selection) {
128
+ sectionLinks := 0
129
+ var sectionResult strings.Builder
130
+
131
+ // Get section title
132
+ title := ""
133
+ section.Find("h1, h2, h3").First().Each(func(j int, h *goquery.Selection) {
134
+ title = strings.TrimSpace(h.Text())
135
+ })
136
+
137
+ if title == "" {
138
+ title = fmt.Sprintf("Section %d", i+1)
139
+ }
140
+
141
+ sectionResult.WriteString(fmt.Sprintf("## %s\n\n", title))
142
+
143
+ // Find all links in this section
144
+ section.Find("a[href]").Each(func(j int, a *goquery.Selection) {
145
+ href, exists := a.Attr("href")
146
+ text := strings.TrimSpace(a.Text())
147
+ if exists && href != "" && text != "" {
148
+ sectionResult.WriteString(fmt.Sprintf("- [%s](%s)\n", text, href))
149
+ sectionLinks++
150
+ linksFound++
151
+ }
152
+ })
153
+
154
+ if sectionLinks > 0 {
155
+ result.WriteString(sectionResult.String())
156
+ result.WriteString("\n")
157
+ }
158
+ })
159
+
160
+ result.WriteString(fmt.Sprintf("\n**Total links found: %d**\n", linksFound))
161
+
162
+ return result.String()
163
+ }
package/pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "doc-fetch"
7
- version = "1.1.0"
7
+ version = "1.1.1"
8
8
  description = "Dynamic documentation fetching CLI that converts entire documentation sites to single markdown files for AI/LLM consumption"
9
9
  readme = "README.md"
10
10
  authors = [{name = "AlphaTechini", email = "rehobothokoibu@gmail.com"}]
package/setup.py CHANGED
@@ -118,7 +118,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
118
118
 
119
119
  setup(
120
120
  name="doc-fetch",
121
- version="1.1.0",
121
+ version="1.1.1",
122
122
  author="AlphaTechini",
123
123
  author_email="rehobothokoibu@gmail.com",
124
124
  description="Dynamic documentation fetching CLI that converts entire documentation sites to single markdown files for AI/LLM consumption",
Binary file