doc-fetch-cli 1.0.2

@@ -0,0 +1,332 @@
+ package fetcher
+
+ import (
+     "fmt"
+     "log"
+     "net"
+     "net/http"
+     "net/url"
+     "strings"
+     "sync"
+     "time"
+
+     "github.com/PuerkitoBio/goquery"
+     "golang.org/x/net/html"
+ )
+
+ // Config holds the configuration for the documentation fetcher
+ type Config struct {
+     BaseURL        string
+     OutputPath     string
+     MaxDepth       int
+     Workers        int
+     UserAgent      string
+     GenerateLLMTxt bool
+ }
+
+ // Page represents a fetched documentation page
+ type Page struct {
+     URL     string
+     Title   string
+     Content string
+     Links   []string
+ }
+
+ // LLMTxtEntry represents an entry in the llm.txt file
+ type LLMTxtEntry struct {
+     Type        string
+     Title       string
+     URL         string
+     Description string
+ }
+
+ // Run executes the documentation fetching process
+ func Run(config Config) error {
+     // Validate configuration
+     if err := validateConfig(&config); err != nil {
+         return fmt.Errorf("invalid configuration: %w", err)
+     }
+
+     log.Printf("Starting documentation fetch from: %s", config.BaseURL)
+
+     // Track visited URLs to avoid duplicate fetching
+     visited := make(map[string]bool)
+     var mutex sync.Mutex
+
+     // Channels for pages to fetch and rendered results
+     pagesChan := make(chan *Page, config.Workers*2)
+     resultsChan := make(chan string, config.Workers*2)
+     var llmEntries []LLMTxtEntry
+
+     // Start worker goroutines
+     var wg sync.WaitGroup
+     for i := 0; i < config.Workers; i++ {
+         wg.Add(1)
+         go func() {
+             defer wg.Done()
+             worker(config, pagesChan, resultsChan, &mutex, visited, &llmEntries)
+         }()
+     }
+
+     // Seed the initial fetch. For the MVP, workers never enqueue new pages,
+     // so the channel is closed right after the seed; closing it from the
+     // wg.Wait goroutine below would deadlock, because workers only return
+     // once pagesChan is closed.
+     pagesChan <- &Page{URL: config.BaseURL, Title: "Root"}
+     close(pagesChan)
+
+     // Close the results channel once all workers have finished
+     go func() {
+         wg.Wait()
+         close(resultsChan)
+     }()
+
+     // Collect results and write to file
+     err := writeResults(config.OutputPath, resultsChan)
+     if err != nil {
+         return err
+     }
+
+     // Generate llm.txt if requested
+     if config.GenerateLLMTxt {
+         llmTxtPath := strings.TrimSuffix(config.OutputPath, ".md") + ".llm.txt"
+         err = GenerateLLMTxt(llmEntries, llmTxtPath)
+         if err != nil {
+             log.Printf("Warning: Failed to generate llm.txt: %v", err)
+         } else {
+             log.Printf("llm.txt generated: %s", llmTxtPath)
+         }
+     }
+
+     return nil
+ }
+
+ // worker processes pages from the channel
+ func worker(config Config, pagesChan <-chan *Page, resultsChan chan<- string, mutex *sync.Mutex, visited map[string]bool, llmEntries *[]LLMTxtEntry) {
+     client := &http.Client{
+         Timeout: 30 * time.Second,
+         // Disable keep-alives so idle connections are not held open
+         Transport: &http.Transport{
+             DisableKeepAlives: true,
+         },
+     }
+
+     for page := range pagesChan {
+         // Validate URL before fetching
+         if err := isValidURL(page.URL); err != nil {
+             log.Printf("Skipping invalid URL %s: %v", page.URL, err)
+             continue
+         }
+
+         mutex.Lock()
+         if visited[page.URL] {
+             mutex.Unlock()
+             continue
+         }
+         visited[page.URL] = true
+         mutex.Unlock()
+
+         log.Printf("Fetching: %s", page.URL)
+
+         // Rate limiting - be respectful to servers
+         time.Sleep(100 * time.Millisecond)
+
+         // Fetch the page
+         req, err := http.NewRequest("GET", page.URL, nil)
+         if err != nil {
+             log.Printf("Error creating request for %s: %v", page.URL, err)
+             continue
+         }
+         req.Header.Set("User-Agent", config.UserAgent)
+
+         resp, err := client.Do(req)
+         if err != nil {
+             log.Printf("Error fetching %s: %v", page.URL, err)
+             continue
+         }
+
+         if resp.StatusCode != http.StatusOK {
+             resp.Body.Close()
+             log.Printf("Non-200 status code %d for %s", resp.StatusCode, page.URL)
+             continue
+         }
+
+         // Parse HTML, then close the body immediately: a defer inside the
+         // loop would accumulate open bodies until the worker returns
+         doc, err := goquery.NewDocumentFromReader(resp.Body)
+         resp.Body.Close()
+         if err != nil {
+             log.Printf("Error parsing HTML for %s: %v", page.URL, err)
+             continue
+         }
+
+         // Extract title
+         title := doc.Find("title").Text()
+         if title == "" {
+             title = page.URL
+         }
+
+         // Clean and extract content
+         content := cleanContent(doc)
+         if content == "" {
+             log.Printf("No content found for %s", page.URL)
+             continue
+         }
+
+         // Send result to output
+         resultsChan <- fmt.Sprintf("## %s\n\n%s\n\n---\n\n", title, content)
+
+         // Generate llm.txt entry if requested
+         if config.GenerateLLMTxt {
+             cleanTitle := CleanTitle(title)
+             entryType := ClassifyPage(page.URL, cleanTitle)
+             description := ExtractDescription(content)
+
+             entry := LLMTxtEntry{
+                 Type:        entryType,
+                 Title:       cleanTitle,
+                 URL:         page.URL,
+                 Description: description,
+             }
+
+             mutex.Lock()
+             *llmEntries = append(*llmEntries, entry)
+             mutex.Unlock()
+         }
+
+         // Extract links for further crawling (depth-limited logic would go here).
+         // For the MVP, only the seed page is fetched.
+         // Future: implement link extraction and recursive crawling.
+     }
+ }
+
+ // cleanContent extracts and cleans the main documentation content
+ func cleanContent(doc *goquery.Document) string {
+     // Common selectors for documentation content
+     selectors := []string{
+         "main",
+         "article",
+         ".content",
+         ".docs-content",
+         "#main-content",
+         ".documentation",
+         ".post-content",
+         ".markdown-body",
+         ".content-wrapper",
+         ".doc-content",
+     }
+
+     // Try each selector
+     for _, selector := range selectors {
+         if el := doc.Find(selector); el.Length() > 0 {
+             // Remove unwanted elements
+             el.Find("nav, header, footer, .sidebar, .toc, .navigation, script, style, .ad, .advertisement").Remove()
+
+             // Convert to HTML and then clean
+             htmlContent, err := el.Html()
+             if err != nil {
+                 continue
+             }
+
+             // Basic HTML cleaning
+             cleaned := cleanHTML(htmlContent)
+             if cleaned != "" {
+                 return cleaned
+             }
+         }
+     }
+
+     // Fallback: try to get body content
+     body := doc.Find("body")
+     if body.Length() > 0 {
+         body.Find("nav, header, footer, .sidebar, .toc, .navigation, script, style, .ad, .advertisement").Remove()
+         htmlContent, _ := body.Html()
+         return cleanHTML(htmlContent)
+     }
+
+     return ""
+ }
+
+ // cleanHTML extracts the plain text content from an HTML fragment
+ func cleanHTML(htmlStr string) string {
+     doc, err := html.Parse(strings.NewReader(htmlStr))
+     if err != nil {
+         return ""
+     }
+
+     // Walk the node tree and collect non-empty text nodes
+     var texts []string
+     var extractText func(*html.Node)
+     extractText = func(n *html.Node) {
+         if n.Type == html.TextNode {
+             text := strings.TrimSpace(n.Data)
+             if text != "" {
+                 texts = append(texts, text)
+             }
+         }
+         for c := n.FirstChild; c != nil; c = c.NextSibling {
+             extractText(c)
+         }
+     }
+
+     extractText(doc)
+     return strings.Join(texts, "\n")
+ }
+
+ // isValidURL validates that a URL is safe to fetch
+ func isValidURL(urlStr string) error {
+     parsed, err := url.Parse(urlStr)
+     if err != nil {
+         return fmt.Errorf("invalid URL format: %w", err)
+     }
+
+     // Only allow HTTP/HTTPS
+     if parsed.Scheme != "http" && parsed.Scheme != "https" {
+         return fmt.Errorf("only HTTP/HTTPS URLs allowed")
+     }
+
+     // Block private IP ranges
+     host := parsed.Hostname()
+     ip := net.ParseIP(host)
+     if ip != nil {
+         if ip.IsPrivate() || ip.IsLoopback() || ip.IsLinkLocalUnicast() || ip.IsMulticast() {
+             return fmt.Errorf("private/internal IP addresses not allowed")
+         }
+     }
+
+     // Block dangerous hostnames (case-insensitive)
+     dangerousHosts := []string{"localhost", "127.0.0.1", "0.0.0.0", "::1"}
+     for _, dangerous := range dangerousHosts {
+         if strings.ToLower(host) == dangerous {
+             return fmt.Errorf("local hostnames not allowed")
+         }
+     }
+
+     return nil
+ }
+
+ // validateConfig validates the entire configuration
+ func validateConfig(config *Config) error {
+     if err := validateOutputPath(config.OutputPath); err != nil {
+         return fmt.Errorf("output path validation failed: %w", err)
+     }
+
+     if err := isValidURL(config.BaseURL); err != nil {
+         return fmt.Errorf("base URL validation failed: %w", err)
+     }
+
+     // Limit depth to prevent excessive crawling
+     if config.MaxDepth > 10 {
+         return fmt.Errorf("max depth cannot exceed 10")
+     }
+
+     // Limit workers to prevent resource exhaustion
+     if config.Workers > 20 {
+         return fmt.Errorf("concurrent workers cannot exceed 20")
+     }
+
+     // Apply defaults for unset values
+     if config.Workers <= 0 {
+         config.Workers = 3 // Default
+     }
+     if config.MaxDepth <= 0 {
+         config.MaxDepth = 2 // Default
+     }
+
+     return nil
+ }
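
For orientation, a minimal sketch of how this package might be driven from a caller. The import path is an assumption inferred from the repository URL in pyproject.toml; the published Go module path may differ.

```go
package main

import (
	"log"

	// Assumed import path, inferred from the repository URL; may differ.
	"github.com/AlphaTechini/doc-fetch/fetcher"
)

func main() {
	cfg := fetcher.Config{
		BaseURL:        "https://example.com/docs",
		OutputPath:     "docs.md", // must be a relative .md/.txt path (see validateOutputPath)
		Workers:        3,
		UserAgent:      "doc-fetch/1.0",
		GenerateLLMTxt: true, // also writes docs.llm.txt next to the output
		// MaxDepth left at zero: validateConfig defaults it to 2
	}
	if err := fetcher.Run(cfg); err != nil {
		log.Fatal(err)
	}
}
```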
@@ -0,0 +1,71 @@
+ package fetcher
+
+ import (
+     "strings"
+ )
+
+ // ConvertHTMLToMarkdown converts HTML content to clean markdown
+ func ConvertHTMLToMarkdown(htmlContent string) string {
+     if htmlContent == "" {
+         return ""
+     }
+
+     // Basic HTML-to-markdown conversion
+     markdownContent := basicHTMLToMarkdown(htmlContent)
+
+     return strings.TrimSpace(markdownContent)
+ }
+
+ // basicHTMLToMarkdown provides basic HTML-to-markdown conversion via
+ // literal tag replacement. Only bare tags are matched; tags carrying
+ // attributes (e.g. <h1 class="title">) pass through unchanged.
+ func basicHTMLToMarkdown(html string) string {
+     // Replace common HTML tags with markdown equivalents
+     replacements := map[string]string{
+         "<h1>":      "# ",
+         "</h1>":     "\n\n",
+         "<h2>":      "## ",
+         "</h2>":     "\n\n",
+         "<h3>":      "### ",
+         "</h3>":     "\n\n",
+         "<h4>":      "#### ",
+         "</h4>":     "\n\n",
+         "<h5>":      "##### ",
+         "</h5>":     "\n\n",
+         "<h6>":      "###### ",
+         "</h6>":     "\n\n",
+         "<p>":       "",
+         "</p>":      "\n\n",
+         "<br>":      "\n",
+         "<br/>":     "\n",
+         "<strong>":  "**",
+         "</strong>": "**",
+         "<b>":       "**",
+         "</b>":      "**",
+         "<em>":      "*",
+         "</em>":     "*",
+         "<i>":       "*",
+         "</i>":      "*",
+         "<code>":    "`",
+         "</code>":   "`",
+         // code fences need their own line to render as a block
+         "<pre>":  "\n```\n",
+         "</pre>": "\n```\n",
+         "<ul>":   "",
+         "</ul>":  "\n",
+         "<ol>":   "",
+         "</ol>":  "\n",
+         "<li>":   "- ",
+         "</li>":  "\n",
+         "<blockquote>":  "> ",
+         "</blockquote>": "\n\n",
+     }
+
+     result := html
+     for htmlTag, mdReplacement := range replacements {
+         result = strings.ReplaceAll(result, htmlTag, mdReplacement)
+     }
+
+     // Collapse runs of blank lines; loop because a single ReplaceAll pass
+     // over "\n\n\n" can leave longer runs only partially collapsed
+     for strings.Contains(result, "\n\n\n") {
+         result = strings.ReplaceAll(result, "\n\n\n", "\n\n")
+     }
+     result = strings.TrimSpace(result)
+
+     return result
+ }
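
A quick illustration of what the literal tag-replacement pass does, and where it stops. This is a hypothetical snippet, not from the package's tests, and it assumes the same import path as above.

```go
package main

import (
	"fmt"

	// Assumed import path; may differ in the published module.
	"github.com/AlphaTechini/doc-fetch/fetcher"
)

func main() {
	// Bare tags are rewritten to markdown:
	fmt.Println(fetcher.ConvertHTMLToMarkdown(
		"<h2>Install</h2><p>Run the <code>go build</code> command.</p>"))
	// Output:
	// ## Install
	//
	// Run the `go build` command.

	// Tags with attributes are not in the replacement map and pass through:
	fmt.Println(fetcher.ConvertHTMLToMarkdown(`<h2 id="install">Install</h2>`))
	// Output: <h2 id="install">Install</h2> (unchanged)
}
```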
@@ -0,0 +1,36 @@
+ package fetcher
+
+ import (
+     "bufio"
+     "fmt"
+     "os"
+     "strings"
+ )
+
+ // GenerateLLMTxt creates an llm.txt file with an AI-friendly documentation index
+ func GenerateLLMTxt(entries []LLMTxtEntry, outputPath string) error {
+     file, err := os.Create(outputPath)
+     if err != nil {
+         return fmt.Errorf("failed to create llm.txt file: %w", err)
+     }
+     defer file.Close()
+
+     writer := bufio.NewWriter(file)
+
+     // Write header
+     writer.WriteString("# llm.txt - AI-friendly documentation index\n")
+     writer.WriteString("# This file helps LLMs quickly find relevant documentation sections\n\n")
+
+     for _, entry := range entries {
+         // Write entry in the format: [TYPE] Title
+         writer.WriteString(fmt.Sprintf("[%s] %s\n",
+             strings.ToUpper(entry.Type), entry.Title))
+         // Write URL
+         writer.WriteString(entry.URL + "\n")
+         // Write description
+         writer.WriteString(entry.Description + "\n\n")
+     }
+
+     // Flush explicitly so buffered-write errors are reported;
+     // a deferred Flush would silently discard them
+     return writer.Flush()
+ }
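
Given the writes above, each index entry occupies a three-line block: tag and title, URL, then description. A sketch of the resulting file, with illustrative entry values:

```
# llm.txt - AI-friendly documentation index
# This file helps LLMs quickly find relevant documentation sections

[GUIDE] Getting Started
https://example.com/docs/getting-started
How to install and configure the tool.
```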
@@ -0,0 +1,109 @@
+ package fetcher
+
+ import (
+     "fmt"
+     "net"
+     "net/url"
+     "path/filepath"
+     "strings"
+ )
+
+ // ValidateConfig validates the configuration for security issues
+ func ValidateConfig(config *Config) error {
+     if err := validateURL(config.BaseURL); err != nil {
+         return fmt.Errorf("invalid URL: %w", err)
+     }
+     if err := validateOutputPath(config.OutputPath); err != nil {
+         return fmt.Errorf("invalid output path: %w", err)
+     }
+     if config.MaxDepth > 10 {
+         return fmt.Errorf("max depth too high (maximum allowed: 10)")
+     }
+     if config.Workers > 20 {
+         return fmt.Errorf("too many concurrent workers (maximum allowed: 20)")
+     }
+     return nil
+ }
+
+ // validateURL checks if the URL is safe to fetch
+ func validateURL(urlStr string) error {
+     parsed, err := url.Parse(urlStr)
+     if err != nil {
+         return err
+     }
+
+     // Only allow HTTP and HTTPS
+     if parsed.Scheme != "http" && parsed.Scheme != "https" {
+         return fmt.Errorf("only HTTP and HTTPS URLs are allowed")
+     }
+
+     // Block private IP ranges and localhost
+     host := parsed.Hostname()
+     ip := net.ParseIP(host)
+     if ip != nil {
+         if ip.IsPrivate() || ip.IsLoopback() || ip.IsLinkLocalUnicast() || ip.IsMulticast() {
+             return fmt.Errorf("access to private/internal IP addresses is not allowed")
+         }
+     }
+
+     // Block dangerous hostnames
+     dangerousHosts := []string{"localhost", "127.0.0.1", "0.0.0.0", "::1"}
+     for _, dangerous := range dangerousHosts {
+         if strings.ToLower(host) == dangerous {
+             return fmt.Errorf("access to localhost is not allowed")
+         }
+     }
+
+     return nil
+ }
+
+ // validateOutputPath ensures the output path is safe
+ func validateOutputPath(path string) error {
+     // Don't allow absolute paths that start with /
+     if strings.HasPrefix(path, "/") {
+         return fmt.Errorf("absolute paths are not allowed")
+     }
+
+     // Don't allow paths that contain ..
+     if strings.Contains(path, "..") {
+         return fmt.Errorf("relative path traversal (..) is not allowed")
+     }
+
+     // Don't allow paths that contain ~
+     if strings.Contains(path, "~") {
+         return fmt.Errorf("home directory expansion (~) is not allowed")
+     }
+
+     // Resolve to an absolute path to check the final destination
+     absPath, err := filepath.Abs(path)
+     if err != nil {
+         return err
+     }
+
+     // Get current working directory
+     cwd, err := filepath.Abs(".")
+     if err != nil {
+         return err
+     }
+
+     // Ensure the resolved path stays within the working directory. The
+     // separator is appended so that a sibling directory sharing a name
+     // prefix (e.g. /home/user2 vs /home/user) does not pass the check.
+     if absPath != cwd && !strings.HasPrefix(absPath, cwd+string(filepath.Separator)) {
+         return fmt.Errorf("output path must be within the current working directory")
+     }
+
+     // Check file extension - only allow safe extensions. Note that
+     // filepath.Ext returns only the final extension, so "foo.llm.txt"
+     // matches via ".txt"; the ".llm.txt" entry is kept for documentation.
+     allowedExtensions := []string{".md", ".txt", ".llm.txt"}
+     ext := filepath.Ext(path)
+     isAllowed := false
+     for _, allowed := range allowedExtensions {
+         if ext == allowed {
+             isAllowed = true
+             break
+         }
+     }
+     if !isAllowed {
+         return fmt.Errorf("only .md, .txt, and .llm.txt file extensions are allowed")
+     }
+
+     return nil
+ }
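
To make the path rules concrete, a hypothetical in-package test sketch (not part of the published package) showing which inputs pass and which are rejected:

```go
package fetcher

import "testing"

// Illustrative examples only; validateOutputPath never touches the
// filesystem, so none of these paths need to exist.
func TestValidateOutputPathExamples(t *testing.T) {
	ok := []string{
		"docs/output.md", // relative, .md, resolves under the working directory
		"notes.llm.txt",  // filepath.Ext sees ".txt", which is allowed
	}
	bad := []string{
		"/tmp/out.md",  // absolute paths are rejected
		"../escape.md", // path traversal (..) is rejected
		"~/out.md",     // home-directory expansion (~) is rejected
		"dump.html",    // extension not in the allow list
	}

	for _, p := range ok {
		if err := validateOutputPath(p); err != nil {
			t.Errorf("expected %q to pass: %v", p, err)
		}
	}
	for _, p := range bad {
		if err := validateOutputPath(p); err == nil {
			t.Errorf("expected %q to be rejected", p)
		}
	}
}
```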
@@ -0,0 +1,32 @@
+ package fetcher
+
+ import (
+     "bufio"
+     "os"
+     "strings"
+ )
+
+ // writeResults writes the fetched documentation to the output file
+ func writeResults(outputPath string, resultsChan <-chan string) error {
+     file, err := os.Create(outputPath)
+     if err != nil {
+         return err
+     }
+     defer file.Close()
+
+     writer := bufio.NewWriter(file)
+
+     // Write header
+     header := "# Documentation\n\nThis file contains documentation fetched by DocFetch.\n\n---\n\n"
+     writer.WriteString(header)
+
+     // Write all results; the loop ends when resultsChan is closed
+     for result := range resultsChan {
+         if strings.TrimSpace(result) != "" {
+             writer.WriteString(result)
+         }
+     }
+
+     // Flush explicitly so buffered-write errors are reported
+     return writer.Flush()
+ }
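
Because writeResults blocks until the channel closes, Run can use it as the collection point: workers produce, writeResults drains. A minimal in-package sketch of that handoff (illustrative only; the file name is arbitrary):

```go
package fetcher

// exampleHandoff illustrates the producer/consumer pattern used in Run:
// a producer goroutine sends rendered sections and closes the channel;
// writeResults drains it and returns once the channel is closed.
func exampleHandoff() error {
	results := make(chan string, 4)
	go func() {
		defer close(results) // closing unblocks writeResults' range loop
		results <- "## Page One\n\ncontent\n\n---\n\n"
		results <- "## Page Two\n\ncontent\n\n---\n\n"
	}()
	return writeResults("example-output.md", results)
}
```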
package/pyproject.toml ADDED
@@ -0,0 +1,37 @@
+ [build-system]
+ requires = ["setuptools>=45", "wheel", "setuptools_scm[toml]>=6.2"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "doc-fetch"
+ version = "1.0.1"
+ description = "Dynamic documentation fetching CLI that converts entire documentation sites to single markdown files for AI/LLM consumption"
+ readme = "README.md"
+ authors = [{name = "AlphaTechini", email = "rehobothokoibu@gmail.com"}]
+ license = {text = "MIT"}
+ classifiers = [
+     "Development Status :: 5 - Production/Stable",
+     "Intended Audience :: Developers",
+     "License :: OSI Approved :: MIT License",
+     "Operating System :: OS Independent",
+     "Programming Language :: Python :: 3",
+     "Programming Language :: Python :: 3.8",
+     "Programming Language :: Python :: 3.9",
+     "Programming Language :: Python :: 3.10",
+     "Programming Language :: Python :: 3.11",
+     "Programming Language :: Python :: 3.12",
+     "Topic :: Documentation",
+     "Topic :: Software Development :: Documentation",
+     "Topic :: Utilities",
+ ]
+ keywords = ["documentation", "ai", "llm", "markdown", "crawler", "security"]
+ requires-python = ">=3.8"
+ dependencies = []
+
+ [project.urls]
+ Homepage = "https://github.com/AlphaTechini/doc-fetch"
+ Repository = "https://github.com/AlphaTechini/doc-fetch"
+ Documentation = "https://github.com/AlphaTechini/doc-fetch#readme"
+
+ [project.scripts]
+ doc-fetch = "doc_fetch.cli:main"