doc-fetch-cli 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +193 -0
- package/SECURITY.md +84 -0
- package/bin/doc-fetch.js +37 -0
- package/bin/install.js +171 -0
- package/cmd/docfetch/main.go +54 -0
- package/dist/doc_fetch-1.0.1-py3-none-any.whl +0 -0
- package/dist/doc_fetch-1.0.1.tar.gz +0 -0
- package/doc-fetch +0 -0
- package/doc-fetch_darwin_amd64 +0 -0
- package/doc-fetch_linux_amd64 +0 -0
- package/doc-fetch_windows_amd64.exe +0 -0
- package/doc_fetch/__init__.py +6 -0
- package/doc_fetch/__main__.py +7 -0
- package/doc_fetch/cli.py +113 -0
- package/doc_fetch.egg-info/PKG-INFO +224 -0
- package/doc_fetch.egg-info/SOURCES.txt +19 -0
- package/doc_fetch.egg-info/dependency_links.txt +1 -0
- package/doc_fetch.egg-info/entry_points.txt +2 -0
- package/doc_fetch.egg-info/not-zip-safe +1 -0
- package/doc_fetch.egg-info/top_level.txt +1 -0
- package/docs/usage.md +67 -0
- package/examples/golang-example.sh +12 -0
- package/go.mod +11 -0
- package/go.sum +38 -0
- package/package.json +18 -0
- package/pkg/fetcher/classifier.go +50 -0
- package/pkg/fetcher/describer.go +61 -0
- package/pkg/fetcher/fetcher.go +332 -0
- package/pkg/fetcher/html2md.go +71 -0
- package/pkg/fetcher/llmtxt.go +36 -0
- package/pkg/fetcher/validator.go +109 -0
- package/pkg/fetcher/writer.go +32 -0
- package/pyproject.toml +37 -0
- package/setup.py +158 -0
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
package fetcher
|
|
2
|
+
|
|
3
|
+
import (
|
|
4
|
+
"fmt"
|
|
5
|
+
"log"
|
|
6
|
+
"net"
|
|
7
|
+
"net/http"
|
|
8
|
+
"net/url"
|
|
9
|
+
"strings"
|
|
10
|
+
"sync"
|
|
11
|
+
"time"
|
|
12
|
+
|
|
13
|
+
"github.com/PuerkitoBio/goquery"
|
|
14
|
+
"golang.org/x/net/html"
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
// Config holds the configuration for the documentation fetcher
type Config struct {
	BaseURL        string // root URL to start fetching from (HTTP/HTTPS only; validated by isValidURL)
	OutputPath     string // markdown file the fetched documentation is written to (.md/.txt only)
	MaxDepth       int    // maximum crawl depth; capped at 10, defaulted to 2 by validateConfig
	Workers        int    // number of concurrent fetch goroutines; capped at 20, defaulted to 3
	UserAgent      string // User-Agent header sent with every HTTP request
	GenerateLLMTxt bool   // when true, also emit an AI-friendly index next to OutputPath (".md" suffix swapped for ".llm.txt")
}
|
|
26
|
+
|
|
27
|
+
// Page represents a fetched documentation page
type Page struct {
	URL     string   // absolute URL the page is fetched from
	Title   string   // page <title> text; the worker falls back to the URL when empty
	Content string   // extracted text content (not populated by the current worker, which streams results directly)
	Links   []string // outgoing links, reserved for future recursive crawling — not populated yet
}
|
|
34
|
+
|
|
35
|
+
// LLMTxtEntry represents an entry in the llm.txt file
type LLMTxtEntry struct {
	Type        string // page classification from ClassifyPage; uppercased when written out
	Title       string // cleaned page title (via CleanTitle)
	URL         string // source URL of the documented page
	Description string // short description extracted from the page content (via ExtractDescription)
}
|
|
42
|
+
|
|
43
|
+
// Run executes the documentation fetching process
|
|
44
|
+
func Run(config Config) error {
|
|
45
|
+
// Validate configuration
|
|
46
|
+
if err := validateConfig(&config); err != nil {
|
|
47
|
+
return fmt.Errorf("invalid configuration: %w", err)
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
log.Printf("Starting documentation fetch from: %s", config.BaseURL)
|
|
51
|
+
|
|
52
|
+
// Create a visited map to avoid duplicate fetching
|
|
53
|
+
visited := make(map[string]bool)
|
|
54
|
+
var mutex sync.Mutex
|
|
55
|
+
|
|
56
|
+
// Create channel for pages and results
|
|
57
|
+
pagesChan := make(chan *Page, config.Workers*2)
|
|
58
|
+
resultsChan := make(chan string, config.Workers*2)
|
|
59
|
+
var llmEntries []LLMTxtEntry
|
|
60
|
+
|
|
61
|
+
// Start worker goroutines
|
|
62
|
+
var wg sync.WaitGroup
|
|
63
|
+
for i := 0; i < config.Workers; i++ {
|
|
64
|
+
wg.Add(1)
|
|
65
|
+
go func() {
|
|
66
|
+
defer wg.Done()
|
|
67
|
+
worker(config, pagesChan, resultsChan, &mutex, visited, &llmEntries)
|
|
68
|
+
}()
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
// Start the initial fetch
|
|
72
|
+
pagesChan <- &Page{URL: config.BaseURL, Title: "Root"}
|
|
73
|
+
|
|
74
|
+
// Close pages channel when all workers are done
|
|
75
|
+
go func() {
|
|
76
|
+
wg.Wait()
|
|
77
|
+
close(pagesChan)
|
|
78
|
+
close(resultsChan)
|
|
79
|
+
}()
|
|
80
|
+
|
|
81
|
+
// Collect results and write to file
|
|
82
|
+
err := writeResults(config.OutputPath, resultsChan)
|
|
83
|
+
if err != nil {
|
|
84
|
+
return err
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
// Generate LLM.txt if requested
|
|
88
|
+
if config.GenerateLLMTxt {
|
|
89
|
+
llmTxtPath := strings.TrimSuffix(config.OutputPath, ".md") + ".llm.txt"
|
|
90
|
+
err = GenerateLLMTxt(llmEntries, llmTxtPath)
|
|
91
|
+
if err != nil {
|
|
92
|
+
log.Printf("Warning: Failed to generate llm.txt: %v", err)
|
|
93
|
+
} else {
|
|
94
|
+
log.Printf("LLM.txt generated: %s", llmTxtPath)
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
return nil
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
// worker processes pages from the channel
|
|
102
|
+
func worker(config Config, pagesChan <-chan *Page, resultsChan chan<- string, mutex *sync.Mutex, visited map[string]bool, llmEntries *[]LLMTxtEntry) {
|
|
103
|
+
client := &http.Client{
|
|
104
|
+
Timeout: 30 * time.Second,
|
|
105
|
+
// Add transport with security restrictions
|
|
106
|
+
Transport: &http.Transport{
|
|
107
|
+
DisableKeepAlives: true,
|
|
108
|
+
},
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
for page := range pagesChan {
|
|
112
|
+
// Validate URL before fetching
|
|
113
|
+
if err := isValidURL(page.URL); err != nil {
|
|
114
|
+
log.Printf("Skipping invalid URL %s: %v", page.URL, err)
|
|
115
|
+
continue
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
mutex.Lock()
|
|
119
|
+
if visited[page.URL] {
|
|
120
|
+
mutex.Unlock()
|
|
121
|
+
continue
|
|
122
|
+
}
|
|
123
|
+
visited[page.URL] = true
|
|
124
|
+
mutex.Unlock()
|
|
125
|
+
|
|
126
|
+
log.Printf("Fetching: %s", page.URL)
|
|
127
|
+
|
|
128
|
+
// Rate limiting - be respectful to servers
|
|
129
|
+
time.Sleep(100 * time.Millisecond)
|
|
130
|
+
|
|
131
|
+
// Fetch the page
|
|
132
|
+
req, err := http.NewRequest("GET", page.URL, nil)
|
|
133
|
+
if err != nil {
|
|
134
|
+
log.Printf("Error creating request for %s: %v", page.URL, err)
|
|
135
|
+
continue
|
|
136
|
+
}
|
|
137
|
+
req.Header.Set("User-Agent", config.UserAgent)
|
|
138
|
+
|
|
139
|
+
resp, err := client.Do(req)
|
|
140
|
+
if err != nil {
|
|
141
|
+
log.Printf("Error fetching %s: %v", page.URL, err)
|
|
142
|
+
continue
|
|
143
|
+
}
|
|
144
|
+
defer resp.Body.Close()
|
|
145
|
+
|
|
146
|
+
if resp.StatusCode != 200 {
|
|
147
|
+
log.Printf("Non-200 status code %d for %s", resp.StatusCode, page.URL)
|
|
148
|
+
continue
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
// Parse HTML
|
|
152
|
+
doc, err := goquery.NewDocumentFromReader(resp.Body)
|
|
153
|
+
if err != nil {
|
|
154
|
+
log.Printf("Error parsing HTML for %s: %v", page.URL, err)
|
|
155
|
+
continue
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
// Extract title
|
|
159
|
+
title := doc.Find("title").Text()
|
|
160
|
+
if title == "" {
|
|
161
|
+
title = page.URL
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
// Clean and extract content
|
|
165
|
+
content := cleanContent(doc)
|
|
166
|
+
if content == "" {
|
|
167
|
+
log.Printf("No content found for %s", page.URL)
|
|
168
|
+
continue
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
// Send result to output
|
|
172
|
+
resultsChan <- fmt.Sprintf("## %s\n\n%s\n\n---\n\n", title, content)
|
|
173
|
+
|
|
174
|
+
// Generate LLM.txt entry if requested
|
|
175
|
+
if config.GenerateLLMTxt {
|
|
176
|
+
cleanTitle := CleanTitle(title)
|
|
177
|
+
entryType := ClassifyPage(page.URL, cleanTitle)
|
|
178
|
+
description := ExtractDescription(content)
|
|
179
|
+
|
|
180
|
+
entry := LLMTxtEntry{
|
|
181
|
+
Type: entryType,
|
|
182
|
+
Title: cleanTitle,
|
|
183
|
+
URL: page.URL,
|
|
184
|
+
Description: description,
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
mutex.Lock()
|
|
188
|
+
*llmEntries = append(*llmEntries, entry)
|
|
189
|
+
mutex.Unlock()
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
// Extract links for further crawling (limited depth logic would go here)
|
|
193
|
+
// For MVP, we'll just fetch the main page
|
|
194
|
+
// Future: implement link extraction and recursive crawling
|
|
195
|
+
}
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
// cleanContent extracts and cleans the main documentation content
|
|
199
|
+
func cleanContent(doc *goquery.Document) string {
|
|
200
|
+
// Common selectors for documentation content
|
|
201
|
+
selectors := []string{
|
|
202
|
+
"main",
|
|
203
|
+
"article",
|
|
204
|
+
".content",
|
|
205
|
+
".docs-content",
|
|
206
|
+
"#main-content",
|
|
207
|
+
".documentation",
|
|
208
|
+
".post-content",
|
|
209
|
+
".markdown-body",
|
|
210
|
+
".content-wrapper",
|
|
211
|
+
".doc-content",
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
// Try each selector
|
|
215
|
+
for _, selector := range selectors {
|
|
216
|
+
if el := doc.Find(selector); el.Length() > 0 {
|
|
217
|
+
// Remove unwanted elements
|
|
218
|
+
el.Find("nav, header, footer, .sidebar, .toc, .navigation, script, style, .ad, .advertisement").Remove()
|
|
219
|
+
|
|
220
|
+
// Convert to HTML and then clean
|
|
221
|
+
htmlContent, err := el.Html()
|
|
222
|
+
if err != nil {
|
|
223
|
+
continue
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
// Basic HTML cleaning
|
|
227
|
+
cleaned := cleanHTML(htmlContent)
|
|
228
|
+
if cleaned != "" {
|
|
229
|
+
return cleaned
|
|
230
|
+
}
|
|
231
|
+
}
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
// Fallback: try to get body content
|
|
235
|
+
body := doc.Find("body")
|
|
236
|
+
if body.Length() > 0 {
|
|
237
|
+
body.Find("nav, header, footer, .sidebar, .toc, .navigation, script, style, .ad, .advertisement").Remove()
|
|
238
|
+
htmlContent, _ := body.Html()
|
|
239
|
+
return cleanHTML(htmlContent)
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
return ""
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
// cleanHTML performs basic HTML cleaning
|
|
246
|
+
func cleanHTML(htmlStr string) string {
|
|
247
|
+
// Parse and extract text content while preserving structure
|
|
248
|
+
doc, err := html.Parse(strings.NewReader(htmlStr))
|
|
249
|
+
if err != nil {
|
|
250
|
+
return ""
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
var texts []string
|
|
254
|
+
var extractText func(*html.Node)
|
|
255
|
+
extractText = func(n *html.Node) {
|
|
256
|
+
if n.Type == html.TextNode {
|
|
257
|
+
text := strings.TrimSpace(n.Data)
|
|
258
|
+
if text != "" {
|
|
259
|
+
texts = append(texts, text)
|
|
260
|
+
}
|
|
261
|
+
}
|
|
262
|
+
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
263
|
+
extractText(c)
|
|
264
|
+
}
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
extractText(doc)
|
|
268
|
+
return strings.Join(texts, "\n")
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
// isValidURL validates that a URL is safe to fetch: it must be an absolute
// HTTP/HTTPS URL with a host, and must not point at private, loopback,
// link-local, multicast, or unspecified addresses, nor at well-known local
// hostnames. Returns nil when the URL is acceptable.
func isValidURL(urlStr string) error {
	parsed, err := url.Parse(urlStr)
	if err != nil {
		return fmt.Errorf("invalid URL format: %w", err)
	}

	// Only allow HTTP/HTTPS
	if parsed.Scheme != "http" && parsed.Scheme != "https" {
		return fmt.Errorf("only HTTP/HTTPS URLs allowed")
	}

	// A scheme without a host (e.g. "http://") is not fetchable.
	host := parsed.Hostname()
	if host == "" {
		return fmt.Errorf("URL has no host")
	}

	// Block private IP ranges when the host is a literal address.
	// IsUnspecified additionally rejects "0.0.0.0" and "::".
	if ip := net.ParseIP(host); ip != nil {
		if ip.IsPrivate() || ip.IsLoopback() || ip.IsLinkLocalUnicast() || ip.IsMulticast() || ip.IsUnspecified() {
			return fmt.Errorf("private/internal IP addresses not allowed")
		}
	}

	// Block dangerous hostnames. BUG FIX: the comparison is now
	// case-insensitive (matching validateURL in validator.go); previously
	// "http://LOCALHOST" slipped through the exact-match check.
	dangerousHosts := []string{"localhost", "127.0.0.1", "0.0.0.0", "::1"}
	for _, dangerous := range dangerousHosts {
		if strings.EqualFold(host, dangerous) {
			return fmt.Errorf("local hostnames not allowed")
		}
	}

	return nil
}
|
|
302
|
+
|
|
303
|
+
// validateConfig validates the entire configuration
|
|
304
|
+
func validateConfig(config *Config) error {
|
|
305
|
+
if err := validateOutputPath(config.OutputPath); err != nil {
|
|
306
|
+
return fmt.Errorf("output path validation failed: %w", err)
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
if err := isValidURL(config.BaseURL); err != nil {
|
|
310
|
+
return fmt.Errorf("base URL validation failed: %w", err)
|
|
311
|
+
}
|
|
312
|
+
|
|
313
|
+
// Limit depth to prevent excessive crawling
|
|
314
|
+
if config.MaxDepth > 10 {
|
|
315
|
+
return fmt.Errorf("max depth cannot exceed 10")
|
|
316
|
+
}
|
|
317
|
+
|
|
318
|
+
// Limit workers to prevent resource exhaustion
|
|
319
|
+
if config.Workers > 20 {
|
|
320
|
+
return fmt.Errorf("concurrent workers cannot exceed 20")
|
|
321
|
+
}
|
|
322
|
+
|
|
323
|
+
// Ensure reasonable timeout values
|
|
324
|
+
if config.Workers <= 0 {
|
|
325
|
+
config.Workers = 3 // Default
|
|
326
|
+
}
|
|
327
|
+
if config.MaxDepth <= 0 {
|
|
328
|
+
config.MaxDepth = 2 // Default
|
|
329
|
+
}
|
|
330
|
+
|
|
331
|
+
return nil
|
|
332
|
+
}
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
package fetcher
|
|
2
|
+
|
|
3
|
+
import (
|
|
4
|
+
"strings"
|
|
5
|
+
)
|
|
6
|
+
|
|
7
|
+
// ConvertHTMLToMarkdown converts HTML content to clean markdown
|
|
8
|
+
func ConvertHTMLToMarkdown(htmlContent string) string {
|
|
9
|
+
if htmlContent == "" {
|
|
10
|
+
return ""
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
// Basic HTML to markdown conversion
|
|
14
|
+
markdownContent := basicHTMLToMarkdown(htmlContent)
|
|
15
|
+
|
|
16
|
+
return strings.TrimSpace(markdownContent)
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
// basicHTMLToMarkdown provides basic HTML to markdown conversion by literal
// tag substitution. It handles headings, paragraphs, line breaks, emphasis,
// inline and block code, lists, and blockquotes; tags carrying attributes
// (e.g. <p class="x">) are NOT recognized. The result has runs of blank
// lines collapsed and surrounding whitespace trimmed.
func basicHTMLToMarkdown(html string) string {
	// A single-pass Replacer is deterministic; the original ranged over a
	// map, which iterates in random order.
	replacer := strings.NewReplacer(
		"<h1>", "# ", "</h1>", "\n\n",
		"<h2>", "## ", "</h2>", "\n\n",
		"<h3>", "### ", "</h3>", "\n\n",
		"<h4>", "#### ", "</h4>", "\n\n",
		"<h5>", "##### ", "</h5>", "\n\n",
		"<h6>", "###### ", "</h6>", "\n\n",
		"<p>", "", "</p>", "\n\n",
		"<br>", "\n", "<br/>", "\n",
		"<strong>", "**", "</strong>", "**",
		"<b>", "**", "</b>", "**",
		"<em>", "*", "</em>", "*",
		"<i>", "*", "</i>", "*",
		"<code>", "`", "</code>", "`",
		"<pre>", "```", "</pre>", "```",
		"<ul>", "", "</ul>", "\n",
		"<ol>", "", "</ol>", "\n",
		"<li>", "- ", "</li>", "\n",
		"<blockquote>", "> ", "</blockquote>", "\n\n",
	)
	result := replacer.Replace(html)

	// Collapse runs of 3+ newlines down to exactly two. BUG FIX: the
	// original performed a single ReplaceAll pass, which left triple
	// newlines behind whenever four or more appeared in a row.
	for strings.Contains(result, "\n\n\n") {
		result = strings.ReplaceAll(result, "\n\n\n", "\n\n")
	}

	return strings.TrimSpace(result)
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
package fetcher
|
|
2
|
+
|
|
3
|
+
import (
|
|
4
|
+
"bufio"
|
|
5
|
+
"fmt"
|
|
6
|
+
"os"
|
|
7
|
+
"strings"
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
// GenerateLLMTxt creates an llm.txt file with AI-friendly documentation index
|
|
11
|
+
func GenerateLLMTxt(entries []LLMTxtEntry, outputPath string) error {
|
|
12
|
+
file, err := os.Create(outputPath)
|
|
13
|
+
if err != nil {
|
|
14
|
+
return fmt.Errorf("failed to create llm.txt file: %w", err)
|
|
15
|
+
}
|
|
16
|
+
defer file.Close()
|
|
17
|
+
|
|
18
|
+
writer := bufio.NewWriter(file)
|
|
19
|
+
defer writer.Flush()
|
|
20
|
+
|
|
21
|
+
// Write header
|
|
22
|
+
writer.WriteString("# llm.txt - AI-friendly documentation index\n")
|
|
23
|
+
writer.WriteString("# This file helps LLMs quickly find relevant documentation sections\n\n")
|
|
24
|
+
|
|
25
|
+
for _, entry := range entries {
|
|
26
|
+
// Write entry in the format: [TYPE] Title
|
|
27
|
+
writer.WriteString(fmt.Sprintf("[%s] %s\n",
|
|
28
|
+
strings.ToUpper(entry.Type), entry.Title))
|
|
29
|
+
// Write URL
|
|
30
|
+
writer.WriteString(entry.URL + "\n")
|
|
31
|
+
// Write description
|
|
32
|
+
writer.WriteString(entry.Description + "\n\n")
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
return nil
|
|
36
|
+
}
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
package fetcher
|
|
2
|
+
|
|
3
|
+
import (
|
|
4
|
+
"fmt"
|
|
5
|
+
"net"
|
|
6
|
+
"net/url"
|
|
7
|
+
"path/filepath"
|
|
8
|
+
"strings"
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
// ValidateConfig validates the configuration for security issues
|
|
12
|
+
func ValidateConfig(config *Config) error {
|
|
13
|
+
if err := validateURL(config.BaseURL); err != nil {
|
|
14
|
+
return fmt.Errorf("invalid URL: %w", err)
|
|
15
|
+
}
|
|
16
|
+
if err := validateOutputPath(config.OutputPath); err != nil {
|
|
17
|
+
return fmt.Errorf("invalid output path: %w", err)
|
|
18
|
+
}
|
|
19
|
+
if config.MaxDepth > 10 {
|
|
20
|
+
return fmt.Errorf("max depth too high (maximum allowed: 10)")
|
|
21
|
+
}
|
|
22
|
+
if config.Workers > 20 {
|
|
23
|
+
return fmt.Errorf("too many concurrent workers (maximum allowed: 20)")
|
|
24
|
+
}
|
|
25
|
+
return nil
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
// validateURL checks whether urlStr is safe to fetch: it must be an
// HTTP/HTTPS URL that does not target private, loopback, link-local, or
// multicast IP addresses, nor a well-known local hostname. Returns nil when
// every check passes.
func validateURL(urlStr string) error {
	parsed, err := url.Parse(urlStr)
	if err != nil {
		return err
	}

	// Reject anything that is not plain web traffic.
	switch parsed.Scheme {
	case "http", "https":
		// allowed
	default:
		return fmt.Errorf("only HTTP and HTTPS URLs are allowed")
	}

	host := parsed.Hostname()

	// Literal IP addresses must be publicly routable.
	if ip := net.ParseIP(host); ip != nil {
		blockedIP := ip.IsPrivate() || ip.IsLoopback() || ip.IsLinkLocalUnicast() || ip.IsMulticast()
		if blockedIP {
			return fmt.Errorf("access to private/internal IP addresses is not allowed")
		}
	}

	// Reject well-known local hostnames, case-insensitively.
	lowered := strings.ToLower(host)
	for _, blocked := range []string{"localhost", "127.0.0.1", "0.0.0.0", "::1"} {
		if lowered == blocked {
			return fmt.Errorf("access to localhost is not allowed")
		}
	}

	return nil
}
|
|
59
|
+
|
|
60
|
+
// validateOutputPath ensures the output path is safe: relative, free of
// traversal ("..") and home expansion ("~"), resolving inside the current
// working directory, and carrying an allowed documentation extension.
func validateOutputPath(path string) error {
	// Don't allow absolute paths that start with /
	if strings.HasPrefix(path, "/") {
		return fmt.Errorf("absolute paths are not allowed")
	}

	// Don't allow paths that contain ..
	if strings.Contains(path, "..") {
		return fmt.Errorf("relative path traversal (..) is not allowed")
	}

	// Don't allow paths that contain ~
	if strings.Contains(path, "~") {
		return fmt.Errorf("home directory expansion (~) is not allowed")
	}

	// Resolve to absolute path to check final destination
	absPath, err := filepath.Abs(path)
	if err != nil {
		return err
	}

	// Get current working directory
	cwd, err := filepath.Abs(".")
	if err != nil {
		return err
	}

	// Ensure the resolved path stays inside the working directory.
	// BUG FIX: a bare strings.HasPrefix(absPath, cwd) also accepted sibling
	// directories whose names merely start with cwd (e.g. "/home/user-x"
	// when cwd is "/home/user"); require an exact match or a
	// path-separator boundary.
	if absPath != cwd && !strings.HasPrefix(absPath, cwd+string(filepath.Separator)) {
		return fmt.Errorf("output path must be within the current working directory")
	}

	// Allow only documentation-style extensions. Note filepath.Ext of
	// "x.llm.txt" is ".txt", so ".llm.txt" names are accepted through the
	// ".txt" case (the original's ".llm.txt" list entry was dead code).
	switch filepath.Ext(path) {
	case ".md", ".txt":
		return nil
	default:
		return fmt.Errorf("only .md, .txt, and .llm.txt file extensions are allowed")
	}
}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
package fetcher
|
|
2
|
+
|
|
3
|
+
import (
|
|
4
|
+
"bufio"
|
|
5
|
+
"os"
|
|
6
|
+
"strings"
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
// writeResults drains resultsChan and writes each non-blank chunk to a new
// file at outputPath, preceded by a fixed markdown header. It blocks until
// resultsChan is closed and returns any file-creation or flush error.
func writeResults(outputPath string, resultsChan <-chan string) error {
	file, err := os.Create(outputPath)
	if err != nil {
		return err
	}
	defer file.Close()

	writer := bufio.NewWriter(file)

	// Write header
	writer.WriteString("# Documentation\n\nThis file contains documentation fetched by DocFetch.\n\n---\n\n")

	// Append every non-blank result in arrival order.
	for result := range resultsChan {
		if strings.TrimSpace(result) != "" {
			writer.WriteString(result)
		}
	}

	// BUG FIX: flush explicitly and return its error. The original used
	// "defer writer.Flush()", which discarded write failures (full disk,
	// revoked permissions) and reported success with a truncated file.
	return writer.Flush()
}
|
package/pyproject.toml
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=45", "wheel", "setuptools_scm[toml]>=6.2"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "doc-fetch"
|
|
7
|
+
version = "1.0.1"
|
|
8
|
+
description = "Dynamic documentation fetching CLI that converts entire documentation sites to single markdown files for AI/LLM consumption"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
authors = [{name = "AlphaTechini", email = "rehobothokoibu@gmail.com"}]
|
|
11
|
+
license = {text = "MIT"}
|
|
12
|
+
classifiers = [
|
|
13
|
+
"Development Status :: 5 - Production/Stable",
|
|
14
|
+
"Intended Audience :: Developers",
|
|
15
|
+
"License :: OSI Approved :: MIT License",
|
|
16
|
+
"Operating System :: OS Independent",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Programming Language :: Python :: 3.8",
|
|
19
|
+
"Programming Language :: Python :: 3.9",
|
|
20
|
+
"Programming Language :: Python :: 3.10",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Topic :: Documentation",
|
|
24
|
+
"Topic :: Software Development :: Documentation",
|
|
25
|
+
"Topic :: Utilities",
|
|
26
|
+
]
|
|
27
|
+
keywords = ["documentation", "ai", "llm", "markdown", "crawler", "security"]
|
|
28
|
+
requires-python = ">=3.8"
|
|
29
|
+
dependencies = []
|
|
30
|
+
|
|
31
|
+
[project.urls]
|
|
32
|
+
Homepage = "https://github.com/AlphaTechini/doc-fetch"
|
|
33
|
+
Repository = "https://github.com/AlphaTechini/doc-fetch"
|
|
34
|
+
Documentation = "https://github.com/AlphaTechini/doc-fetch#readme"
|
|
35
|
+
|
|
36
|
+
[project.scripts]
|
|
37
|
+
doc-fetch = "doc_fetch.cli:main"
|