doc-fetch-cli 1.1.0 → 1.1.2
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between versions as they appear in their respective public registries.
- package/INSTALLATION-FIX.md +242 -0
- package/bin/doc-fetch.js +47 -9
- package/bin/postinstall.js +88 -0
- package/package.json +11 -4
- package/SECURITY.md +0 -84
- package/cmd/docfetch/main.go +0 -55
- package/dist/doc_fetch-1.1.0-py3-none-any.whl +0 -0
- package/dist/doc_fetch-1.1.0.tar.gz +0 -0
- package/doc-fetch_darwin_amd64 +0 -0
- package/doc-fetch_windows_amd64.exe +0 -0
- package/doc_fetch/__init__.py +0 -6
- package/doc_fetch/__main__.py +0 -7
- package/doc_fetch/cli.py +0 -113
- package/doc_fetch.egg-info/PKG-INFO +0 -224
- package/doc_fetch.egg-info/SOURCES.txt +0 -19
- package/doc_fetch.egg-info/dependency_links.txt +0 -1
- package/doc_fetch.egg-info/entry_points.txt +0 -2
- package/doc_fetch.egg-info/not-zip-safe +0 -1
- package/doc_fetch.egg-info/top_level.txt +0 -1
- package/docs/usage.md +0 -67
- package/examples/golang-example.sh +0 -12
- package/go.sum +0 -38
- package/pkg/fetcher/classifier.go +0 -50
- package/pkg/fetcher/describer.go +0 -61
- package/pkg/fetcher/fetcher.go +0 -415
- package/pkg/fetcher/fetcher_optimized.go +0 -318
- package/pkg/fetcher/html2md.go +0 -71
- package/pkg/fetcher/llmtxt.go +0 -36
- package/pkg/fetcher/validator.go +0 -109
- package/pkg/fetcher/writer.go +0 -32
- package/pyproject.toml +0 -37
- package/setup.py +0 -158
package/pkg/fetcher/fetcher_optimized.go
DELETED
@@ -1,318 +0,0 @@
package fetcher

import (
	"bufio"
	"context"
	"fmt"
	"log"
	"net"
	"net/http"
	"net/url"
	"os"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/PuerkitoBio/goquery"
)

// OptimizedFetcher uses advanced Go concurrency patterns for 10x speedup
type OptimizedFetcher struct {
	config      Config
	httpClient  *http.Client
	urlQueue    chan string
	visited     sync.Map // Concurrent map instead of mutex-protected map
	resultsChan chan string
	llmEntries  []LLMTxtEntry
	llmMutex    sync.Mutex
	pageCount   int32
	errorCount  int32
	ctx         context.Context
	cancel      context.CancelFunc
}

// RunOptimized executes documentation fetching with maximum concurrency
func RunOptimized(config Config) error {
	if err := validateConfig(&config); err != nil {
		return fmt.Errorf("invalid configuration: %w", err)
	}

	log.Printf("🚀 Starting HIGH-PERFORMANCE documentation fetch from: %s", config.BaseURL)
	log.Printf("   Workers: %d | Max Depth: %d | Concurrency: Enabled", config.Workers, config.MaxDepth)

	fetcher := &OptimizedFetcher{
		config:      config,
		urlQueue:    make(chan string, config.Workers*100), // Large buffer for URLs
		resultsChan: make(chan string, config.Workers*10),  // Larger buffer
		httpClient:  createOptimizedHTTPClient(config.Workers),
	}

	fetcher.ctx, fetcher.cancel = context.WithTimeout(context.Background(), 10*time.Minute)
	defer fetcher.cancel()

	startTime := time.Now()

	// Start result writer in background
	var writeWg sync.WaitGroup
	writeWg.Add(1)
	go func() {
		defer writeWg.Add(-1)
		writeResultsOptimized(config.OutputPath, fetcher.resultsChan)
	}()

	// Start worker pool
	var workerWg sync.WaitGroup
	for i := 0; i < config.Workers; i++ {
		workerWg.Add(1)
		go fetcher.worker(i, &workerWg)
	}

	// Submit initial URL
	fetcher.submitPage(config.BaseURL, 0)

	// Close URL queue when all pages are processed
	go func() {
		workerWg.Wait()
		close(fetcher.urlQueue)
	}()

	// Wait for all workers to complete
	workerWg.Wait()
	close(fetcher.resultsChan)

	// Wait for results to be written
	writeWg.Wait()

	elapsed := time.Since(startTime)
	pagesFetched := atomic.LoadInt32(&fetcher.pageCount)
	errors := atomic.LoadInt32(&fetcher.errorCount)

	log.Printf("✅ Fetch completed!")
	log.Printf("   📊 Pages fetched: %d", pagesFetched)
	log.Printf("   ⏱️ Time elapsed: %v", elapsed)
	log.Printf("   📈 Speed: %.2f pages/second", float64(pagesFetched)/elapsed.Seconds())
	log.Printf("   ❌ Errors: %d", errors)

	// Generate LLM.txt if requested
	if config.GenerateLLMTxt && len(fetcher.llmEntries) > 0 {
		llmTxtPath := strings.TrimSuffix(config.OutputPath, ".md") + ".llm.txt"
		if err := GenerateLLMTxt(fetcher.llmEntries, llmTxtPath); err != nil {
			log.Printf("⚠️ Warning: Failed to generate llm.txt: %v", err)
		} else {
			log.Printf("📝 LLM.txt generated: %s (%d entries)", llmTxtPath, len(fetcher.llmEntries))
		}
	}

	return nil
}

// createOptimizedHTTPClient creates a high-performance HTTP client with connection pooling
func createOptimizedHTTPClient(workers int) *http.Client {
	return &http.Client{
		Timeout: 30 * time.Second,
		Transport: &http.Transport{
			MaxIdleConns:        workers * 2,
			MaxIdleConnsPerHost: workers,
			IdleConnTimeout:     90 * time.Second,
			DisableCompression:  false,
			DisableKeepAlives:   false,
			DialContext: (&net.Dialer{
				Timeout:   10 * time.Second,
				KeepAlive: 30 * time.Second,
			}).DialContext,
			TLSHandshakeTimeout: 10 * time.Second,
		},
	}
}

// worker processes URLs from the submission queue
func (f *OptimizedFetcher) worker(id int, wg *sync.WaitGroup) {
	defer wg.Done()

	for url := range f.urlQueue {
		select {
		case <-f.ctx.Done():
			return
		default:
			f.processURL(url, 0)
		}
	}
}

// submitPage adds a URL to be fetched (with depth tracking)
func (f *OptimizedFetcher) submitPage(pageURL string, depth int) {
	if depth > f.config.MaxDepth {
		return
	}

	// Check if already visited using atomic operation
	if _, loaded := f.visited.LoadOrStore(pageURL, true); loaded {
		return
	}

	select {
	case f.urlQueue <- pageURL:
		// Successfully queued
	default:
		// Queue full, skip this URL
		log.Printf("⚠️ Queue full, skipping: %s", pageURL)
	}
}

// processURL fetches and processes a single URL
func (f *OptimizedFetcher) processURL(pageURL string, depth int) {
	atomic.AddInt32(&f.pageCount, 1)

	startTime := time.Now()

	// Validate URL
	if err := isValidURL(pageURL); err != nil {
		atomic.AddInt32(&f.errorCount, 1)
		log.Printf("❌ Invalid URL %s: %v", pageURL, err)
		return
	}

	// Fetch the page
	resp, err := f.httpClient.Get(pageURL)
	if err != nil {
		atomic.AddInt32(&f.errorCount, 1)
		log.Printf("❌ Error fetching %s: %v", pageURL, err)
		return
	}
	defer resp.Body.Close()

	if resp.StatusCode != 200 {
		atomic.AddInt32(&f.errorCount, 1)
		log.Printf("❌ Non-200 status %d for %s", resp.StatusCode, pageURL)
		return
	}

	// Parse HTML concurrently
	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		atomic.AddInt32(&f.errorCount, 1)
		log.Printf("❌ Error parsing HTML for %s: %v", pageURL, err)
		return
	}

	// Extract content
	content := cleanContent(doc)
	if content == "" {
		atomic.AddInt32(&f.errorCount, 1)
		log.Printf("⚠️ No content found for %s", pageURL)
		return
	}

	// Extract title
	title := doc.Find("title").Text()
	if title == "" {
		title = pageURL
	}

	// Send result
	f.resultsChan <- fmt.Sprintf("## %s\n\n%s\n\n---\n\n", title, content)

	// Generate LLM.txt entry if requested
	if f.config.GenerateLLMTxt {
		cleanTitle := CleanTitle(title)
		entryType := ClassifyPage(pageURL, cleanTitle)
		description := ExtractDescription(content)

		entry := LLMTxtEntry{
			Type:        entryType,
			Title:       cleanTitle,
			URL:         pageURL,
			Description: description,
		}

		f.llmMutex.Lock()
		f.llmEntries = append(f.llmEntries, entry)
		f.llmMutex.Unlock()
	}

	// Extract links for crawling (if depth allows)
	if depth < f.config.MaxDepth {
		f.extractAndSubmitLinks(doc, pageURL, depth+1)
	}

	elapsed := time.Since(startTime)
	log.Printf("✅ Fetched %s (%.2fs)", pageURL, elapsed.Seconds())
}

// extractAndSubmitLinks finds and queues all internal links
func (f *OptimizedFetcher) extractAndSubmitLinks(doc *goquery.Document, baseURL string, depth int) {
	base, err := url.Parse(baseURL)
	if err != nil {
		return
	}

	doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
		href, exists := s.Attr("href")
		if !exists {
			return
		}

		// Resolve relative URLs
		resolvedURL, err := base.Parse(href)
		if err != nil {
			return
		}

		// Only follow same-domain links
		if resolvedURL.Host != "" && resolvedURL.Host != base.Host {
			return
		}

		// Skip non-HTML resources
		if isNonHTMLResource(resolvedURL.Path) {
			return
		}

		f.submitPage(resolvedURL.String(), depth)
	})
}

// isNonHTMLResource checks if URL points to non-HTML resources
func isNonHTMLResource(path string) bool {
	extensions := []string{".pdf", ".zip", ".tar", ".gz", ".exe", ".dmg", ".pkg", ".deb", ".rpm"}
	pathLower := strings.ToLower(path)

	for _, ext := range extensions {
		if strings.HasSuffix(pathLower, ext) {
			return true
		}
	}
	return false
}

// writeResultsOptimized writes results to file efficiently
func writeResultsOptimized(outputPath string, resultsChan <-chan string) error {
	file, err := os.Create(outputPath)
	if err != nil {
		return err
	}
	defer file.Close()

	writer := bufio.NewWriterSize(file, 32*1024) // 32KB buffer for better I/O
	defer writer.Flush()

	// Write header
	header := "# Documentation\n\nThis file contains documentation fetched by DocFetch.\n\n---\n\n"
	writer.WriteString(header)

	count := 0
	for result := range resultsChan {
		if strings.TrimSpace(result) != "" {
			writer.WriteString(result)
			count++

			// Flush periodically to avoid memory buildup
			if count%10 == 0 {
				writer.Flush()
			}
		}
	}

	return nil
}
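For orientation, the deleted file's entry point is `RunOptimized(Config)`. Below is a minimal sketch of how a caller would have driven it; the import path is hypothetical (the Go module name is not shown in this diff, only the GitHub repository from pyproject.toml), and the `Config` fields are taken from their usages in the file above, since the struct itself is defined in the also-deleted fetcher.go. Note that in the file above the goroutine closing `urlQueue` waits on `workerWg`, while the workers themselves only return once `urlQueue` is closed, so the two waits are circular.

```go
package main

import (
	"log"

	// Hypothetical import path; not confirmed by this diff.
	"github.com/AlphaTechini/doc-fetch/pkg/fetcher"
)

func main() {
	cfg := fetcher.Config{
		BaseURL:        "https://example.com/docs", // crawl root
		OutputPath:     "docs.md",                  // must pass the output-path validator
		Workers:        8,                          // validator caps this at 20
		MaxDepth:       3,                          // validator caps this at 10
		GenerateLLMTxt: true,                       // also emit docs.llm.txt
	}
	if err := fetcher.RunOptimized(cfg); err != nil {
		log.Fatal(err)
	}
}
```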
package/pkg/fetcher/html2md.go
DELETED
@@ -1,71 +0,0 @@
package fetcher

import (
	"strings"
)

// ConvertHTMLToMarkdown converts HTML content to clean markdown
func ConvertHTMLToMarkdown(htmlContent string) string {
	if htmlContent == "" {
		return ""
	}

	// Basic HTML to markdown conversion
	markdownContent := basicHTMLToMarkdown(htmlContent)

	return strings.TrimSpace(markdownContent)
}

// basicHTMLToMarkdown provides basic HTML to markdown conversion
func basicHTMLToMarkdown(html string) string {
	// Replace common HTML tags with markdown equivalents
	replacements := map[string]string{
		"<h1>":          "# ",
		"</h1>":         "\n\n",
		"<h2>":          "## ",
		"</h2>":         "\n\n",
		"<h3>":          "### ",
		"</h3>":         "\n\n",
		"<h4>":          "#### ",
		"</h4>":         "\n\n",
		"<h5>":          "##### ",
		"</h5>":         "\n\n",
		"<h6>":          "###### ",
		"</h6>":         "\n\n",
		"<p>":           "",
		"</p>":          "\n\n",
		"<br>":          "\n",
		"<br/>":         "\n",
		"<strong>":      "**",
		"</strong>":     "**",
		"<b>":           "**",
		"</b>":          "**",
		"<em>":          "*",
		"</em>":         "*",
		"<i>":           "*",
		"</i>":          "*",
		"<code>":        "`",
		"</code>":       "`",
		"<pre>":         "```",
		"</pre>":        "```",
		"<ul>":          "",
		"</ul>":         "\n",
		"<ol>":          "",
		"</ol>":         "\n",
		"<li>":          "- ",
		"</li>":         "\n",
		"<blockquote>":  "> ",
		"</blockquote>": "\n\n",
	}

	result := html
	for htmlTag, mdReplacement := range replacements {
		result = strings.ReplaceAll(result, htmlTag, mdReplacement)
	}

	// Clean up extra whitespace
	result = strings.ReplaceAll(result, "\n\n\n", "\n\n")
	result = strings.TrimSpace(result)

	return result
}
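The conversion above is literal string substitution, so only bare tags without attributes match. A self-contained sketch of the same technique (not the package's exported API) that shows both what it does and where it falls short:

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	// Same technique as basicHTMLToMarkdown: exact-string tag replacement.
	r := strings.NewReplacer(
		"<h1>", "# ", "</h1>", "\n\n",
		"<p>", "", "</p>", "\n\n",
		"<strong>", "**", "</strong>", "**",
	)

	out := r.Replace("<h1>Title</h1><p>Hello <strong>world</strong></p>")
	fmt.Printf("%q\n", out) // "# Title\n\nHello **world**\n\n"

	// An opening tag with attributes is left untouched (no exact match),
	// while its closing tag is still replaced:
	fmt.Printf("%q\n", r.Replace(`<h1 class="big">Title</h1>`))
	// "<h1 class=\"big\">Title\n\n"
}
```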
package/pkg/fetcher/llmtxt.go
DELETED
@@ -1,36 +0,0 @@
package fetcher

import (
	"bufio"
	"fmt"
	"os"
	"strings"
)

// GenerateLLMTxt creates an llm.txt file with AI-friendly documentation index
func GenerateLLMTxt(entries []LLMTxtEntry, outputPath string) error {
	file, err := os.Create(outputPath)
	if err != nil {
		return fmt.Errorf("failed to create llm.txt file: %w", err)
	}
	defer file.Close()

	writer := bufio.NewWriter(file)
	defer writer.Flush()

	// Write header
	writer.WriteString("# llm.txt - AI-friendly documentation index\n")
	writer.WriteString("# This file helps LLMs quickly find relevant documentation sections\n\n")

	for _, entry := range entries {
		// Write entry in the format: [TYPE] Title
		writer.WriteString(fmt.Sprintf("[%s] %s\n",
			strings.ToUpper(entry.Type), entry.Title))
		// Write URL
		writer.WriteString(entry.URL + "\n")
		// Write description
		writer.WriteString(entry.Description + "\n\n")
	}

	return nil
}
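Given the writes above, each index entry occupies three lines followed by a blank line. A sketch of calling it with illustrative values (the entry field names come from the struct literal in fetcher_optimized.go; the real `Type` strings come from `ClassifyPage` in the deleted classifier.go, which is not shown; the import path is again hypothetical):

```go
package main

import (
	"log"

	// Hypothetical import path; not confirmed by this diff.
	"github.com/AlphaTechini/doc-fetch/pkg/fetcher"
)

func main() {
	entries := []fetcher.LLMTxtEntry{{
		Type:        "guide", // rendered uppercased as [GUIDE]
		Title:       "Getting Started",
		URL:         "https://example.com/docs/start",
		Description: "How to install and run the tool.",
	}}
	if err := fetcher.GenerateLLMTxt(entries, "docs.llm.txt"); err != nil {
		log.Fatal(err)
	}
	// docs.llm.txt now contains, after the two comment header lines:
	//
	//   [GUIDE] Getting Started
	//   https://example.com/docs/start
	//   How to install and run the tool.
}
```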
package/pkg/fetcher/validator.go
DELETED
@@ -1,109 +0,0 @@
package fetcher

import (
	"fmt"
	"net"
	"net/url"
	"path/filepath"
	"strings"
)

// ValidateConfig validates the configuration for security issues
func ValidateConfig(config *Config) error {
	if err := validateURL(config.BaseURL); err != nil {
		return fmt.Errorf("invalid URL: %w", err)
	}
	if err := validateOutputPath(config.OutputPath); err != nil {
		return fmt.Errorf("invalid output path: %w", err)
	}
	if config.MaxDepth > 10 {
		return fmt.Errorf("max depth too high (maximum allowed: 10)")
	}
	if config.Workers > 20 {
		return fmt.Errorf("too many concurrent workers (maximum allowed: 20)")
	}
	return nil
}

// validateURL checks if the URL is safe to fetch
func validateURL(urlStr string) error {
	parsed, err := url.Parse(urlStr)
	if err != nil {
		return err
	}

	// Only allow HTTP and HTTPS
	if parsed.Scheme != "http" && parsed.Scheme != "https" {
		return fmt.Errorf("only HTTP and HTTPS URLs are allowed")
	}

	// Block private IP ranges and localhost
	host := parsed.Hostname()
	ip := net.ParseIP(host)
	if ip != nil {
		if ip.IsPrivate() || ip.IsLoopback() || ip.IsLinkLocalUnicast() || ip.IsMulticast() {
			return fmt.Errorf("access to private/internal IP addresses is not allowed")
		}
	}

	// Block dangerous hostnames
	dangerousHosts := []string{"localhost", "127.0.0.1", "0.0.0.0", "::1"}
	for _, dangerous := range dangerousHosts {
		if strings.ToLower(host) == dangerous {
			return fmt.Errorf("access to localhost is not allowed")
		}
	}

	return nil
}

// validateOutputPath ensures the output path is safe
func validateOutputPath(path string) error {
	// Don't allow absolute paths that start with /
	if strings.HasPrefix(path, "/") {
		return fmt.Errorf("absolute paths are not allowed")
	}

	// Don't allow paths that contain ..
	if strings.Contains(path, "..") {
		return fmt.Errorf("relative path traversal (..) is not allowed")
	}

	// Don't allow paths that contain ~
	if strings.Contains(path, "~") {
		return fmt.Errorf("home directory expansion (~) is not allowed")
	}

	// Resolve to absolute path to check final destination
	absPath, err := filepath.Abs(path)
	if err != nil {
		return err
	}

	// Get current working directory
	cwd, err := filepath.Abs(".")
	if err != nil {
		return err
	}

	// Ensure the absolute path is within the current working directory
	if !strings.HasPrefix(absPath, cwd) {
		return fmt.Errorf("output path must be within the current working directory")
	}

	// Check file extension - only allow safe extensions
	allowedExtensions := []string{".md", ".txt", ".llm.txt"}
	ext := filepath.Ext(path)
	isAllowed := false
	for _, allowed := range allowedExtensions {
		if ext == allowed {
			isAllowed = true
			break
		}
	}
	if !isAllowed {
		return fmt.Errorf("only .md, .txt, and .llm.txt file extensions are allowed")
	}

	return nil
}
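To make the SSRF checks above concrete, here is a condensed, self-contained sketch of the same scheme and IP filtering (not the package's code) with sample inputs. As in the original, only literal IPs are inspected, so a hostname that merely resolves to a private address would pass:

```go
package main

import (
	"fmt"
	"net"
	"net/url"
)

// blockedURL mirrors validateURL's checks: HTTP(S)-only schemes, plus
// rejection of loopback, private, link-local, and multicast literal IPs.
func blockedURL(raw string) bool {
	u, err := url.Parse(raw)
	if err != nil || (u.Scheme != "http" && u.Scheme != "https") {
		return true
	}
	host := u.Hostname()
	if ip := net.ParseIP(host); ip != nil {
		return ip.IsPrivate() || ip.IsLoopback() || ip.IsLinkLocalUnicast() || ip.IsMulticast()
	}
	return host == "localhost"
}

func main() {
	for _, s := range []string{
		"https://docs.example.com",    // allowed
		"ftp://example.com/file",      // blocked: scheme
		"http://127.0.0.1:8080/admin", // blocked: loopback
		"http://10.0.0.5/internal",    // blocked: private range
	} {
		fmt.Println(s, "blocked:", blockedURL(s))
	}
}
```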
package/pkg/fetcher/writer.go
DELETED
@@ -1,32 +0,0 @@
package fetcher

import (
	"bufio"
	"os"
	"strings"
)

// writeResults writes the fetched documentation to the output file
func writeResults(outputPath string, resultsChan <-chan string) error {
	file, err := os.Create(outputPath)
	if err != nil {
		return err
	}
	defer file.Close()

	writer := bufio.NewWriter(file)
	defer writer.Flush()

	// Write header
	header := "# Documentation\n\nThis file contains documentation fetched by DocFetch.\n\n---\n\n"
	writer.WriteString(header)

	// Write all results
	for result := range resultsChan {
		if strings.TrimSpace(result) != "" {
			writer.WriteString(result)
		}
	}

	return nil
}
package/pyproject.toml
DELETED
@@ -1,37 +0,0 @@
[build-system]
requires = ["setuptools>=45", "wheel", "setuptools_scm[toml]>=6.2"]
build-backend = "setuptools.build_meta"

[project]
name = "doc-fetch"
version = "1.1.0"
description = "Dynamic documentation fetching CLI that converts entire documentation sites to single markdown files for AI/LLM consumption"
readme = "README.md"
authors = [{name = "AlphaTechini", email = "rehobothokoibu@gmail.com"}]
license = {text = "MIT"}
classifiers = [
    "Development Status :: 5 - Production/Stable",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Topic :: Documentation",
    "Topic :: Software Development :: Documentation",
    "Topic :: Utilities",
]
keywords = ["documentation", "ai", "llm", "markdown", "crawler", "security"]
requires-python = ">=3.8"
dependencies = []

[project.urls]
Homepage = "https://github.com/AlphaTechini/doc-fetch"
Repository = "https://github.com/AlphaTechini/doc-fetch"
Documentation = "https://github.com/AlphaTechini/doc-fetch#readme"

[project.scripts]
doc-fetch = "doc_fetch.cli:main"