doc-fetch-cli 1.1.1 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/INSTALLATION-FIX.md +242 -0
- package/bin/doc-fetch.js +47 -9
- package/bin/postinstall.js +71 -66
- package/package.json +10 -3
- package/CONTRIBUTING.md +0 -274
- package/SECURITY.md +0 -84
- package/cmd/docfetch/main.go +0 -55
- package/dist/doc_fetch-1.1.1-py3-none-any.whl +0 -0
- package/dist/doc_fetch-1.1.1.tar.gz +0 -0
- package/doc-fetch_darwin_amd64 +0 -0
- package/doc-fetch_windows_amd64.exe +0 -0
- package/doc_fetch/__init__.py +0 -6
- package/doc_fetch/__main__.py +0 -7
- package/doc_fetch/cli.py +0 -113
- package/doc_fetch.egg-info/PKG-INFO +0 -224
- package/doc_fetch.egg-info/SOURCES.txt +0 -36
- package/doc_fetch.egg-info/dependency_links.txt +0 -1
- package/doc_fetch.egg-info/entry_points.txt +0 -2
- package/doc_fetch.egg-info/not-zip-safe +0 -1
- package/doc_fetch.egg-info/top_level.txt +0 -1
- package/docs/usage.md +0 -67
- package/examples/golang-example.sh +0 -12
- package/go.sum +0 -38
- package/pkg/fetcher/classifier.go +0 -50
- package/pkg/fetcher/describer.go +0 -61
- package/pkg/fetcher/extract_nav.go +0 -163
- package/pkg/fetcher/fetcher.go +0 -415
- package/pkg/fetcher/fetcher_optimized.go +0 -318
- package/pkg/fetcher/html2md.go +0 -71
- package/pkg/fetcher/llmtxt.go +0 -36
- package/pkg/fetcher/validator.go +0 -109
- package/pkg/fetcher/writer.go +0 -32
- package/pyproject.toml +0 -37
- package/setup.py +0 -158
package/pkg/fetcher/fetcher.go
DELETED
@@ -1,415 +0,0 @@
-package fetcher
-
-import (
-    "fmt"
-    "log"
-    "net"
-    "net/http"
-    "net/url"
-    "strings"
-    "sync"
-    "time"
-
-    "github.com/PuerkitoBio/goquery"
-    "golang.org/x/net/html"
-)
-
-// Config holds the configuration for the documentation fetcher
-type Config struct {
-    BaseURL        string
-    OutputPath     string
-    MaxDepth       int
-    Workers        int
-    UserAgent      string
-    GenerateLLMTxt bool
-}
-
-// Page represents a fetched documentation page
-type Page struct {
-    URL     string
-    Title   string
-    Content string
-    Links   []string
-}
-
-// LLMTxtEntry represents an entry in the llm.txt file
-type LLMTxtEntry struct {
-    Type        string
-    Title       string
-    URL         string
-    Description string
-}
-
-// Run executes the documentation fetching process
-func Run(config Config) error {
-    // Validate configuration
-    if err := validateConfig(&config); err != nil {
-        return fmt.Errorf("invalid configuration: %w", err)
-    }
-
-    log.Printf("Starting documentation fetch from: %s", config.BaseURL)
-
-    // Create a visited map to avoid duplicate fetching
-    visited := make(map[string]bool)
-    var mutex sync.Mutex
-
-    // Create channel for pages and results
-    pagesChan := make(chan *Page, config.Workers*2)
-    resultsChan := make(chan string, config.Workers*2)
-    var llmEntries []LLMTxtEntry
-
-    // Start worker goroutines
-    var wg sync.WaitGroup
-    for i := 0; i < config.Workers; i++ {
-        wg.Add(1)
-        go func() {
-            defer wg.Done()
-            worker(config, pagesChan, resultsChan, &mutex, visited, &llmEntries)
-        }()
-    }
-
-    // Start the initial fetch
-    pagesChan <- &Page{URL: config.BaseURL, Title: "Root"}
-
-    // Close pages channel when all workers are done
-    go func() {
-        wg.Wait()
-        close(pagesChan)
-        close(resultsChan)
-    }()
-
-    // Collect results and write to file
-    err := writeResults(config.OutputPath, resultsChan)
-    if err != nil {
-        return err
-    }
-
-    // Generate LLM.txt if requested
-    if config.GenerateLLMTxt {
-        llmTxtPath := strings.TrimSuffix(config.OutputPath, ".md") + ".llm.txt"
-        err = GenerateLLMTxt(llmEntries, llmTxtPath)
-        if err != nil {
-            log.Printf("Warning: Failed to generate llm.txt: %v", err)
-        } else {
-            log.Printf("LLM.txt generated: %s", llmTxtPath)
-        }
-    }
-
-    return nil
-}
-
-// worker processes pages from the channel
-func worker(config Config, pagesChan <-chan *Page, resultsChan chan<- string, mutex *sync.Mutex, visited map[string]bool, llmEntries *[]LLMTxtEntry) {
-    client := &http.Client{
-        Timeout: 30 * time.Second,
-        // Add transport with security restrictions
-        Transport: &http.Transport{
-            DisableKeepAlives: true,
-        },
-    }
-
-    for page := range pagesChan {
-        // Validate URL before fetching
-        if err := isValidURL(page.URL); err != nil {
-            log.Printf("Skipping invalid URL %s: %v", page.URL, err)
-            continue
-        }
-
-        mutex.Lock()
-        if visited[page.URL] {
-            mutex.Unlock()
-            continue
-        }
-        visited[page.URL] = true
-        mutex.Unlock()
-
-        log.Printf("Fetching: %s", page.URL)
-
-        // Rate limiting - be respectful to servers
-        time.Sleep(100 * time.Millisecond)
-
-        // Fetch the page
-        req, err := http.NewRequest("GET", page.URL, nil)
-        if err != nil {
-            log.Printf("Error creating request for %s: %v", page.URL, err)
-            continue
-        }
-        req.Header.Set("User-Agent", config.UserAgent)
-
-        resp, err := client.Do(req)
-        if err != nil {
-            log.Printf("Error fetching %s: %v", page.URL, err)
-            continue
-        }
-        defer resp.Body.Close()
-
-        if resp.StatusCode != 200 {
-            log.Printf("Non-200 status code %d for %s", resp.StatusCode, page.URL)
-            continue
-        }
-
-        // Parse HTML
-        doc, err := goquery.NewDocumentFromReader(resp.Body)
-        if err != nil {
-            log.Printf("Error parsing HTML for %s: %v", page.URL, err)
-            continue
-        }
-
-        // Extract title
-        title := doc.Find("title").Text()
-        if title == "" {
-            title = page.URL
-        }
-
-        // Clean and extract content
-        content := cleanContent(doc)
-        if content == "" {
-            log.Printf("No content found for %s", page.URL)
-            continue
-        }
-
-        // Send result to output
-        resultsChan <- fmt.Sprintf("## %s\n\n%s\n\n---\n\n", title, content)
-
-        // Generate LLM.txt entry if requested
-        if config.GenerateLLMTxt {
-            cleanTitle := CleanTitle(title)
-            entryType := ClassifyPage(page.URL, cleanTitle)
-            description := ExtractDescription(content)
-
-            entry := LLMTxtEntry{
-                Type:        entryType,
-                Title:       cleanTitle,
-                URL:         page.URL,
-                Description: description,
-            }
-
-            mutex.Lock()
-            *llmEntries = append(*llmEntries, entry)
-            mutex.Unlock()
-        }
-
-        // Extract links for further crawling (limited depth logic would go here)
-        // For MVP, we'll just fetch the main page
-        // Future: implement link extraction and recursive crawling
-    }
-}
-
-// cleanContent extracts and cleans the main documentation content using multiple strategies
-func cleanContent(doc *goquery.Document) string {
-    // Strategy 1: Try semantic HTML5 elements (most reliable)
-    semanticSelectors := []string{
-        "main",
-        "article",
-        "[role='main']",
-        "[role='article']",
-    }
-
-    for _, selector := range semanticSelectors {
-        if el := doc.Find(selector); el.Length() > 0 {
-            content := extractTextContent(el)
-            if len(content) > 200 { // Minimum viable content
-                return content
-            }
-        }
-    }
-
-    // Strategy 2: Try common class/id patterns
-    classSelectors := []string{
-        ".content",
-        ".docs-content",
-        "#main-content",
-        ".documentation",
-        ".post-content",
-        ".markdown-body",
-        ".content-wrapper",
-        ".doc-content",
-        ".document",
-        ".entry-content",
-        ".page-content",
-        ".article-content",
-        "[class*='content']",
-        "[class*='docs']",
-        "[class*='document']",
-        "[id*='content']",
-        "[id*='main']",
-    }
-
-    for _, selector := range classSelectors {
-        if el := doc.Find(selector); el.Length() > 0 {
-            content := extractTextContent(el)
-            if len(content) > 200 {
-                return content
-            }
-        }
-    }
-
-    // Strategy 3: Look for sections with high text density
-    var bestSection *goquery.Selection
-    maxTextLen := 0
-
-    doc.Find("section, div").Each(func(i int, s *goquery.Selection) {
-        text := strings.TrimSpace(s.Text())
-        if len(text) > maxTextLen {
-            // Check if this section has more text than child elements
-            childText := 0
-            s.Children().Each(func(j int, c *goquery.Selection) {
-                childText += len(strings.TrimSpace(c.Text()))
-            })
-
-            // If parent has significantly more text, it's likely the main content
-            if len(text) > childText+(childText/2) && len(text) > 500 {
-                maxTextLen = len(text)
-                bestSection = s
-            }
-        }
-    })
-
-    if bestSection != nil {
-        content := extractTextContent(bestSection)
-        if len(content) > 200 {
-            return content
-        }
-    }
-
-    // Strategy 4: Fallback to body with aggressive cleaning
-    body := doc.Find("body")
-    if body.Length() > 0 {
-        // Remove all non-content elements aggressively
-        body.Find("nav, header, footer, aside, script, style, form, iframe, .sidebar, .toc, .navigation, .menu, .ads, .advertisement, [class*='nav'], [class*='menu'], [class*='sidebar'], [class*='footer'], [class*='header']").Remove()
-
-        // Find the largest remaining container
-        var largest *goquery.Selection
-        largestSize := 0
-
-        body.Find("*").Each(func(i int, s *goquery.Selection) {
-            text := strings.TrimSpace(s.Text())
-            if len(text) > largestSize && s.Children().Length() < 50 {
-                largestSize = len(text)
-                largest = s
-            }
-        })
-
-        if largest != nil {
-            content := extractTextContent(largest)
-            if len(content) > 200 {
-                return content
-            }
-        }
-
-        // Last resort: entire body
-        htmlContent, _ := body.Html()
-        cleaned := cleanHTML(htmlContent)
-        if len(cleaned) > 200 {
-            return cleaned
-        }
-    }
-
-    return ""
-}
-
-// extractTextContent extracts and cleans text from a selection
-func extractTextContent(sel *goquery.Selection) string {
-    // Clone the selection to avoid modifying original
-    clone := sel.Clone()
-
-    // Remove unwanted elements
-    clone.Find("nav, header, footer, aside, script, style, form, iframe, .sidebar, .toc, .navigation, .menu, .ads, .advertisement, button, [class*='nav'], [class*='menu'], [class*='sidebar'], [class*='footer'], [class*='header'], [class*='button'], [onclick], [role='navigation'], [role='banner'], [role='contentinfo']").Remove()
-
-    // Get HTML and convert to clean text
-    htmlContent, err := clone.Html()
-    if err != nil {
-        return ""
-    }
-
-    return cleanHTML(htmlContent)
-}
-
-// cleanHTML performs basic HTML cleaning
-func cleanHTML(htmlStr string) string {
-    // Parse and extract text content while preserving structure
-    doc, err := html.Parse(strings.NewReader(htmlStr))
-    if err != nil {
-        return ""
-    }
-
-    var texts []string
-    var extractText func(*html.Node)
-    extractText = func(n *html.Node) {
-        if n.Type == html.TextNode {
-            text := strings.TrimSpace(n.Data)
-            if text != "" {
-                texts = append(texts, text)
-            }
-        }
-        for c := n.FirstChild; c != nil; c = c.NextSibling {
-            extractText(c)
-        }
-    }
-
-    extractText(doc)
-    return strings.Join(texts, "\n")
-}
-
-// isValidURL validates that a URL is safe to fetch
-func isValidURL(urlStr string) error {
-    parsed, err := url.Parse(urlStr)
-    if err != nil {
-        return fmt.Errorf("invalid URL format: %w", err)
-    }
-
-    // Only allow HTTP/HTTPS
-    if parsed.Scheme != "http" && parsed.Scheme != "https" {
-        return fmt.Errorf("only HTTP/HTTPS URLs allowed")
-    }
-
-    // Block private IP ranges
-    host := parsed.Hostname()
-    ip := net.ParseIP(host)
-    if ip != nil {
-        if ip.IsPrivate() || ip.IsLoopback() || ip.IsLinkLocalUnicast() || ip.IsMulticast() {
-            return fmt.Errorf("private/internal IP addresses not allowed")
-        }
-    }
-
-    // Block dangerous hostnames
-    dangerousHosts := []string{"localhost", "127.0.0.1", "0.0.0.0", "::1"}
-    for _, dangerous := range dangerousHosts {
-        if host == dangerous {
-            return fmt.Errorf("local hostnames not allowed")
-        }
-    }
-
-    return nil
-}
-
-// validateConfig validates the entire configuration
-func validateConfig(config *Config) error {
-    if err := validateOutputPath(config.OutputPath); err != nil {
-        return fmt.Errorf("output path validation failed: %w", err)
-    }
-
-    if err := isValidURL(config.BaseURL); err != nil {
-        return fmt.Errorf("base URL validation failed: %w", err)
-    }
-
-    // Limit depth to prevent excessive crawling
-    if config.MaxDepth > 10 {
-        return fmt.Errorf("max depth cannot exceed 10")
-    }
-
-    // Limit workers to prevent resource exhaustion
-    if config.Workers > 20 {
-        return fmt.Errorf("concurrent workers cannot exceed 20")
-    }
-
-    // Ensure reasonable timeout values
-    if config.Workers <= 0 {
-        config.Workers = 3 // Default
-    }
-    if config.MaxDepth <= 0 {
-        config.MaxDepth = 2 // Default
-    }
-
-    return nil
-}
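
For context, a minimal sketch of how the removed fetcher package's exported API fit together. The Config fields and the Run signature come from the deleted file above; the module import path, flag names, and user-agent string are illustrative assumptions, since the removed cmd/docfetch/main.go is not shown in this diff.

// Hypothetical caller, reconstructed only from the signatures visible above.
// The import path "doc-fetch/pkg/fetcher" and the flag names are assumptions,
// not the contents of the deleted cmd/docfetch/main.go.
package main

import (
    "flag"
    "log"

    "doc-fetch/pkg/fetcher" // assumed module path
)

func main() {
    baseURL := flag.String("url", "", "documentation site to fetch")
    out := flag.String("out", "docs.md", "output Markdown file")
    llmTxt := flag.Bool("llm-txt", false, "also emit a .llm.txt index")
    flag.Parse()

    // Zero values for Workers and MaxDepth are replaced with the defaults
    // (3 and 2) inside validateConfig, per the deleted source above.
    cfg := fetcher.Config{
        BaseURL:        *baseURL,
        OutputPath:     *out,
        UserAgent:      "doc-fetch-cli", // assumed
        GenerateLLMTxt: *llmTxt,
    }
    if err := fetcher.Run(cfg); err != nil {
        log.Fatal(err)
    }
}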