doc-fetch-cli 1.1.1 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,415 +0,0 @@
1
- package fetcher
2
-
3
- import (
4
- "fmt"
5
- "log"
6
- "net"
7
- "net/http"
8
- "net/url"
9
- "strings"
10
- "sync"
11
- "time"
12
-
13
- "github.com/PuerkitoBio/goquery"
14
- "golang.org/x/net/html"
15
- )
16
-
17
// Config holds the configuration for the documentation fetcher.
type Config struct {
	// BaseURL is the first page handed to the workers; it must pass isValidURL.
	BaseURL string
	// OutputPath is where the collected results are written. When
	// GenerateLLMTxt is set, the llm.txt path is derived from it by trimming
	// a trailing ".md" and appending ".llm.txt".
	OutputPath string
	// MaxDepth is the crawl depth limit. validateConfig rejects values above
	// 10 and replaces values <= 0 with 2.
	MaxDepth int
	// Workers is the number of worker goroutines Run starts. validateConfig
	// rejects values above 20 and replaces values <= 0 with 3.
	Workers int
	// UserAgent is sent as the User-Agent header on every request.
	UserAgent string
	// GenerateLLMTxt enables collecting LLMTxtEntry values and writing the
	// llm.txt file after the main output.
	GenerateLLMTxt bool
}
26
-
27
// Page represents a fetched documentation page.
type Page struct {
	// URL is the address the worker validates and fetches; Title is the
	// label given when the page is queued ("Root" for the initial page);
	// Content holds the page text.
	URL, Title, Content string
	// Links lists URLs associated with the page.
	// NOTE(review): nothing in this file populates Content or Links yet —
	// presumably reserved for recursive crawling; confirm before relying on them.
	Links []string
}
34
-
35
// LLMTxtEntry represents an entry in the llm.txt file.
//
// The worker fills Type from ClassifyPage, Title from CleanTitle, URL from
// the fetched page's URL and Description from ExtractDescription.
type LLMTxtEntry struct {
	Type, Title, URL, Description string
}
42
-
43
- // Run executes the documentation fetching process
44
- func Run(config Config) error {
45
- // Validate configuration
46
- if err := validateConfig(&config); err != nil {
47
- return fmt.Errorf("invalid configuration: %w", err)
48
- }
49
-
50
- log.Printf("Starting documentation fetch from: %s", config.BaseURL)
51
-
52
- // Create a visited map to avoid duplicate fetching
53
- visited := make(map[string]bool)
54
- var mutex sync.Mutex
55
-
56
- // Create channel for pages and results
57
- pagesChan := make(chan *Page, config.Workers*2)
58
- resultsChan := make(chan string, config.Workers*2)
59
- var llmEntries []LLMTxtEntry
60
-
61
- // Start worker goroutines
62
- var wg sync.WaitGroup
63
- for i := 0; i < config.Workers; i++ {
64
- wg.Add(1)
65
- go func() {
66
- defer wg.Done()
67
- worker(config, pagesChan, resultsChan, &mutex, visited, &llmEntries)
68
- }()
69
- }
70
-
71
- // Start the initial fetch
72
- pagesChan <- &Page{URL: config.BaseURL, Title: "Root"}
73
-
74
- // Close pages channel when all workers are done
75
- go func() {
76
- wg.Wait()
77
- close(pagesChan)
78
- close(resultsChan)
79
- }()
80
-
81
- // Collect results and write to file
82
- err := writeResults(config.OutputPath, resultsChan)
83
- if err != nil {
84
- return err
85
- }
86
-
87
- // Generate LLM.txt if requested
88
- if config.GenerateLLMTxt {
89
- llmTxtPath := strings.TrimSuffix(config.OutputPath, ".md") + ".llm.txt"
90
- err = GenerateLLMTxt(llmEntries, llmTxtPath)
91
- if err != nil {
92
- log.Printf("Warning: Failed to generate llm.txt: %v", err)
93
- } else {
94
- log.Printf("LLM.txt generated: %s", llmTxtPath)
95
- }
96
- }
97
-
98
- return nil
99
- }
100
-
101
- // worker processes pages from the channel
102
- func worker(config Config, pagesChan <-chan *Page, resultsChan chan<- string, mutex *sync.Mutex, visited map[string]bool, llmEntries *[]LLMTxtEntry) {
103
- client := &http.Client{
104
- Timeout: 30 * time.Second,
105
- // Add transport with security restrictions
106
- Transport: &http.Transport{
107
- DisableKeepAlives: true,
108
- },
109
- }
110
-
111
- for page := range pagesChan {
112
- // Validate URL before fetching
113
- if err := isValidURL(page.URL); err != nil {
114
- log.Printf("Skipping invalid URL %s: %v", page.URL, err)
115
- continue
116
- }
117
-
118
- mutex.Lock()
119
- if visited[page.URL] {
120
- mutex.Unlock()
121
- continue
122
- }
123
- visited[page.URL] = true
124
- mutex.Unlock()
125
-
126
- log.Printf("Fetching: %s", page.URL)
127
-
128
- // Rate limiting - be respectful to servers
129
- time.Sleep(100 * time.Millisecond)
130
-
131
- // Fetch the page
132
- req, err := http.NewRequest("GET", page.URL, nil)
133
- if err != nil {
134
- log.Printf("Error creating request for %s: %v", page.URL, err)
135
- continue
136
- }
137
- req.Header.Set("User-Agent", config.UserAgent)
138
-
139
- resp, err := client.Do(req)
140
- if err != nil {
141
- log.Printf("Error fetching %s: %v", page.URL, err)
142
- continue
143
- }
144
- defer resp.Body.Close()
145
-
146
- if resp.StatusCode != 200 {
147
- log.Printf("Non-200 status code %d for %s", resp.StatusCode, page.URL)
148
- continue
149
- }
150
-
151
- // Parse HTML
152
- doc, err := goquery.NewDocumentFromReader(resp.Body)
153
- if err != nil {
154
- log.Printf("Error parsing HTML for %s: %v", page.URL, err)
155
- continue
156
- }
157
-
158
- // Extract title
159
- title := doc.Find("title").Text()
160
- if title == "" {
161
- title = page.URL
162
- }
163
-
164
- // Clean and extract content
165
- content := cleanContent(doc)
166
- if content == "" {
167
- log.Printf("No content found for %s", page.URL)
168
- continue
169
- }
170
-
171
- // Send result to output
172
- resultsChan <- fmt.Sprintf("## %s\n\n%s\n\n---\n\n", title, content)
173
-
174
- // Generate LLM.txt entry if requested
175
- if config.GenerateLLMTxt {
176
- cleanTitle := CleanTitle(title)
177
- entryType := ClassifyPage(page.URL, cleanTitle)
178
- description := ExtractDescription(content)
179
-
180
- entry := LLMTxtEntry{
181
- Type: entryType,
182
- Title: cleanTitle,
183
- URL: page.URL,
184
- Description: description,
185
- }
186
-
187
- mutex.Lock()
188
- *llmEntries = append(*llmEntries, entry)
189
- mutex.Unlock()
190
- }
191
-
192
- // Extract links for further crawling (limited depth logic would go here)
193
- // For MVP, we'll just fetch the main page
194
- // Future: implement link extraction and recursive crawling
195
- }
196
- }
197
-
198
- // cleanContent extracts and cleans the main documentation content using multiple strategies
199
- func cleanContent(doc *goquery.Document) string {
200
- // Strategy 1: Try semantic HTML5 elements (most reliable)
201
- semanticSelectors := []string{
202
- "main",
203
- "article",
204
- "[role='main']",
205
- "[role='article']",
206
- }
207
-
208
- for _, selector := range semanticSelectors {
209
- if el := doc.Find(selector); el.Length() > 0 {
210
- content := extractTextContent(el)
211
- if len(content) > 200 { // Minimum viable content
212
- return content
213
- }
214
- }
215
- }
216
-
217
- // Strategy 2: Try common class/id patterns
218
- classSelectors := []string{
219
- ".content",
220
- ".docs-content",
221
- "#main-content",
222
- ".documentation",
223
- ".post-content",
224
- ".markdown-body",
225
- ".content-wrapper",
226
- ".doc-content",
227
- ".document",
228
- ".entry-content",
229
- ".page-content",
230
- ".article-content",
231
- "[class*='content']",
232
- "[class*='docs']",
233
- "[class*='document']",
234
- "[id*='content']",
235
- "[id*='main']",
236
- }
237
-
238
- for _, selector := range classSelectors {
239
- if el := doc.Find(selector); el.Length() > 0 {
240
- content := extractTextContent(el)
241
- if len(content) > 200 {
242
- return content
243
- }
244
- }
245
- }
246
-
247
- // Strategy 3: Look for sections with high text density
248
- var bestSection *goquery.Selection
249
- maxTextLen := 0
250
-
251
- doc.Find("section, div").Each(func(i int, s *goquery.Selection) {
252
- text := strings.TrimSpace(s.Text())
253
- if len(text) > maxTextLen {
254
- // Check if this section has more text than child elements
255
- childText := 0
256
- s.Children().Each(func(j int, c *goquery.Selection) {
257
- childText += len(strings.TrimSpace(c.Text()))
258
- })
259
-
260
- // If parent has significantly more text, it's likely the main content
261
- if len(text) > childText + (childText/2) && len(text) > 500 {
262
- maxTextLen = len(text)
263
- bestSection = s
264
- }
265
- }
266
- })
267
-
268
- if bestSection != nil {
269
- content := extractTextContent(bestSection)
270
- if len(content) > 200 {
271
- return content
272
- }
273
- }
274
-
275
- // Strategy 4: Fallback to body with aggressive cleaning
276
- body := doc.Find("body")
277
- if body.Length() > 0 {
278
- // Remove all non-content elements aggressively
279
- body.Find("nav, header, footer, aside, script, style, form, iframe, .sidebar, .toc, .navigation, .menu, .ads, .advertisement, [class*='nav'], [class*='menu'], [class*='sidebar'], [class*='footer'], [class*='header']").Remove()
280
-
281
- // Find the largest remaining container
282
- var largest *goquery.Selection
283
- largestSize := 0
284
-
285
- body.Find("*").Each(func(i int, s *goquery.Selection) {
286
- text := strings.TrimSpace(s.Text())
287
- if len(text) > largestSize && s.Children().Length() < 50 {
288
- largestSize = len(text)
289
- largest = s
290
- }
291
- })
292
-
293
- if largest != nil {
294
- content := extractTextContent(largest)
295
- if len(content) > 200 {
296
- return content
297
- }
298
- }
299
-
300
- // Last resort: entire body
301
- htmlContent, _ := body.Html()
302
- cleaned := cleanHTML(htmlContent)
303
- if len(cleaned) > 200 {
304
- return cleaned
305
- }
306
- }
307
-
308
- return ""
309
- }
310
-
311
- // extractTextContent extracts and cleans text from a selection
312
- func extractTextContent(sel *goquery.Selection) string {
313
- // Clone the selection to avoid modifying original
314
- clone := sel.Clone()
315
-
316
- // Remove unwanted elements
317
- clone.Find("nav, header, footer, aside, script, style, form, iframe, .sidebar, .toc, .navigation, .menu, .ads, .advertisement, button, [class*='nav'], [class*='menu'], [class*='sidebar'], [class*='footer'], [class*='header'], [class*='button'], [onclick], [role='navigation'], [role='banner'], [role='contentinfo']").Remove()
318
-
319
- // Get HTML and convert to clean text
320
- htmlContent, err := clone.Html()
321
- if err != nil {
322
- return ""
323
- }
324
-
325
- return cleanHTML(htmlContent)
326
- }
327
-
328
- // cleanHTML performs basic HTML cleaning
329
- func cleanHTML(htmlStr string) string {
330
- // Parse and extract text content while preserving structure
331
- doc, err := html.Parse(strings.NewReader(htmlStr))
332
- if err != nil {
333
- return ""
334
- }
335
-
336
- var texts []string
337
- var extractText func(*html.Node)
338
- extractText = func(n *html.Node) {
339
- if n.Type == html.TextNode {
340
- text := strings.TrimSpace(n.Data)
341
- if text != "" {
342
- texts = append(texts, text)
343
- }
344
- }
345
- for c := n.FirstChild; c != nil; c = c.NextSibling {
346
- extractText(c)
347
- }
348
- }
349
-
350
- extractText(doc)
351
- return strings.Join(texts, "\n")
352
- }
353
-
354
// isValidURL validates that a URL is safe to fetch.
//
// It returns an error when urlStr cannot be parsed, uses a scheme other than
// http/https, has no host, names a local host ("localhost" or any
// "*.localhost", case-insensitively and ignoring a trailing dot), or is an
// IP literal in a private, loopback, unspecified, link-local or multicast
// range.
//
// Hostnames are not resolved, so a public name that points at an internal
// address is not detected here.
func isValidURL(urlStr string) error {
	parsed, err := url.Parse(urlStr)
	if err != nil {
		return fmt.Errorf("invalid URL format: %w", err)
	}

	// Only allow HTTP/HTTPS.
	if parsed.Scheme != "http" && parsed.Scheme != "https" {
		return fmt.Errorf("only HTTP/HTTPS URLs allowed")
	}

	// Normalize: host names are case-insensitive and "localhost." (fully
	// qualified form) names the same host as "localhost".
	host := strings.ToLower(strings.TrimSuffix(parsed.Hostname(), "."))
	if host == "" {
		return fmt.Errorf("URL must include a host")
	}

	// net.ParseIP rejects IPv6 zones ("fe80::1%eth0"), so strip the zone
	// before parsing; otherwise zoned literals would skip the IP checks.
	ipText := host
	if i := strings.IndexByte(ipText, '%'); i >= 0 {
		ipText = ipText[:i]
	}

	// Block private and otherwise internal IP ranges.
	if ip := net.ParseIP(ipText); ip != nil {
		if ip.IsPrivate() || ip.IsLoopback() || ip.IsUnspecified() || ip.IsLinkLocalUnicast() || ip.IsMulticast() {
			return fmt.Errorf("private/internal IP addresses not allowed")
		}
		return nil
	}

	// Block local hostnames, including subdomains of localhost.
	if host == "localhost" || strings.HasSuffix(host, ".localhost") {
		return fmt.Errorf("local hostnames not allowed")
	}

	return nil
}
385
-
386
- // validateConfig validates the entire configuration
387
- func validateConfig(config *Config) error {
388
- if err := validateOutputPath(config.OutputPath); err != nil {
389
- return fmt.Errorf("output path validation failed: %w", err)
390
- }
391
-
392
- if err := isValidURL(config.BaseURL); err != nil {
393
- return fmt.Errorf("base URL validation failed: %w", err)
394
- }
395
-
396
- // Limit depth to prevent excessive crawling
397
- if config.MaxDepth > 10 {
398
- return fmt.Errorf("max depth cannot exceed 10")
399
- }
400
-
401
- // Limit workers to prevent resource exhaustion
402
- if config.Workers > 20 {
403
- return fmt.Errorf("concurrent workers cannot exceed 20")
404
- }
405
-
406
- // Ensure reasonable timeout values
407
- if config.Workers <= 0 {
408
- config.Workers = 3 // Default
409
- }
410
- if config.MaxDepth <= 0 {
411
- config.MaxDepth = 2 // Default
412
- }
413
-
414
- return nil
415
- }