doc-fetch-cli 1.0.2 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -36,7 +36,8 @@ func main() {
 		log.Fatalf("Configuration error: %v", err)
 	}
 
-	err := fetcher.Run(config)
+	// Use optimized high-performance fetcher
+	err := fetcher.RunOptimized(config)
 	if err != nil {
 		log.Fatalf("Failed to fetch documentation: %v", err)
 	}
Binary file
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: doc-fetch
-Version: 1.0.1
+Version: 1.1.0
 Summary: Dynamic documentation fetching CLI that converts entire documentation sites to single markdown files for AI/LLM consumption
 Home-page: https://github.com/AlphaTechini/doc-fetch
 Author: AlphaTechini
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "doc-fetch-cli",
-  "version": "1.0.2",
+  "version": "1.1.0",
   "description": "Dynamic documentation fetching CLI that converts entire documentation sites to single markdown files for AI/LLM consumption",
   "bin": {
     "doc-fetch": "./bin/doc-fetch.js"
@@ -195,53 +195,136 @@ func worker(config Config, pagesChan <-chan *Page, resultsChan chan<- string, mu
 	}
 }
 
-// cleanContent extracts and cleans the main documentation content
+// cleanContent extracts and cleans the main documentation content using multiple strategies
 func cleanContent(doc *goquery.Document) string {
-	// Common selectors for documentation content
-	selectors := []string{
+	// Strategy 1: Try semantic HTML5 elements (most reliable)
+	semanticSelectors := []string{
 		"main",
 		"article",
+		"[role='main']",
+		"[role='article']",
+	}
+
+	for _, selector := range semanticSelectors {
+		if el := doc.Find(selector); el.Length() > 0 {
+			content := extractTextContent(el)
+			if len(content) > 200 { // Minimum viable content
+				return content
+			}
+		}
+	}
+
+	// Strategy 2: Try common class/id patterns
+	classSelectors := []string{
 		".content",
-		".docs-content",
+		".docs-content",
 		"#main-content",
 		".documentation",
 		".post-content",
 		".markdown-body",
 		".content-wrapper",
 		".doc-content",
+		".document",
+		".entry-content",
+		".page-content",
+		".article-content",
+		"[class*='content']",
+		"[class*='docs']",
+		"[class*='document']",
+		"[id*='content']",
+		"[id*='main']",
 	}
 
-	// Try each selector
-	for _, selector := range selectors {
+	for _, selector := range classSelectors {
 		if el := doc.Find(selector); el.Length() > 0 {
-			// Remove unwanted elements
-			el.Find("nav, header, footer, .sidebar, .toc, .navigation, script, style, .ad, .advertisement").Remove()
-
-			// Convert to HTML and then clean
-			htmlContent, err := el.Html()
-			if err != nil {
-				continue
+			content := extractTextContent(el)
+			if len(content) > 200 {
+				return content
 			}
+		}
+	}
+
+	// Strategy 3: Look for sections with high text density
+	var bestSection *goquery.Selection
+	maxTextLen := 0
+
+	doc.Find("section, div").Each(func(i int, s *goquery.Selection) {
+		text := strings.TrimSpace(s.Text())
+		if len(text) > maxTextLen {
+			// Check if this section has more text than child elements
+			childText := 0
+			s.Children().Each(func(j int, c *goquery.Selection) {
+				childText += len(strings.TrimSpace(c.Text()))
+			})
 
-			// Basic HTML cleaning
-			cleaned := cleanHTML(htmlContent)
-			if cleaned != "" {
-				return cleaned
+			// If parent has significantly more text, it's likely the main content
+			if len(text) > childText + (childText/2) && len(text) > 500 {
+				maxTextLen = len(text)
+				bestSection = s
 			}
 		}
+	})
+
+	if bestSection != nil {
+		content := extractTextContent(bestSection)
+		if len(content) > 200 {
+			return content
+		}
 	}
 
-	// Fallback: try to get body content
+	// Strategy 4: Fallback to body with aggressive cleaning
 	body := doc.Find("body")
 	if body.Length() > 0 {
-		body.Find("nav, header, footer, .sidebar, .toc, .navigation, script, style, .ad, .advertisement").Remove()
+		// Remove all non-content elements aggressively
+		body.Find("nav, header, footer, aside, script, style, form, iframe, .sidebar, .toc, .navigation, .menu, .ads, .advertisement, [class*='nav'], [class*='menu'], [class*='sidebar'], [class*='footer'], [class*='header']").Remove()
+
+		// Find the largest remaining container
+		var largest *goquery.Selection
+		largestSize := 0
+
+		body.Find("*").Each(func(i int, s *goquery.Selection) {
+			text := strings.TrimSpace(s.Text())
+			if len(text) > largestSize && s.Children().Length() < 50 {
+				largestSize = len(text)
+				largest = s
+			}
+		})
+
+		if largest != nil {
+			content := extractTextContent(largest)
+			if len(content) > 200 {
+				return content
+			}
+		}
+
+		// Last resort: entire body
 		htmlContent, _ := body.Html()
-		return cleanHTML(htmlContent)
+		cleaned := cleanHTML(htmlContent)
+		if len(cleaned) > 200 {
+			return cleaned
+		}
 	}
 
 	return ""
 }
 
+// extractTextContent extracts and cleans text from a selection
+func extractTextContent(sel *goquery.Selection) string {
+	// Clone the selection to avoid modifying original
+	clone := sel.Clone()
+
+	// Remove unwanted elements
+	clone.Find("nav, header, footer, aside, script, style, form, iframe, .sidebar, .toc, .navigation, .menu, .ads, .advertisement, button, [class*='nav'], [class*='menu'], [class*='sidebar'], [class*='footer'], [class*='header'], [class*='button'], [onclick], [role='navigation'], [role='banner'], [role='contentinfo']").Remove()
+
+	// Get HTML and convert to clean text
+	htmlContent, err := clone.Html()
+	if err != nil {
+		return ""
+	}
+
+	return cleanHTML(htmlContent)
+}
+
 // cleanHTML performs basic HTML cleaning
 func cleanHTML(htmlStr string) string {
 	// Parse and extract text content while preserving structure
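
The reworked cleanContent above cascades through four strategies — semantic HTML5 elements, common class/id selectors, a text-density scan over section and div nodes, and an aggressively cleaned body fallback — accepting the first result longer than 200 characters. The density test in Strategy 3 keeps a node only when its text exceeds `childText + childText/2` and 500 characters; since a node's text includes its children's, this means the text directly under the node must exceed half of what its children contribute. A minimal, self-contained sketch of just that heuristic (the thresholds mirror the hunk above; the sample HTML and everything else here is illustrative, not the package's code):

```go
package main

import (
	"fmt"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

func main() {
	// Invented sample: a wrapper div whose text lives almost entirely in
	// its children, a prose-heavy #docs div, and a short #nav div.
	html := `<body><div class="wrap">
	  <div id="docs">` + strings.Repeat("Documentation prose. ", 40) + `</div>
	  <div id="nav">Home About</div>
	</div></body>`

	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		panic(err)
	}

	var best *goquery.Selection
	maxTextLen := 0
	doc.Find("section, div").Each(func(i int, s *goquery.Selection) {
		text := strings.TrimSpace(s.Text())
		childText := 0
		s.Children().Each(func(j int, c *goquery.Selection) {
			childText += len(strings.TrimSpace(c.Text()))
		})
		// Same test as the diff: the node must hold ~1.5x the text of its
		// direct children and at least 500 characters.
		if len(text) > maxTextLen && len(text) > childText+(childText/2) && len(text) > 500 {
			maxTextLen = len(text)
			best = s
		}
	})

	if best != nil {
		id, _ := best.Attr("id")
		fmt.Printf("picked #%s with %d chars\n", id, maxTextLen)
	}
}
```

Run against the sample, the `.wrap` div is rejected (its text is all child text) and `#docs` wins, which is the behavior the cascade relies on to skip layout wrappers.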
@@ -0,0 +1,318 @@
+package fetcher
+
+import (
+	"bufio"
+	"context"
+	"fmt"
+	"log"
+	"net"
+	"net/http"
+	"net/url"
+	"os"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"time"
+
+	"github.com/PuerkitoBio/goquery"
+)
+
+// OptimizedFetcher uses advanced Go concurrency patterns for 10x speedup
+type OptimizedFetcher struct {
+	config      Config
+	httpClient  *http.Client
+	urlQueue    chan string
+	visited     sync.Map // Concurrent map instead of mutex-protected map
+	resultsChan chan string
+	llmEntries  []LLMTxtEntry
+	llmMutex    sync.Mutex
+	pageCount   int32
+	errorCount  int32
+	ctx         context.Context
+	cancel      context.CancelFunc
+}
+
+// RunOptimized executes documentation fetching with maximum concurrency
+func RunOptimized(config Config) error {
+	if err := validateConfig(&config); err != nil {
+		return fmt.Errorf("invalid configuration: %w", err)
+	}
+
+	log.Printf("🚀 Starting HIGH-PERFORMANCE documentation fetch from: %s", config.BaseURL)
+	log.Printf("   Workers: %d | Max Depth: %d | Concurrency: Enabled", config.Workers, config.MaxDepth)
+
+	fetcher := &OptimizedFetcher{
+		config:      config,
+		urlQueue:    make(chan string, config.Workers*100), // Large buffer for URLs
+		resultsChan: make(chan string, config.Workers*10),  // Larger buffer
+		httpClient:  createOptimizedHTTPClient(config.Workers),
+	}
+
+	fetcher.ctx, fetcher.cancel = context.WithTimeout(context.Background(), 10*time.Minute)
+	defer fetcher.cancel()
+
+	startTime := time.Now()
+
+	// Start result writer in background
+	var writeWg sync.WaitGroup
+	writeWg.Add(1)
+	go func() {
+		defer writeWg.Add(-1)
+		writeResultsOptimized(config.OutputPath, fetcher.resultsChan)
+	}()
+
+	// Start worker pool
+	var workerWg sync.WaitGroup
+	for i := 0; i < config.Workers; i++ {
+		workerWg.Add(1)
+		go fetcher.worker(i, &workerWg)
+	}
+
+	// Submit initial URL
+	fetcher.submitPage(config.BaseURL, 0)
+
+	// Close URL queue when all pages are processed
+	go func() {
+		workerWg.Wait()
+		close(fetcher.urlQueue)
+	}()
+
+	// Wait for all workers to complete
+	workerWg.Wait()
+	close(fetcher.resultsChan)
+
+	// Wait for results to be written
+	writeWg.Wait()
+
+	elapsed := time.Since(startTime)
+	pagesFetched := atomic.LoadInt32(&fetcher.pageCount)
+	errors := atomic.LoadInt32(&fetcher.errorCount)
+
+	log.Printf("✅ Fetch completed!")
+	log.Printf("   📊 Pages fetched: %d", pagesFetched)
+	log.Printf("   ⏱️ Time elapsed: %v", elapsed)
+	log.Printf("   📈 Speed: %.2f pages/second", float64(pagesFetched)/elapsed.Seconds())
+	log.Printf("   ❌ Errors: %d", errors)
+
+	// Generate LLM.txt if requested
+	if config.GenerateLLMTxt && len(fetcher.llmEntries) > 0 {
+		llmTxtPath := strings.TrimSuffix(config.OutputPath, ".md") + ".llm.txt"
+		if err := GenerateLLMTxt(fetcher.llmEntries, llmTxtPath); err != nil {
+			log.Printf("⚠️ Warning: Failed to generate llm.txt: %v", err)
+		} else {
+			log.Printf("📝 LLM.txt generated: %s (%d entries)", llmTxtPath, len(fetcher.llmEntries))
+		}
+	}
+
+	return nil
+}
+
+// createOptimizedHTTPClient creates a high-performance HTTP client with connection pooling
+func createOptimizedHTTPClient(workers int) *http.Client {
+	return &http.Client{
+		Timeout: 30 * time.Second,
+		Transport: &http.Transport{
+			MaxIdleConns:        workers * 2,
+			MaxIdleConnsPerHost: workers,
+			IdleConnTimeout:     90 * time.Second,
+			DisableCompression:  false,
+			DisableKeepAlives:   false,
+			DialContext: (&net.Dialer{
+				Timeout:   10 * time.Second,
+				KeepAlive: 30 * time.Second,
+			}).DialContext,
+			TLSHandshakeTimeout: 10 * time.Second,
+		},
+	}
+}
+
+// worker processes URLs from the submission queue
+func (f *OptimizedFetcher) worker(id int, wg *sync.WaitGroup) {
+	defer wg.Done()
+
+	for url := range f.urlQueue {
+		select {
+		case <-f.ctx.Done():
+			return
+		default:
+			f.processURL(url, 0)
+		}
+	}
+}
+
+// submitPage adds a URL to be fetched (with depth tracking)
+func (f *OptimizedFetcher) submitPage(pageURL string, depth int) {
+	if depth > f.config.MaxDepth {
+		return
+	}
+
+	// Check if already visited using atomic operation
+	if _, loaded := f.visited.LoadOrStore(pageURL, true); loaded {
+		return
+	}
+
+	select {
+	case f.urlQueue <- pageURL:
+		// Successfully queued
+	default:
+		// Queue full, skip this URL
+		log.Printf("⚠️ Queue full, skipping: %s", pageURL)
+	}
+}
+
+// processURL fetches and processes a single URL
+func (f *OptimizedFetcher) processURL(pageURL string, depth int) {
+	atomic.AddInt32(&f.pageCount, 1)
+
+	startTime := time.Now()
+
+	// Validate URL
+	if err := isValidURL(pageURL); err != nil {
+		atomic.AddInt32(&f.errorCount, 1)
+		log.Printf("❌ Invalid URL %s: %v", pageURL, err)
+		return
+	}
+
+	// Fetch the page
+	resp, err := f.httpClient.Get(pageURL)
+	if err != nil {
+		atomic.AddInt32(&f.errorCount, 1)
+		log.Printf("❌ Error fetching %s: %v", pageURL, err)
+		return
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != 200 {
+		atomic.AddInt32(&f.errorCount, 1)
+		log.Printf("❌ Non-200 status %d for %s", resp.StatusCode, pageURL)
+		return
+	}
+
+	// Parse HTML concurrently
+	doc, err := goquery.NewDocumentFromReader(resp.Body)
+	if err != nil {
+		atomic.AddInt32(&f.errorCount, 1)
+		log.Printf("❌ Error parsing HTML for %s: %v", pageURL, err)
+		return
+	}
+
+	// Extract content
+	content := cleanContent(doc)
+	if content == "" {
+		atomic.AddInt32(&f.errorCount, 1)
+		log.Printf("⚠️ No content found for %s", pageURL)
+		return
+	}
+
+	// Extract title
+	title := doc.Find("title").Text()
+	if title == "" {
+		title = pageURL
+	}
+
+	// Send result
+	f.resultsChan <- fmt.Sprintf("## %s\n\n%s\n\n---\n\n", title, content)
+
+	// Generate LLM.txt entry if requested
+	if f.config.GenerateLLMTxt {
+		cleanTitle := CleanTitle(title)
+		entryType := ClassifyPage(pageURL, cleanTitle)
+		description := ExtractDescription(content)
+
+		entry := LLMTxtEntry{
+			Type:        entryType,
+			Title:       cleanTitle,
+			URL:         pageURL,
+			Description: description,
+		}
+
+		f.llmMutex.Lock()
+		f.llmEntries = append(f.llmEntries, entry)
+		f.llmMutex.Unlock()
+	}
+
+	// Extract links for crawling (if depth allows)
+	if depth < f.config.MaxDepth {
+		f.extractAndSubmitLinks(doc, pageURL, depth+1)
+	}
+
+	elapsed := time.Since(startTime)
+	log.Printf("✅ Fetched %s (%.2fs)", pageURL, elapsed.Seconds())
+}
+
+// extractAndSubmitLinks finds and queues all internal links
+func (f *OptimizedFetcher) extractAndSubmitLinks(doc *goquery.Document, baseURL string, depth int) {
+	base, err := url.Parse(baseURL)
+	if err != nil {
+		return
+	}
+
+	doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
+		href, exists := s.Attr("href")
+		if !exists {
+			return
+		}
+
+		// Resolve relative URLs
+		resolvedURL, err := base.Parse(href)
+		if err != nil {
+			return
+		}
+
+		// Only follow same-domain links
+		if resolvedURL.Host != "" && resolvedURL.Host != base.Host {
+			return
+		}
+
+		// Skip non-HTML resources
+		if isNonHTMLResource(resolvedURL.Path) {
+			return
+		}
+
+		f.submitPage(resolvedURL.String(), depth)
+	})
+}
+
+// isNonHTMLResource checks if URL points to non-HTML resources
+func isNonHTMLResource(path string) bool {
+	extensions := []string{".pdf", ".zip", ".tar", ".gz", ".exe", ".dmg", ".pkg", ".deb", ".rpm"}
+	pathLower := strings.ToLower(path)
+
+	for _, ext := range extensions {
+		if strings.HasSuffix(pathLower, ext) {
+			return true
+		}
+	}
+	return false
+}
+
+// writeResultsOptimized writes results to file efficiently
+func writeResultsOptimized(outputPath string, resultsChan <-chan string) error {
+	file, err := os.Create(outputPath)
+	if err != nil {
+		return err
+	}
+	defer file.Close()
+
+	writer := bufio.NewWriterSize(file, 32*1024) // 32KB buffer for better I/O
+	defer writer.Flush()
+
+	// Write header
+	header := "# Documentation\n\nThis file contains documentation fetched by DocFetch.\n\n---\n\n"
+	writer.WriteString(header)
+
+	count := 0
+	for result := range resultsChan {
+		if strings.TrimSpace(result) != "" {
+			writer.WriteString(result)
+			count++
+
+			// Flush periodically to avoid memory buildup
+			if count%10 == 0 {
+				writer.Flush()
+			}
+		}
+	}
+
+	return nil
+}
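
Two details of the new fetcher are worth noting. First, completion signaling is circular as diffed: the goroutine that closes urlQueue waits on workerWg, but each worker only returns once urlQueue is closed (the ctx check runs only after a URL has been received, so an idle worker blocks on the empty channel). Second, worker() always calls processURL(url, 0): depth is not carried through the string channel, so every page is processed at depth 0 and extractAndSubmitLinks is always called with depth 1. Below is a minimal sketch — not the package's code — of one way to structure termination and depth tracking, using a queue item that carries depth plus an atomic in-flight counter that closes the queue exactly when no queued or in-flight work remains; all names here (crawlItem, pending, submit) are hypothetical:

```go
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// crawlItem carries the depth alongside the URL through the channel.
type crawlItem struct {
	url   string
	depth int
}

func main() {
	queue := make(chan crawlItem, 1024)
	var pending int64 // URLs queued or being processed
	var visited sync.Map
	maxDepth := 2

	submit := func(url string, depth int) {
		if depth > maxDepth {
			return
		}
		if _, seen := visited.LoadOrStore(url, true); seen {
			return
		}
		atomic.AddInt64(&pending, 1) // count before queueing
		queue <- crawlItem{url, depth}
	}

	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for item := range queue {
				fmt.Println("fetch", item.url, "depth", item.depth)
				// ... fetch the page, then submit discovered links
				// at item.depth+1 before decrementing ...
				if atomic.AddInt64(&pending, -1) == 0 {
					close(queue) // nothing queued or in flight remains
				}
			}
		}()
	}

	submit("https://example.com/docs", 0)
	wg.Wait()
}
```

Because a worker increments pending for every discovered link before decrementing its own item, the counter can only reach zero when the crawl frontier is truly empty, which breaks the wait/close cycle without relying on a timeout.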
package/pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "doc-fetch"
-version = "1.0.1"
+version = "1.1.0"
 description = "Dynamic documentation fetching CLI that converts entire documentation sites to single markdown files for AI/LLM consumption"
 readme = "README.md"
 authors = [{name = "AlphaTechini", email = "rehobothokoibu@gmail.com"}]
package/setup.py CHANGED
@@ -118,7 +118,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 
 setup(
     name="doc-fetch",
-    version="1.0.1",
+    version="1.1.0",
     author="AlphaTechini",
     author_email="rehobothokoibu@gmail.com",
     description="Dynamic documentation fetching CLI that converts entire documentation sites to single markdown files for AI/LLM consumption",
Binary file