doc-fetch-cli 1.0.2 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -36,7 +36,8 @@ func main() {
 		log.Fatalf("Configuration error: %v", err)
 	}
 
-	err := fetcher.Run(config)
+	// Use optimized high-performance fetcher
+	err := fetcher.RunOptimized(config)
 	if err != nil {
 		log.Fatalf("Failed to fetch documentation: %v", err)
 	}
Binary file
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: doc-fetch
-Version: 1.0.1
+Version: 1.1.0
 Summary: Dynamic documentation fetching CLI that converts entire documentation sites to single markdown files for AI/LLM consumption
 Home-page: https://github.com/AlphaTechini/doc-fetch
 Author: AlphaTechini
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "doc-fetch-cli",
-  "version": "1.0.2",
+  "version": "1.1.0",
   "description": "Dynamic documentation fetching CLI that converts entire documentation sites to single markdown files for AI/LLM consumption",
   "bin": {
     "doc-fetch": "./bin/doc-fetch.js"
@@ -195,53 +195,136 @@ func worker(config Config, pagesChan <-chan *Page, resultsChan chan<- string, mu
 	}
 }
 
-// cleanContent extracts and cleans the main documentation content
+// cleanContent extracts and cleans the main documentation content using multiple strategies
 func cleanContent(doc *goquery.Document) string {
-	// Common selectors for documentation content
-	selectors := []string{
+	// Strategy 1: Try semantic HTML5 elements (most reliable)
+	semanticSelectors := []string{
 		"main",
 		"article",
+		"[role='main']",
+		"[role='article']",
+	}
+
+	for _, selector := range semanticSelectors {
+		if el := doc.Find(selector); el.Length() > 0 {
+			content := extractTextContent(el)
+			if len(content) > 200 { // Minimum viable content
+				return content
+			}
+		}
+	}
+
+	// Strategy 2: Try common class/id patterns
+	classSelectors := []string{
 		".content",
-		".docs-content",
+		".docs-content",
 		"#main-content",
 		".documentation",
 		".post-content",
 		".markdown-body",
 		".content-wrapper",
 		".doc-content",
+		".document",
+		".entry-content",
+		".page-content",
+		".article-content",
+		"[class*='content']",
+		"[class*='docs']",
+		"[class*='document']",
+		"[id*='content']",
+		"[id*='main']",
 	}
 
-	// Try each selector
-	for _, selector := range selectors {
+	for _, selector := range classSelectors {
 		if el := doc.Find(selector); el.Length() > 0 {
-			// Remove unwanted elements
-			el.Find("nav, header, footer, .sidebar, .toc, .navigation, script, style, .ad, .advertisement").Remove()
-
-			// Convert to HTML and then clean
-			htmlContent, err := el.Html()
-			if err != nil {
-				continue
+			content := extractTextContent(el)
+			if len(content) > 200 {
+				return content
 			}
+		}
+	}
+
+	// Strategy 3: Look for sections with high text density
+	var bestSection *goquery.Selection
+	maxTextLen := 0
+
+	doc.Find("section, div").Each(func(i int, s *goquery.Selection) {
+		text := strings.TrimSpace(s.Text())
+		if len(text) > maxTextLen {
+			// Check if this section has more text than child elements
+			childText := 0
+			s.Children().Each(func(j int, c *goquery.Selection) {
+				childText += len(strings.TrimSpace(c.Text()))
+			})
 
-			// Basic HTML cleaning
-			cleaned := cleanHTML(htmlContent)
-			if cleaned != "" {
-				return cleaned
+			// If parent has significantly more text, it's likely the main content
+			if len(text) > childText + (childText/2) && len(text) > 500 {
+				maxTextLen = len(text)
+				bestSection = s
 			}
 		}
+	})
+
+	if bestSection != nil {
+		content := extractTextContent(bestSection)
+		if len(content) > 200 {
+			return content
+		}
 	}
 
-	// Fallback: try to get body content
+	// Strategy 4: Fallback to body with aggressive cleaning
 	body := doc.Find("body")
 	if body.Length() > 0 {
-		body.Find("nav, header, footer, .sidebar, .toc, .navigation, script, style, .ad, .advertisement").Remove()
+		// Remove all non-content elements aggressively
+		body.Find("nav, header, footer, aside, script, style, form, iframe, .sidebar, .toc, .navigation, .menu, .ads, .advertisement, [class*='nav'], [class*='menu'], [class*='sidebar'], [class*='footer'], [class*='header']").Remove()
+
+		// Find the largest remaining container
+		var largest *goquery.Selection
+		largestSize := 0
+
+		body.Find("*").Each(func(i int, s *goquery.Selection) {
+			text := strings.TrimSpace(s.Text())
+			if len(text) > largestSize && s.Children().Length() < 50 {
+				largestSize = len(text)
+				largest = s
+			}
+		})
+
+		if largest != nil {
+			content := extractTextContent(largest)
+			if len(content) > 200 {
+				return content
+			}
+		}
+
+		// Last resort: entire body
 		htmlContent, _ := body.Html()
-		return cleanHTML(htmlContent)
+		cleaned := cleanHTML(htmlContent)
+		if len(cleaned) > 200 {
+			return cleaned
+		}
 	}
 
 	return ""
 }
 
+// extractTextContent extracts and cleans text from a selection
+func extractTextContent(sel *goquery.Selection) string {
+	// Clone the selection to avoid modifying original
+	clone := sel.Clone()
+
+	// Remove unwanted elements
+	clone.Find("nav, header, footer, aside, script, style, form, iframe, .sidebar, .toc, .navigation, .menu, .ads, .advertisement, button, [class*='nav'], [class*='menu'], [class*='sidebar'], [class*='footer'], [class*='header'], [class*='button'], [onclick], [role='navigation'], [role='banner'], [role='contentinfo']").Remove()
+
+	// Get HTML and convert to clean text
+	htmlContent, err := clone.Html()
+	if err != nil {
+		return ""
+	}
+
+	return cleanHTML(htmlContent)
+}
+
 // cleanHTML performs basic HTML cleaning
 func cleanHTML(htmlStr string) string {
 	// Parse and extract text content while preserving structure
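
The reworked cleanContent above cascades through four strategies — semantic HTML5 elements, common class/id selectors, a text-density scan over section and div nodes, and an aggressively cleaned body fallback — accepting the first result longer than 200 characters. The density test in Strategy 3 keeps a node only when its text exceeds `childText + childText/2` and 500 characters; since a node's text includes its children's, this means the text directly under the node must exceed half of what its children contribute. A minimal, self-contained sketch of just that heuristic (the thresholds mirror the hunk above; the sample HTML and everything else here is illustrative, not the package's code):

```go
package main

import (
	"fmt"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

func main() {
	// Invented sample: a wrapper div whose text lives almost entirely in
	// its children, a prose-heavy #docs div, and a short #nav div.
	html := `<body><div class="wrap">
	  <div id="docs">` + strings.Repeat("Documentation prose. ", 40) + `</div>
	  <div id="nav">Home About</div>
	</div></body>`

	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		panic(err)
	}

	var best *goquery.Selection
	maxTextLen := 0
	doc.Find("section, div").Each(func(i int, s *goquery.Selection) {
		text := strings.TrimSpace(s.Text())
		childText := 0
		s.Children().Each(func(j int, c *goquery.Selection) {
			childText += len(strings.TrimSpace(c.Text()))
		})
		// Same test as the diff: the node must hold ~1.5x the text of its
		// direct children and at least 500 characters.
		if len(text) > maxTextLen && len(text) > childText+(childText/2) && len(text) > 500 {
			maxTextLen = len(text)
			best = s
		}
	})

	if best != nil {
		id, _ := best.Attr("id")
		fmt.Printf("picked #%s with %d chars\n", id, maxTextLen)
	}
}
```

Run against the sample, the `.wrap` div is rejected (its text is all child text) and `#docs` wins, which is the behavior the cascade relies on to skip layout wrappers.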
@@ -0,0 +1,318 @@
+package fetcher
+
+import (
+	"bufio"
+	"context"
+	"fmt"
+	"log"
+	"net"
+	"net/http"
+	"net/url"
+	"os"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"time"
+
+	"github.com/PuerkitoBio/goquery"
+)
+
+// OptimizedFetcher uses advanced Go concurrency patterns for 10x speedup
+type OptimizedFetcher struct {
+	config      Config
+	httpClient  *http.Client
+	urlQueue    chan string
+	visited     sync.Map // Concurrent map instead of mutex-protected map
+	resultsChan chan string
+	llmEntries  []LLMTxtEntry
+	llmMutex    sync.Mutex
+	pageCount   int32
+	errorCount  int32
+	ctx         context.Context
+	cancel      context.CancelFunc
+}
+
+// RunOptimized executes documentation fetching with maximum concurrency
+func RunOptimized(config Config) error {
+	if err := validateConfig(&config); err != nil {
+		return fmt.Errorf("invalid configuration: %w", err)
+	}
+
+	log.Printf("🚀 Starting HIGH-PERFORMANCE documentation fetch from: %s", config.BaseURL)
+	log.Printf("   Workers: %d | Max Depth: %d | Concurrency: Enabled", config.Workers, config.MaxDepth)
+
+	fetcher := &OptimizedFetcher{
+		config:      config,
+		urlQueue:    make(chan string, config.Workers*100), // Large buffer for URLs
+		resultsChan: make(chan string, config.Workers*10),  // Larger buffer
+		httpClient:  createOptimizedHTTPClient(config.Workers),
+	}
+
+	fetcher.ctx, fetcher.cancel = context.WithTimeout(context.Background(), 10*time.Minute)
+	defer fetcher.cancel()
+
+	startTime := time.Now()
+
+	// Start result writer in background
+	var writeWg sync.WaitGroup
+	writeWg.Add(1)
+	go func() {
+		defer writeWg.Add(-1)
+		writeResultsOptimized(config.OutputPath, fetcher.resultsChan)
+	}()
+
+	// Start worker pool
+	var workerWg sync.WaitGroup
+	for i := 0; i < config.Workers; i++ {
+		workerWg.Add(1)
+		go fetcher.worker(i, &workerWg)
+	}
+
+	// Submit initial URL
+	fetcher.submitPage(config.BaseURL, 0)
+
+	// Close URL queue when all pages are processed
+	go func() {
+		workerWg.Wait()
+		close(fetcher.urlQueue)
+	}()
+
+	// Wait for all workers to complete
+	workerWg.Wait()
+	close(fetcher.resultsChan)
+
+	// Wait for results to be written
+	writeWg.Wait()
+
+	elapsed := time.Since(startTime)
+	pagesFetched := atomic.LoadInt32(&fetcher.pageCount)
+	errors := atomic.LoadInt32(&fetcher.errorCount)
+
+	log.Printf("✅ Fetch completed!")
+	log.Printf("   📊 Pages fetched: %d", pagesFetched)
+	log.Printf("   ⏱️ Time elapsed: %v", elapsed)
+	log.Printf("   📈 Speed: %.2f pages/second", float64(pagesFetched)/elapsed.Seconds())
+	log.Printf("   ❌ Errors: %d", errors)
+
+	// Generate LLM.txt if requested
+	if config.GenerateLLMTxt && len(fetcher.llmEntries) > 0 {
+		llmTxtPath := strings.TrimSuffix(config.OutputPath, ".md") + ".llm.txt"
+		if err := GenerateLLMTxt(fetcher.llmEntries, llmTxtPath); err != nil {
+			log.Printf("⚠️ Warning: Failed to generate llm.txt: %v", err)
+		} else {
+			log.Printf("📝 LLM.txt generated: %s (%d entries)", llmTxtPath, len(fetcher.llmEntries))
+		}
+	}
+
+	return nil
+}
+
+// createOptimizedHTTPClient creates a high-performance HTTP client with connection pooling
+func createOptimizedHTTPClient(workers int) *http.Client {
+	return &http.Client{
+		Timeout: 30 * time.Second,
+		Transport: &http.Transport{
+			MaxIdleConns:        workers * 2,
+			MaxIdleConnsPerHost: workers,
+			IdleConnTimeout:     90 * time.Second,
+			DisableCompression:  false,
+			DisableKeepAlives:   false,
+			DialContext: (&net.Dialer{
+				Timeout:   10 * time.Second,
+				KeepAlive: 30 * time.Second,
+			}).DialContext,
+			TLSHandshakeTimeout: 10 * time.Second,
+		},
+	}
+}
+
+// worker processes URLs from the submission queue
+func (f *OptimizedFetcher) worker(id int, wg *sync.WaitGroup) {
+	defer wg.Done()
+
+	for url := range f.urlQueue {
+		select {
+		case <-f.ctx.Done():
+			return
+		default:
+			f.processURL(url, 0)
+		}
+	}
+}
+
+// submitPage adds a URL to be fetched (with depth tracking)
+func (f *OptimizedFetcher) submitPage(pageURL string, depth int) {
+	if depth > f.config.MaxDepth {
+		return
+	}
+
+	// Check if already visited using atomic operation
+	if _, loaded := f.visited.LoadOrStore(pageURL, true); loaded {
+		return
+	}
+
+	select {
+	case f.urlQueue <- pageURL:
+		// Successfully queued
+	default:
+		// Queue full, skip this URL
+		log.Printf("⚠️ Queue full, skipping: %s", pageURL)
+	}
+}
+
+// processURL fetches and processes a single URL
+func (f *OptimizedFetcher) processURL(pageURL string, depth int) {
+	atomic.AddInt32(&f.pageCount, 1)
+
+	startTime := time.Now()
+
+	// Validate URL
+	if err := isValidURL(pageURL); err != nil {
+		atomic.AddInt32(&f.errorCount, 1)
+		log.Printf("❌ Invalid URL %s: %v", pageURL, err)
+		return
+	}
+
+	// Fetch the page
+	resp, err := f.httpClient.Get(pageURL)
+	if err != nil {
+		atomic.AddInt32(&f.errorCount, 1)
+		log.Printf("❌ Error fetching %s: %v", pageURL, err)
+		return
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != 200 {
+		atomic.AddInt32(&f.errorCount, 1)
+		log.Printf("❌ Non-200 status %d for %s", resp.StatusCode, pageURL)
+		return
+	}
+
+	// Parse HTML concurrently
+	doc, err := goquery.NewDocumentFromReader(resp.Body)
+	if err != nil {
+		atomic.AddInt32(&f.errorCount, 1)
+		log.Printf("❌ Error parsing HTML for %s: %v", pageURL, err)
+		return
+	}
+
+	// Extract content
+	content := cleanContent(doc)
+	if content == "" {
+		atomic.AddInt32(&f.errorCount, 1)
+		log.Printf("⚠️ No content found for %s", pageURL)
+		return
+	}
+
+	// Extract title
+	title := doc.Find("title").Text()
+	if title == "" {
+		title = pageURL
+	}
+
+	// Send result
+	f.resultsChan <- fmt.Sprintf("## %s\n\n%s\n\n---\n\n", title, content)
+
+	// Generate LLM.txt entry if requested
+	if f.config.GenerateLLMTxt {
+		cleanTitle := CleanTitle(title)
+		entryType := ClassifyPage(pageURL, cleanTitle)
+		description := ExtractDescription(content)
+
+		entry := LLMTxtEntry{
+			Type:        entryType,
+			Title:       cleanTitle,
+			URL:         pageURL,
+			Description: description,
+		}
+
+		f.llmMutex.Lock()
+		f.llmEntries = append(f.llmEntries, entry)
+		f.llmMutex.Unlock()
+	}
+
+	// Extract links for crawling (if depth allows)
+	if depth < f.config.MaxDepth {
+		f.extractAndSubmitLinks(doc, pageURL, depth+1)
+	}
+
+	elapsed := time.Since(startTime)
+	log.Printf("✅ Fetched %s (%.2fs)", pageURL, elapsed.Seconds())
+}
+
+// extractAndSubmitLinks finds and queues all internal links
+func (f *OptimizedFetcher) extractAndSubmitLinks(doc *goquery.Document, baseURL string, depth int) {
+	base, err := url.Parse(baseURL)
+	if err != nil {
+		return
+	}
+
+	doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
+		href, exists := s.Attr("href")
+		if !exists {
+			return
+		}
+
+		// Resolve relative URLs
+		resolvedURL, err := base.Parse(href)
+		if err != nil {
+			return
+		}
+
+		// Only follow same-domain links
+		if resolvedURL.Host != "" && resolvedURL.Host != base.Host {
+			return
+		}
+
+		// Skip non-HTML resources
+		if isNonHTMLResource(resolvedURL.Path) {
+			return
+		}
+
+		f.submitPage(resolvedURL.String(), depth)
+	})
+}
+
+// isNonHTMLResource checks if URL points to non-HTML resources
+func isNonHTMLResource(path string) bool {
+	extensions := []string{".pdf", ".zip", ".tar", ".gz", ".exe", ".dmg", ".pkg", ".deb", ".rpm"}
+	pathLower := strings.ToLower(path)
+
+	for _, ext := range extensions {
+		if strings.HasSuffix(pathLower, ext) {
+			return true
+		}
+	}
+	return false
+}
+
+// writeResultsOptimized writes results to file efficiently
+func writeResultsOptimized(outputPath string, resultsChan <-chan string) error {
+	file, err := os.Create(outputPath)
+	if err != nil {
+		return err
+	}
+	defer file.Close()
+
+	writer := bufio.NewWriterSize(file, 32*1024) // 32KB buffer for better I/O
+	defer writer.Flush()
+
+	// Write header
+	header := "# Documentation\n\nThis file contains documentation fetched by DocFetch.\n\n---\n\n"
+	writer.WriteString(header)
+
+	count := 0
+	for result := range resultsChan {
+		if strings.TrimSpace(result) != "" {
+			writer.WriteString(result)
+			count++
+
+			// Flush periodically to avoid memory buildup
+			if count%10 == 0 {
+				writer.Flush()
+			}
+		}
+	}
+
+	return nil
+}
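
Two details of the new fetcher are worth noting. First, completion signaling is circular as diffed: the goroutine that closes urlQueue waits on workerWg, but each worker only returns once urlQueue is closed (the ctx check runs only after a URL has been received, so an idle worker blocks on the empty channel). Second, worker() always calls processURL(url, 0): depth is not carried through the string channel, so every page is processed at depth 0 and extractAndSubmitLinks is always called with depth 1. Below is a minimal sketch — not the package's code — of one way to structure termination and depth tracking, using a queue item that carries depth plus an atomic in-flight counter that closes the queue exactly when no queued or in-flight work remains; all names here (crawlItem, pending, submit) are hypothetical:

```go
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// crawlItem carries the depth alongside the URL through the channel.
type crawlItem struct {
	url   string
	depth int
}

func main() {
	queue := make(chan crawlItem, 1024)
	var pending int64 // URLs queued or being processed
	var visited sync.Map
	maxDepth := 2

	submit := func(url string, depth int) {
		if depth > maxDepth {
			return
		}
		if _, seen := visited.LoadOrStore(url, true); seen {
			return
		}
		atomic.AddInt64(&pending, 1) // count before queueing
		queue <- crawlItem{url, depth}
	}

	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for item := range queue {
				fmt.Println("fetch", item.url, "depth", item.depth)
				// ... fetch the page, then submit discovered links
				// at item.depth+1 before decrementing ...
				if atomic.AddInt64(&pending, -1) == 0 {
					close(queue) // nothing queued or in flight remains
				}
			}
		}()
	}

	submit("https://example.com/docs", 0)
	wg.Wait()
}
```

Because a worker increments pending for every discovered link before decrementing its own item, the counter can only reach zero when the crawl frontier is truly empty, which breaks the wait/close cycle without relying on a timeout.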
package/pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "doc-fetch"
-version = "1.0.1"
+version = "1.1.0"
 description = "Dynamic documentation fetching CLI that converts entire documentation sites to single markdown files for AI/LLM consumption"
 readme = "README.md"
 authors = [{name = "AlphaTechini", email = "rehobothokoibu@gmail.com"}]
package/setup.py CHANGED
@@ -118,7 +118,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 
 setup(
     name="doc-fetch",
-    version="1.0.1",
+    version="1.1.0",
     author="AlphaTechini",
     author_email="rehobothokoibu@gmail.com",
     description="Dynamic documentation fetching CLI that converts entire documentation sites to single markdown files for AI/LLM consumption",
Binary file