doc-fetch-cli 1.1.0 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,318 +0,0 @@
1
- package fetcher
2
-
3
- import (
4
- "bufio"
5
- "context"
6
- "fmt"
7
- "log"
8
- "net"
9
- "net/http"
10
- "net/url"
11
- "os"
12
- "strings"
13
- "sync"
14
- "sync/atomic"
15
- "time"
16
-
17
- "github.com/PuerkitoBio/goquery"
18
- )
19
-
20
- // OptimizedFetcher uses advanced Go concurrency patterns for 10x speedup
21
- type OptimizedFetcher struct {
22
- config Config
23
- httpClient *http.Client
24
- urlQueue chan string
25
- visited sync.Map // Concurrent map instead of mutex-protected map
26
- resultsChan chan string
27
- llmEntries []LLMTxtEntry
28
- llmMutex sync.Mutex
29
- pageCount int32
30
- errorCount int32
31
- ctx context.Context
32
- cancel context.CancelFunc
33
- }
34
-
35
- // RunOptimized executes documentation fetching with maximum concurrency
36
- func RunOptimized(config Config) error {
37
- if err := validateConfig(&config); err != nil {
38
- return fmt.Errorf("invalid configuration: %w", err)
39
- }
40
-
41
- log.Printf("🚀 Starting HIGH-PERFORMANCE documentation fetch from: %s", config.BaseURL)
42
- log.Printf(" Workers: %d | Max Depth: %d | Concurrency: Enabled", config.Workers, config.MaxDepth)
43
-
44
- fetcher := &OptimizedFetcher{
45
- config: config,
46
- urlQueue: make(chan string, config.Workers*100), // Large buffer for URLs
47
- resultsChan: make(chan string, config.Workers*10), // Larger buffer
48
- httpClient: createOptimizedHTTPClient(config.Workers),
49
- }
50
-
51
- fetcher.ctx, fetcher.cancel = context.WithTimeout(context.Background(), 10*time.Minute)
52
- defer fetcher.cancel()
53
-
54
- startTime := time.Now()
55
-
56
- // Start result writer in background
57
- var writeWg sync.WaitGroup
58
- writeWg.Add(1)
59
- go func() {
60
- defer writeWg.Add(-1)
61
- writeResultsOptimized(config.OutputPath, fetcher.resultsChan)
62
- }()
63
-
64
- // Start worker pool
65
- var workerWg sync.WaitGroup
66
- for i := 0; i < config.Workers; i++ {
67
- workerWg.Add(1)
68
- go fetcher.worker(i, &workerWg)
69
- }
70
-
71
- // Submit initial URL
72
- fetcher.submitPage(config.BaseURL, 0)
73
-
74
- // Close URL queue when all pages are processed
75
- go func() {
76
- workerWg.Wait()
77
- close(fetcher.urlQueue)
78
- }()
79
-
80
- // Wait for all workers to complete
81
- workerWg.Wait()
82
- close(fetcher.resultsChan)
83
-
84
- // Wait for results to be written
85
- writeWg.Wait()
86
-
87
- elapsed := time.Since(startTime)
88
- pagesFetched := atomic.LoadInt32(&fetcher.pageCount)
89
- errors := atomic.LoadInt32(&fetcher.errorCount)
90
-
91
- log.Printf("✅ Fetch completed!")
92
- log.Printf(" 📊 Pages fetched: %d", pagesFetched)
93
- log.Printf(" ⏱️ Time elapsed: %v", elapsed)
94
- log.Printf(" 📈 Speed: %.2f pages/second", float64(pagesFetched)/elapsed.Seconds())
95
- log.Printf(" ❌ Errors: %d", errors)
96
-
97
- // Generate LLM.txt if requested
98
- if config.GenerateLLMTxt && len(fetcher.llmEntries) > 0 {
99
- llmTxtPath := strings.TrimSuffix(config.OutputPath, ".md") + ".llm.txt"
100
- if err := GenerateLLMTxt(fetcher.llmEntries, llmTxtPath); err != nil {
101
- log.Printf("⚠️ Warning: Failed to generate llm.txt: %v", err)
102
- } else {
103
- log.Printf("📝 LLM.txt generated: %s (%d entries)", llmTxtPath, len(fetcher.llmEntries))
104
- }
105
- }
106
-
107
- return nil
108
- }
109
-
110
- // createOptimizedHTTPClient creates a high-performance HTTP client with connection pooling
111
- func createOptimizedHTTPClient(workers int) *http.Client {
112
- return &http.Client{
113
- Timeout: 30 * time.Second,
114
- Transport: &http.Transport{
115
- MaxIdleConns: workers * 2,
116
- MaxIdleConnsPerHost: workers,
117
- IdleConnTimeout: 90 * time.Second,
118
- DisableCompression: false,
119
- DisableKeepAlives: false,
120
- DialContext: (&net.Dialer{
121
- Timeout: 10 * time.Second,
122
- KeepAlive: 30 * time.Second,
123
- }).DialContext,
124
- TLSHandshakeTimeout: 10 * time.Second,
125
- },
126
- }
127
- }
128
-
129
- // worker processes URLs from the submission queue
130
- func (f *OptimizedFetcher) worker(id int, wg *sync.WaitGroup) {
131
- defer wg.Done()
132
-
133
- for url := range f.urlQueue {
134
- select {
135
- case <-f.ctx.Done():
136
- return
137
- default:
138
- f.processURL(url, 0)
139
- }
140
- }
141
- }
142
-
143
- // submitPage adds a URL to be fetched (with depth tracking)
144
- func (f *OptimizedFetcher) submitPage(pageURL string, depth int) {
145
- if depth > f.config.MaxDepth {
146
- return
147
- }
148
-
149
- // Check if already visited using atomic operation
150
- if _, loaded := f.visited.LoadOrStore(pageURL, true); loaded {
151
- return
152
- }
153
-
154
- select {
155
- case f.urlQueue <- pageURL:
156
- // Successfully queued
157
- default:
158
- // Queue full, skip this URL
159
- log.Printf("⚠️ Queue full, skipping: %s", pageURL)
160
- }
161
- }
162
-
163
- // processURL fetches and processes a single URL
164
- func (f *OptimizedFetcher) processURL(pageURL string, depth int) {
165
- atomic.AddInt32(&f.pageCount, 1)
166
-
167
- startTime := time.Now()
168
-
169
- // Validate URL
170
- if err := isValidURL(pageURL); err != nil {
171
- atomic.AddInt32(&f.errorCount, 1)
172
- log.Printf("❌ Invalid URL %s: %v", pageURL, err)
173
- return
174
- }
175
-
176
- // Fetch the page
177
- resp, err := f.httpClient.Get(pageURL)
178
- if err != nil {
179
- atomic.AddInt32(&f.errorCount, 1)
180
- log.Printf("❌ Error fetching %s: %v", pageURL, err)
181
- return
182
- }
183
- defer resp.Body.Close()
184
-
185
- if resp.StatusCode != 200 {
186
- atomic.AddInt32(&f.errorCount, 1)
187
- log.Printf("❌ Non-200 status %d for %s", resp.StatusCode, pageURL)
188
- return
189
- }
190
-
191
- // Parse HTML concurrently
192
- doc, err := goquery.NewDocumentFromReader(resp.Body)
193
- if err != nil {
194
- atomic.AddInt32(&f.errorCount, 1)
195
- log.Printf("❌ Error parsing HTML for %s: %v", pageURL, err)
196
- return
197
- }
198
-
199
- // Extract content
200
- content := cleanContent(doc)
201
- if content == "" {
202
- atomic.AddInt32(&f.errorCount, 1)
203
- log.Printf("⚠️ No content found for %s", pageURL)
204
- return
205
- }
206
-
207
- // Extract title
208
- title := doc.Find("title").Text()
209
- if title == "" {
210
- title = pageURL
211
- }
212
-
213
- // Send result
214
- f.resultsChan <- fmt.Sprintf("## %s\n\n%s\n\n---\n\n", title, content)
215
-
216
- // Generate LLM.txt entry if requested
217
- if f.config.GenerateLLMTxt {
218
- cleanTitle := CleanTitle(title)
219
- entryType := ClassifyPage(pageURL, cleanTitle)
220
- description := ExtractDescription(content)
221
-
222
- entry := LLMTxtEntry{
223
- Type: entryType,
224
- Title: cleanTitle,
225
- URL: pageURL,
226
- Description: description,
227
- }
228
-
229
- f.llmMutex.Lock()
230
- f.llmEntries = append(f.llmEntries, entry)
231
- f.llmMutex.Unlock()
232
- }
233
-
234
- // Extract links for crawling (if depth allows)
235
- if depth < f.config.MaxDepth {
236
- f.extractAndSubmitLinks(doc, pageURL, depth+1)
237
- }
238
-
239
- elapsed := time.Since(startTime)
240
- log.Printf("✅ Fetched %s (%.2fs)", pageURL, elapsed.Seconds())
241
- }
242
-
243
- // extractAndSubmitLinks finds and queues all internal links
244
- func (f *OptimizedFetcher) extractAndSubmitLinks(doc *goquery.Document, baseURL string, depth int) {
245
- base, err := url.Parse(baseURL)
246
- if err != nil {
247
- return
248
- }
249
-
250
- doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
251
- href, exists := s.Attr("href")
252
- if !exists {
253
- return
254
- }
255
-
256
- // Resolve relative URLs
257
- resolvedURL, err := base.Parse(href)
258
- if err != nil {
259
- return
260
- }
261
-
262
- // Only follow same-domain links
263
- if resolvedURL.Host != "" && resolvedURL.Host != base.Host {
264
- return
265
- }
266
-
267
- // Skip non-HTML resources
268
- if isNonHTMLResource(resolvedURL.Path) {
269
- return
270
- }
271
-
272
- f.submitPage(resolvedURL.String(), depth)
273
- })
274
- }
275
-
276
// isNonHTMLResource reports whether path ends, case-insensitively, in one of
// the archive, installer or document extensions that the crawler skips.
func isNonHTMLResource(path string) bool {
	lowered := strings.ToLower(path)

	switch {
	case strings.HasSuffix(lowered, ".pdf"),
		strings.HasSuffix(lowered, ".zip"),
		strings.HasSuffix(lowered, ".tar"),
		strings.HasSuffix(lowered, ".gz"),
		strings.HasSuffix(lowered, ".exe"),
		strings.HasSuffix(lowered, ".dmg"),
		strings.HasSuffix(lowered, ".pkg"),
		strings.HasSuffix(lowered, ".deb"),
		strings.HasSuffix(lowered, ".rpm"):
		return true
	default:
		return false
	}
}
288
-
289
// writeResultsOptimized creates (or truncates) outputPath, writes the
// documentation header and then appends every non-blank string received on
// resultsChan until the channel is closed.
//
// It returns an error when the file cannot be created, or when writing,
// flushing or closing it fails. After a write failure it keeps receiving
// until the channel is closed, so senders are never left blocked.
func writeResultsOptimized(outputPath string, resultsChan <-chan string) (err error) {
	file, err := os.Create(outputPath)
	if err != nil {
		return fmt.Errorf("creating output file %q: %w", outputPath, err)
	}
	defer func() {
		if cerr := file.Close(); cerr != nil && err == nil {
			err = fmt.Errorf("closing output file %q: %w", outputPath, cerr)
		}
	}()

	writer := bufio.NewWriterSize(file, 32*1024) // 32KB buffer for better I/O

	// bufio.Writer remembers the first write error and returns it from every
	// later call, so the individual results below can be ignored and the
	// failure is reported once by the final Flush.
	header := "# Documentation\n\nThis file contains documentation fetched by DocFetch.\n\n---\n\n"
	_, _ = writer.WriteString(header)

	count := 0
	for result := range resultsChan {
		if strings.TrimSpace(result) == "" {
			continue
		}
		_, _ = writer.WriteString(result)
		count++

		// Flush periodically so partial output reaches disk during long crawls.
		if count%10 == 0 {
			_ = writer.Flush()
		}
	}

	if ferr := writer.Flush(); ferr != nil {
		return fmt.Errorf("writing output file %q: %w", outputPath, ferr)
	}
	return nil
}
@@ -1,71 +0,0 @@
1
- package fetcher
2
-
3
- import (
4
- "strings"
5
- )
6
-
7
- // ConvertHTMLToMarkdown converts HTML content to clean markdown
8
- func ConvertHTMLToMarkdown(htmlContent string) string {
9
- if htmlContent == "" {
10
- return ""
11
- }
12
-
13
- // Basic HTML to markdown conversion
14
- markdownContent := basicHTMLToMarkdown(htmlContent)
15
-
16
- return strings.TrimSpace(markdownContent)
17
- }
18
-
19
// htmlTagReplacer maps the attribute-free HTML tags understood by
// basicHTMLToMarkdown to their markdown equivalents. A Replacer rewrites the
// input in one left-to-right pass, so the output does not depend on the order
// in which the substitutions would otherwise be applied.
var htmlTagReplacer = strings.NewReplacer(
	"<h1>", "# ",
	"</h1>", "\n\n",
	"<h2>", "## ",
	"</h2>", "\n\n",
	"<h3>", "### ",
	"</h3>", "\n\n",
	"<h4>", "#### ",
	"</h4>", "\n\n",
	"<h5>", "##### ",
	"</h5>", "\n\n",
	"<h6>", "###### ",
	"</h6>", "\n\n",
	"<p>", "",
	"</p>", "\n\n",
	"<br>", "\n",
	"<br/>", "\n",
	"<br />", "\n",
	"<strong>", "**",
	"</strong>", "**",
	"<b>", "**",
	"</b>", "**",
	"<em>", "*",
	"</em>", "*",
	"<i>", "*",
	"</i>", "*",
	"<code>", "`",
	"</code>", "`",
	"<pre>", "```",
	"</pre>", "```",
	"<ul>", "",
	"</ul>", "\n",
	"<ol>", "",
	"</ol>", "\n",
	"<li>", "- ",
	"</li>", "\n",
	"<blockquote>", "> ",
	"</blockquote>", "\n\n",
)

// basicHTMLToMarkdown performs a literal, tag-by-tag conversion of html to
// markdown. Only lower-case tags without attributes are recognised; anything
// else is passed through unchanged. Runs of three or more newlines are
// collapsed to a single blank line and the result is trimmed.
func basicHTMLToMarkdown(html string) string {
	result := htmlTagReplacer.Replace(html)

	// One ReplaceAll pass turns four newlines into three, so repeat until no
	// triple newline is left.
	for strings.Contains(result, "\n\n\n") {
		result = strings.ReplaceAll(result, "\n\n\n", "\n\n")
	}

	return strings.TrimSpace(result)
}
@@ -1,36 +0,0 @@
1
- package fetcher
2
-
3
- import (
4
- "bufio"
5
- "fmt"
6
- "os"
7
- "strings"
8
- )
9
-
10
- // GenerateLLMTxt creates an llm.txt file with AI-friendly documentation index
11
- func GenerateLLMTxt(entries []LLMTxtEntry, outputPath string) error {
12
- file, err := os.Create(outputPath)
13
- if err != nil {
14
- return fmt.Errorf("failed to create llm.txt file: %w", err)
15
- }
16
- defer file.Close()
17
-
18
- writer := bufio.NewWriter(file)
19
- defer writer.Flush()
20
-
21
- // Write header
22
- writer.WriteString("# llm.txt - AI-friendly documentation index\n")
23
- writer.WriteString("# This file helps LLMs quickly find relevant documentation sections\n\n")
24
-
25
- for _, entry := range entries {
26
- // Write entry in the format: [TYPE] Title
27
- writer.WriteString(fmt.Sprintf("[%s] %s\n",
28
- strings.ToUpper(entry.Type), entry.Title))
29
- // Write URL
30
- writer.WriteString(entry.URL + "\n")
31
- // Write description
32
- writer.WriteString(entry.Description + "\n\n")
33
- }
34
-
35
- return nil
36
- }
@@ -1,109 +0,0 @@
1
- package fetcher
2
-
3
- import (
4
- "fmt"
5
- "net"
6
- "net/url"
7
- "path/filepath"
8
- "strings"
9
- )
10
-
11
- // ValidateConfig validates the configuration for security issues
12
- func ValidateConfig(config *Config) error {
13
- if err := validateURL(config.BaseURL); err != nil {
14
- return fmt.Errorf("invalid URL: %w", err)
15
- }
16
- if err := validateOutputPath(config.OutputPath); err != nil {
17
- return fmt.Errorf("invalid output path: %w", err)
18
- }
19
- if config.MaxDepth > 10 {
20
- return fmt.Errorf("max depth too high (maximum allowed: 10)")
21
- }
22
- if config.Workers > 20 {
23
- return fmt.Errorf("too many concurrent workers (maximum allowed: 20)")
24
- }
25
- return nil
26
- }
27
-
28
// validateURL reports whether urlStr is acceptable as a fetch target.
//
// The URL must parse, use the http or https scheme and name a host. Literal
// IP addresses are rejected when they are private, loopback, unspecified,
// link-local or multicast; the names "localhost" and "*.localhost" are
// rejected as well.
//
// Host names are not resolved here, so a public name that resolves to an
// internal address is not detected by this check.
func validateURL(urlStr string) error {
	parsed, err := url.Parse(urlStr)
	if err != nil {
		return err
	}

	// Only allow HTTP and HTTPS
	if parsed.Scheme != "http" && parsed.Scheme != "https" {
		return fmt.Errorf("only HTTP and HTTPS URLs are allowed")
	}

	// Normalise the host: lower-case it and drop the trailing root dot, so
	// "LOCALHOST" and "localhost." are treated like "localhost".
	host := strings.TrimSuffix(strings.ToLower(parsed.Hostname()), ".")
	if host == "" {
		return fmt.Errorf("URL must include a host")
	}

	// net.ParseIP does not accept an IPv6 zone ("fe80::1%eth0"); strip it so
	// zoned literals still go through the address checks below.
	addr := host
	if i := strings.IndexByte(addr, '%'); i >= 0 {
		addr = addr[:i]
	}

	// Block private IP ranges and localhost
	if ip := net.ParseIP(addr); ip != nil {
		if ip.IsPrivate() || ip.IsLoopback() || ip.IsUnspecified() ||
			ip.IsLinkLocalUnicast() || ip.IsMulticast() {
			return fmt.Errorf("access to private/internal IP addresses is not allowed")
		}
		return nil
	}

	// Block dangerous hostnames
	if host == "localhost" || strings.HasSuffix(host, ".localhost") {
		return fmt.Errorf("access to localhost is not allowed")
	}

	return nil
}
59
-
60
// validateOutputPath reports whether path is acceptable as an output file.
//
// The path must be relative, must not contain ".." or "~", must resolve to a
// location inside the current working directory and must end in ".md" or
// ".txt" (which also covers names ending in ".llm.txt").
//
// The check is lexical: symbolic links along the path are not resolved.
func validateOutputPath(path string) error {
	// Reject absolute paths, both the "/" form and any OS-specific form.
	if strings.HasPrefix(path, "/") || filepath.IsAbs(path) {
		return fmt.Errorf("absolute paths are not allowed")
	}

	// Don't allow paths that contain ..
	if strings.Contains(path, "..") {
		return fmt.Errorf("relative path traversal (..) is not allowed")
	}

	// Don't allow paths that contain ~
	if strings.Contains(path, "~") {
		return fmt.Errorf("home directory expansion (~) is not allowed")
	}

	// Resolve to absolute path to check final destination
	absPath, err := filepath.Abs(path)
	if err != nil {
		return err
	}

	// Get current working directory
	cwd, err := filepath.Abs(".")
	if err != nil {
		return err
	}

	// Compare path elements rather than raw string prefixes: "/work/proj-2"
	// starts with "/work/proj" but is not inside it.
	rel, err := filepath.Rel(cwd, absPath)
	if err != nil || rel == ".." || strings.HasPrefix(rel, ".."+string(filepath.Separator)) {
		return fmt.Errorf("output path must be within the current working directory")
	}

	// filepath.Ext returns only the final extension, so "x.llm.txt" is
	// accepted through ".txt".
	switch filepath.Ext(path) {
	case ".md", ".txt":
		return nil
	default:
		return fmt.Errorf("only .md, .txt, and .llm.txt file extensions are allowed")
	}
}
@@ -1,32 +0,0 @@
1
- package fetcher
2
-
3
- import (
4
- "bufio"
5
- "os"
6
- "strings"
7
- )
8
-
9
// writeResults creates (or truncates) outputPath, writes the documentation
// header and then appends every non-blank string received on resultsChan
// until the channel is closed.
//
// It returns an error when the file cannot be created, or when writing,
// flushing or closing it fails. After a write failure it keeps receiving
// until the channel is closed, so senders are never left blocked.
func writeResults(outputPath string, resultsChan <-chan string) (err error) {
	file, err := os.Create(outputPath)
	if err != nil {
		return err
	}
	defer func() {
		if cerr := file.Close(); cerr != nil && err == nil {
			err = cerr
		}
	}()

	writer := bufio.NewWriter(file)

	// bufio.Writer remembers the first write error and returns it from every
	// later call, so the failure is checked once, at the final Flush.
	header := "# Documentation\n\nThis file contains documentation fetched by DocFetch.\n\n---\n\n"
	_, _ = writer.WriteString(header)

	// Write all results
	for result := range resultsChan {
		if strings.TrimSpace(result) == "" {
			continue
		}
		_, _ = writer.WriteString(result)
	}

	return writer.Flush()
}
package/pyproject.toml DELETED
@@ -1,37 +0,0 @@
1
- [build-system]
2
- requires = ["setuptools>=45", "wheel", "setuptools_scm[toml]>=6.2"]
3
- build-backend = "setuptools.build_meta"
4
-
5
- [project]
6
- name = "doc-fetch"
7
- version = "1.1.0"
8
- description = "Dynamic documentation fetching CLI that converts entire documentation sites to single markdown files for AI/LLM consumption"
9
- readme = "README.md"
10
- authors = [{name = "AlphaTechini", email = "rehobothokoibu@gmail.com"}]
11
- license = {text = "MIT"}
12
- classifiers = [
13
- "Development Status :: 5 - Production/Stable",
14
- "Intended Audience :: Developers",
15
- "License :: OSI Approved :: MIT License",
16
- "Operating System :: OS Independent",
17
- "Programming Language :: Python :: 3",
18
- "Programming Language :: Python :: 3.8",
19
- "Programming Language :: Python :: 3.9",
20
- "Programming Language :: Python :: 3.10",
21
- "Programming Language :: Python :: 3.11",
22
- "Programming Language :: Python :: 3.12",
23
- "Topic :: Documentation",
24
- "Topic :: Software Development :: Documentation",
25
- "Topic :: Utilities",
26
- ]
27
- keywords = ["documentation", "ai", "llm", "markdown", "crawler", "security"]
28
- requires-python = ">=3.8"
29
- dependencies = []
30
-
31
- [project.urls]
32
- Homepage = "https://github.com/AlphaTechini/doc-fetch"
33
- Repository = "https://github.com/AlphaTechini/doc-fetch"
34
- Documentation = "https://github.com/AlphaTechini/doc-fetch#readme"
35
-
36
- [project.scripts]
37
- doc-fetch = "doc_fetch.cli:main"