docs_crawler-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docs_crawler/__init__.py ADDED
@@ -0,0 +1,7 @@
+ """A documentation crawler that converts web documentation to Markdown format."""
+
+ __version__ = "0.1.0"
+
+ from docs_crawler.crawler import Crawler
+
+ __all__ = ["Crawler"]
docs_crawler/cli.py ADDED
@@ -0,0 +1,241 @@
+ import os
+ import sys
+ import argparse
+ import logging
+ from urllib.parse import urlparse
+ from docs_crawler.crawler import Crawler
+
+ # Setup logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+
+ def extract_subdomain(url):
+     """Extract subdomain from URL for file naming."""
+     parsed = urlparse(url)
+     hostname = parsed.hostname
+     if hostname:
+         parts = hostname.split('.')
+         if len(parts) >= 2:
+             return parts[-2]
+         elif len(parts) == 1:
+             return parts[0]
+     return 'default'
+
+
+ def main():
+     """Main CLI entry point for docs-crawler."""
+     parser = argparse.ArgumentParser(
+         description="Crawl and convert documentation to Markdown.",
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""
+ Examples:
+   # Crawl from sitemap (tries sitemap first, falls back to recursive discovery)
+   docs-crawler --base-url https://example.com
+
+   # Discover links only and save to file
+   docs-crawler --mode discover --base-url https://example.com
+
+   # Crawl from a list of URLs in a file
+   docs-crawler --mode list --file urls.txt
+
+   # Specify custom output folder
+   docs-crawler --base-url https://example.com --folder my-docs
+         """
+     )
+
+     parser.add_argument(
+         '--mode',
+         choices=['sitemap', 'discover', 'list'],
+         default='sitemap',
+         help="Mode: 'sitemap' (crawl, tries sitemap then recursive), 'discover' (find and save URLs), or 'list' (crawl from file)."
+     )
+
+     parser.add_argument(
+         '--base-url',
+         help="Base URL of the documentation site (e.g., https://example.com)"
+     )
+
+     parser.add_argument(
+         '--start-url',
+         help="Starting URL for recursive discovery (e.g., https://example.com/docs/)"
+     )
+
+     parser.add_argument(
+         '--sitemap-url',
+         help="URL of the sitemap (overrides auto-detected sitemap URL)"
+     )
+
+     parser.add_argument(
+         '--file',
+         help="Path to the text file containing URLs (required if mode is 'list')."
+     )
+
+     parser.add_argument(
+         '--output-file',
+         help="Output file for discovered URLs (used in discover mode, auto-generated if not specified)"
+     )
+
+     parser.add_argument(
+         '--folder',
+         help="Custom folder name under output directory (overrides auto-detection from domain)."
+     )
+
+     parser.add_argument(
+         '--output-dir',
+         default='output',
+         help="Output directory for markdown files (default: output)"
+     )
+
+     parser.add_argument(
+         '--path-filter',
+         default='/docs/',
+         help="Path pattern to filter links (default: /docs/)"
+     )
+
+     parser.add_argument(
+         '--max-depth',
+         type=int,
+         default=100,
+         help="Maximum number of URLs to discover in recursive mode (default: 100)"
+     )
+
+     args = parser.parse_args()
+
+     # Validate arguments
+     urls = None
+
+     if args.mode == 'discover':
+         # Discover mode: find links and save to file
+         if not args.base_url and not args.start_url:
+             parser.error("--base-url or --start-url is required when mode is 'discover'")
+
+         crawler = Crawler(
+             base_url=args.base_url,
+             sitemap_url=args.sitemap_url,
+             output_dir=args.output_dir,
+             custom_folder=args.folder
+         )
+
+         try:
+             # Discover links
+             discovered_urls = crawler.discover_links(
+                 start_url=args.start_url,
+                 path_filter=args.path_filter,
+                 max_depth=args.max_depth
+             )
+
+             if not discovered_urls:
+                 logger.warning("No URLs discovered.")
+                 sys.exit(0)
+
+             # Generate output filename
+             if args.output_file:
+                 output_file = args.output_file
+             else:
+                 # Use subdomain-based naming
+                 base = args.base_url or args.start_url
+                 subdomain = extract_subdomain(base)
+                 output_file = f"{subdomain}_urls.txt"
+
+             # Show discovered URLs and ask for confirmation
+             logger.info(f"\nDiscovered {len(discovered_urls)} URLs:")
+             print("\nFirst 10 URLs:")
+             for url in discovered_urls[:10]:
+                 print(f"  - {url}")
+             if len(discovered_urls) > 10:
+                 print(f"  ... and {len(discovered_urls) - 10} more")
+
+             # Ask for confirmation
+             print(f"\nSave URLs to '{output_file}'? [Y/n]: ", end='', flush=True)
+             response = input().strip().lower()
+
+             if response in ['', 'y', 'yes']:
+                 with open(output_file, 'w', encoding='utf-8') as f:
+                     for url in discovered_urls:
+                         f.write(f"{url}\n")
+                 logger.info(f"Saved {len(discovered_urls)} URLs to {output_file}")
+                 logger.info(f"You can now run: docs-crawler --mode list --file {output_file}")
+             else:
+                 logger.info("Cancelled. URLs not saved.")
+
+         except KeyboardInterrupt:
+             logger.info("\nDiscovery interrupted by user.")
+             sys.exit(0)
+         except Exception as e:
+             logger.error(f"Error during discovery: {e}")
+             import traceback
+             traceback.print_exc()
+             sys.exit(1)
+
+     elif args.mode == 'sitemap':
+         # Sitemap mode (with fallback to recursive discovery)
+         if not args.base_url and not args.sitemap_url:
+             parser.error("--base-url or --sitemap-url is required when mode is 'sitemap'")
+
+         crawler = Crawler(
+             base_url=args.base_url,
+             sitemap_url=args.sitemap_url,
+             output_dir=args.output_dir,
+             custom_folder=args.folder
+         )
+
+         try:
+             crawler.run(
+                 urls=None,
+                 start_url=args.start_url,
+                 path_filter=args.path_filter,
+                 max_depth=args.max_depth
+             )
+         except KeyboardInterrupt:
+             logger.info("\nCrawling interrupted by user.")
+             sys.exit(0)
+         except Exception as e:
+             logger.error(f"Error during crawling: {e}")
+             import traceback
+             traceback.print_exc()
+             sys.exit(1)
+
+     elif args.mode == 'list':
+         # List mode: crawl from file
+         if not args.file:
+             parser.error("--file is required when mode is 'list'")
+
+         if not os.path.exists(args.file):
+             logger.error(f"File not found: {args.file}")
+             sys.exit(1)
+
+         try:
+             with open(args.file, 'r', encoding='utf-8') as f:
+                 urls = [line.strip() for line in f if line.strip()]
+             logger.info(f"Loaded {len(urls)} URLs from {args.file}")
+         except Exception as e:
+             logger.error(f"Failed to read file {args.file}: {e}")
+             sys.exit(1)
+
+         # Determine base_url from first URL if not provided
+         if not args.base_url and urls:
+             parsed = urlparse(urls[0])
+             args.base_url = f"{parsed.scheme}://{parsed.netloc}"
+
+         crawler = Crawler(
+             base_url=args.base_url,
+             sitemap_url=args.sitemap_url,
+             output_dir=args.output_dir,
+             custom_folder=args.folder
+         )
+
+         try:
+             crawler.run(urls=urls)
+         except KeyboardInterrupt:
+             logger.info("\nCrawling interrupted by user.")
+             sys.exit(0)
+         except Exception as e:
+             logger.error(f"Error during crawling: {e}")
+             import traceback
+             traceback.print_exc()
+             sys.exit(1)
+
+
+ if __name__ == "__main__":
+     main()
docs_crawler/crawler.py ADDED
@@ -0,0 +1,401 @@
+ import os
+ import logging
+ from urllib.parse import urlparse, urljoin
+ from bs4 import BeautifulSoup
+ from markdownify import markdownify as md
+ from playwright.sync_api import sync_playwright
+ from tqdm import tqdm
+ import requests
+
+ # Configuration
+ MAX_RETRIES = 3
+ PAGE_LOAD_TIMEOUT = 30000  # 30-second page-load timeout (milliseconds)
+ MAX_DISCOVERY_DEPTH = 10  # Maximum number of URLs to discover recursively
+
+ # Setup logging
+ logger = logging.getLogger(__name__)
+
+
+ class Crawler:
+     def __init__(self, base_url=None, sitemap_url=None, output_dir="output", custom_folder=None):
+         """
+         Initialize the crawler.
+
+         Args:
+             base_url: Base URL of the documentation site
+             sitemap_url: URL of the sitemap
+             output_dir: Output directory for markdown files
+             custom_folder: Custom folder name under output_dir
+         """
+         self.base_url = base_url
+         self.sitemap_url = sitemap_url or (f"{base_url}/sitemap.xml" if base_url else None)
+         self.output_dir = output_dir
+         self.session = requests.Session()
+         self.session.headers.update({
+             'User-Agent': 'Mozilla/5.0 (compatible; Bot/1.0; +http://example.com)'
+         })
+         self.results = []
+         self.subdomain = None
+         self.custom_folder = custom_folder
+
+         # Ensure output directory exists
+         os.makedirs(output_dir, exist_ok=True)
+
+     def extract_subdomain(self, url):
+         """Extract the second-level (main) domain from a URL to use as the folder name."""
+         parsed = urlparse(url)
+         hostname = parsed.hostname
+         if hostname:
+             parts = hostname.split('.')
+
+             # Logic for extracting the second-level (main) domain:
+             #   code.claude.com -> parts[-2] = 'claude'
+             #   antigravity.google -> parts[-2] = 'antigravity'
+             #   example.com -> parts[-2] = 'example'
+             #   localhost -> parts[-1] = 'localhost'
+
+             if len(parts) >= 2:
+                 # Take the second-to-last part as the second-level domain
+                 return parts[-2]
+             elif len(parts) == 1:
+                 # Only one part, e.g. localhost
+                 return parts[0]
+
+         return 'default'
+
+     def fetch_sitemap(self):
+         """Fetches and parses the sitemap to extract /docs/ URLs."""
+         if not self.sitemap_url:
+             logger.error("No sitemap URL configured")
+             return []
+
+         try:
+             logger.info(f"Fetching sitemap from {self.sitemap_url}")
+             response = self.session.get(self.sitemap_url)
+             response.raise_for_status()
+
+             # XML parsing (using lxml if available, else html.parser)
+             # sitemap files are often just text/xml
+             soup = BeautifulSoup(response.content, 'xml')
+             urls = [loc.text for loc in soup.find_all('loc')]
+
+             # Filter for /docs/
+             doc_urls = [url for url in urls if '/docs/' in urlparse(url).path]
+             logger.info(f"Found {len(doc_urls)} pages under /docs/")
+             return doc_urls
+         except Exception as e:
+             logger.error(f"Failed to fetch sitemap: {e}")
+             return []
+
+     def extract_links_from_page(self, page, current_url, path_filter='/docs/'):
+         """
+         Extract all links from a page that match the path filter.
+
+         Args:
+             page: Playwright page object
+             current_url: Current page URL
+             path_filter: Path pattern to filter links (default: '/docs/')
+
+         Returns:
+             Set of discovered URLs
+         """
+         links = set()
+
+         try:
+             # Get all <a> tags
+             link_elements = page.query_selector_all('a[href]')
+
+             parsed_base = urlparse(current_url)
+             base_domain = parsed_base.netloc
+
+             for element in link_elements:
+                 href = element.get_attribute('href')
+                 if not href:
+                     continue
+
+                 # Convert relative URLs to absolute
+                 absolute_url = urljoin(current_url, href)
+                 parsed_url = urlparse(absolute_url)
+
+                 # Filter: same domain and contains path_filter
+                 if (parsed_url.netloc == base_domain and
+                         path_filter in parsed_url.path):
+                     # Remove fragment and normalize
+                     clean_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
+                     if parsed_url.query:
+                         clean_url += f"?{parsed_url.query}"
+                     links.add(clean_url)
+
+         except Exception as e:
+             logger.warning(f"Error extracting links from {current_url}: {e}")
+
+         return links
+
+     def discover_links_recursive(self, start_url, path_filter='/docs/', max_depth=MAX_DISCOVERY_DEPTH):
+         """
+         Recursively discover documentation links starting from a URL.
+
+         Args:
+             start_url: Starting URL for discovery
+             path_filter: Path pattern to filter links (default: '/docs/')
+             max_depth: Maximum number of URLs to discover
+
+         Returns:
+             List of discovered URLs
+         """
+         discovered = set()
+         to_visit = {start_url}
+         visited = set()
+
+         logger.info(f"Starting recursive link discovery from {start_url}")
+
+         with sync_playwright() as p:
+             browser = p.chromium.launch(headless=True)
+             context = browser.new_context(
+                 user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+             )
+             page = context.new_page()
+
+             pbar = tqdm(desc="Discovering links", unit="page")
+
+             while to_visit and len(discovered) < max_depth:
+                 current_url = to_visit.pop()
+
+                 if current_url in visited:
+                     continue
+
+                 visited.add(current_url)
+                 discovered.add(current_url)
+                 pbar.update(1)
+                 pbar.set_postfix({"found": len(discovered), "queue": len(to_visit)})
+
+                 try:
+                     # Load the page
+                     page.goto(current_url, timeout=PAGE_LOAD_TIMEOUT)
+                     page.wait_for_load_state('networkidle', timeout=15000)
+
+                     # Extract links from this page
+                     new_links = self.extract_links_from_page(page, current_url, path_filter)
+
+                     # Add new unvisited links to the queue
+                     for link in new_links:
+                         if link not in visited and link not in discovered:
+                             to_visit.add(link)
+
+                 except Exception as e:
+                     logger.warning(f"Failed to process {current_url}: {e}")
+
+             pbar.close()
+             browser.close()
+
+         logger.info(f"Discovery complete. Found {len(discovered)} URLs")
+         return sorted(list(discovered))
+
+     def discover_links(self, start_url=None, path_filter='/docs/', max_depth=MAX_DISCOVERY_DEPTH):
+         """
+         Discover documentation links. Try the sitemap first, then fall back to recursive discovery.
+
+         Args:
+             start_url: Starting URL for recursive discovery (if sitemap fails)
+             path_filter: Path pattern to filter links (default: '/docs/')
+             max_depth: Maximum number of URLs to discover in recursive mode
+
+         Returns:
+             List of discovered URLs
+         """
+         # Try sitemap first
+         urls = self.fetch_sitemap()
+
+         if urls:
+             logger.info(f"Successfully found {len(urls)} URLs from sitemap")
+             return urls
+
+         # Fallback to recursive discovery
+         logger.info("Sitemap not available, using recursive link discovery")
+
+         if not start_url:
+             # Try to construct a starting URL
+             if self.base_url:
+                 start_url = f"{self.base_url}/docs/" if not self.base_url.endswith('/') else f"{self.base_url}docs/"
+             else:
+                 logger.error("No start URL provided and no base_url configured")
+                 return []
+
+         return self.discover_links_recursive(start_url, path_filter, max_depth)
+
+     def process_url_with_playwright(self, page, url):
+         """Downloads and converts a single URL using Playwright."""
+         # If the subdomain has not been set yet, derive it from the current URL or use custom_folder
+         if self.subdomain is None:
+             if self.custom_folder:
+                 self.subdomain = self.custom_folder
+                 logger.info(f"Using custom folder: {self.subdomain}")
+             else:
+                 self.subdomain = self.extract_subdomain(url)
+                 logger.info(f"Using auto-detected folder (domain): {self.subdomain}")
+             # Create the subfolder
+             self.output_subdir = os.path.join(self.output_dir, self.subdomain)
+             os.makedirs(self.output_subdir, exist_ok=True)
+
+         slug = urlparse(url).path.strip('/').replace('/', '_')
+         if not slug:
+             slug = "index"
+         filename = f"{slug}.md"
+         filepath = os.path.join(self.output_subdir, filename)
+
+         content = None
+         title = None
+
+         for attempt in range(MAX_RETRIES):
+             try:
+                 # Navigate to the page
+                 page.goto(url, timeout=PAGE_LOAD_TIMEOUT)
+
+                 # Wait for the main content to finish loading:
+                 # try waiting for an article element or the main region
+                 try:
+                     page.wait_for_selector('article, main, [role="main"]', timeout=10000)
+                 except Exception:
+                     pass
+
+                 # Wait a little longer to make sure JS has fully rendered
+                 page.wait_for_load_state('networkidle', timeout=15000)
+
+                 # Grab the rendered HTML
+                 content = page.content()
+                 break
+             except Exception as e:
+                 logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}")
+                 if attempt == MAX_RETRIES - 1:
+                     logger.error(f"Failed to download {url} after {MAX_RETRIES} attempts")
+                     return None
+
+         if content:
+             try:
+                 markdown_content, page_title = self.convert_to_markdown(content)
+                 title = page_title
+
+                 with open(filepath, 'w', encoding='utf-8') as f:
+                     f.write(markdown_content)
+
+                 return {'title': title, 'url': url, 'file': filename}
+             except Exception as e:
+                 logger.error(f"Error converting {url}: {e}")
+
+         return None
+
+     def convert_to_markdown(self, html_content):
+         """Extracts content and converts to Markdown."""
+         soup = BeautifulSoup(html_content, 'html.parser')
+
+         # Extract title
+         title_tag = soup.find('title')
+         title = title_tag.text.strip() if title_tag else "No Title"
+
+         # Remove unwanted elements
+         for tag in soup.find_all(['nav', 'footer', 'script', 'style', 'noscript', 'iframe', 'header']):
+             tag.decompose()
+
+         # Common classes/IDs for unwanted elements
+         unwanted_selectors = [
+             '.sidebar', '#sidebar',
+             '.toc', '#toc',
+             '.breadcrumbs', '.breadcrumb',
+             '.footer', '.header', '.nav',
+             '[role="navigation"]',
+             '.navigation',
+             '.menu'
+         ]
+         for selector in unwanted_selectors:
+             for element in soup.select(selector):
+                 element.decompose()
+
+         # Prioritize content extraction - try more specific selectors first
+         content_element = None
+
+         # Try to locate the documentation content area
+         content_selectors = [
+             'article',
+             '[role="main"]',
+             '.docs-content',
+             '.content',
+             '.markdown-body',
+             'main',
+             '.main-content'
+         ]
+
+         for selector in content_selectors:
+             content_element = soup.select_one(selector)
+             if content_element and len(content_element.get_text(strip=True)) > 100:
+                 break
+
+         if not content_element:
+             content_element = soup.find('body')
+
+         if not content_element:
+             return "", title
+
+         # Convert to Markdown
+         markdown = md(str(content_element), heading_style="ATX", strip=['img'])
+
+         # Collapse redundant blank lines
+         lines = markdown.split('\n')
+         cleaned_lines = []
+         prev_empty = False
+         for line in lines:
+             is_empty = not line.strip()
+             if is_empty and prev_empty:
+                 continue
+             cleaned_lines.append(line)
+             prev_empty = is_empty
+
+         return '\n'.join(cleaned_lines).strip(), title
+
+     def generate_index(self):
+         """Generates the index.md file."""
+         index_path = os.path.join(self.output_subdir, "index.md")
+         with open(index_path, 'w', encoding='utf-8') as f:
+             f.write("# Documentation Index\n\n")
+             f.write("| Title | Original URL | Local File |\n")
+             f.write("|-------|--------------|------------|\n")
+             for item in sorted(self.results, key=lambda x: x['title']):
+                 f.write(f"| {item['title']} | [{item['url']}]({item['url']}) | [{item['file']}]({item['file']}) |\n")
+         logger.info(f"Generated index at {index_path}")
+
+     def run(self, urls=None, start_url=None, path_filter='/docs/', max_depth=MAX_DISCOVERY_DEPTH):
+         """
+         Run the crawler.
+
+         Args:
+             urls: List of URLs to crawl. If None, uses the discover_links method.
+             start_url: Starting URL for recursive discovery (if needed)
+             path_filter: Path pattern to filter links (default: '/docs/')
+             max_depth: Maximum number of URLs to discover in recursive mode
+         """
+         if urls is None:
+             urls = self.discover_links(start_url, path_filter, max_depth)
+
+         if not urls:
+             logger.warning("No URLs found to process.")
+             return
+
+         logger.info(f"Starting download of {len(urls)} pages using Playwright...")
+
+         with sync_playwright() as p:
+             # Launch the browser
+             browser = p.chromium.launch(headless=True)
+             context = browser.new_context(
+                 user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+             )
+             page = context.new_page()
+
+             # Show progress with tqdm
+             for url in tqdm(urls, unit="page"):
+                 result = self.process_url_with_playwright(page, url)
+                 if result:
+                     self.results.append(result)
+
+             browser.close()
+
+         self.generate_index()
+         logger.info("Done.")
docs_crawler-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,262 @@
+ Metadata-Version: 2.4
+ Name: docs-crawler
+ Version: 0.1.0
+ Summary: A documentation crawler that converts web documentation to Markdown format
+ License: MIT
+ License-File: LICENSE
+ Keywords: crawler,documentation,markdown,scraper
+ Author: nev4rb14su
+ Requires-Python: >=3.8.1,<4.0.0
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: 3.14
+ Classifier: Programming Language :: Python :: 3.8
+ Requires-Dist: beautifulsoup4 (>=4.12.0,<5.0.0)
+ Requires-Dist: lxml (>=5.0.0,<6.0.0)
+ Requires-Dist: markdownify (>=0.11.0,<0.12.0)
+ Requires-Dist: playwright (>=1.40.0,<2.0.0)
+ Requires-Dist: requests (>=2.31.0,<3.0.0)
+ Requires-Dist: tqdm (>=4.66.0,<5.0.0)
+ Project-URL: Homepage, https://github.com/neverbiasu/docs-crawler
+ Description-Content-Type: text/markdown
+
+ # Docs Crawler
+
+ A documentation crawler that converts web documentation to Markdown format, using Playwright to handle JavaScript-rendered content.
+
+ ## Features
+
+ - **Smart Link Discovery**: Tries the sitemap first, then automatically falls back to recursive link discovery
+ - **Discover Mode**: Find and save documentation URLs before crawling
+ - Crawls documentation from sitemaps or URL lists
+ - Uses Playwright to handle JavaScript-rendered Single Page Applications (SPAs)
+ - Converts HTML to clean Markdown format
+ - Auto-detects a domain-based folder structure
+ - Generates an index of all crawled pages
+ - Progress tracking with tqdm
+ - Retry logic for failed requests
+
+ ## Requirements
+
+ - Python 3.8.1+
+ - Poetry (for dependency management)
+
+ ## Installation
+
+ ### Using Poetry (Recommended)
+
+ ```bash
+ # Install Poetry if you haven't already
+ curl -sSL https://install.python-poetry.org | python3 -
+
+ # Clone the repository
+ git clone https://github.com/neverbiasu/docs-crawler.git
+ cd docs-crawler
+
+ # Install dependencies
+ poetry install
+
+ # Install Playwright browsers
+ poetry run playwright install chromium
+ ```
+
+ ### Using pip
+
+ ```bash
+ pip install docs-crawler
+ playwright install chromium
+ ```
+
+ ## Usage
+
+ ### Command Line Interface
+
+ The package provides a `docs-crawler` command with three modes:
+
+ #### 1. Sitemap Mode (Default)
+ Tries to fetch URLs from the sitemap first and automatically falls back to recursive link discovery if no sitemap is available.
+
+ ```bash
+ # Crawl from sitemap (with automatic fallback)
+ poetry run docs-crawler --base-url https://example.com
+
+ # Specify custom sitemap URL
+ poetry run docs-crawler --sitemap-url https://example.com/custom-sitemap.xml
+
+ # Customize path filter and max URLs to discover
+ poetry run docs-crawler --base-url https://example.com --path-filter /docs/ --max-depth 200
+ ```
+
+ #### 2. Discover Mode
+ Discover all documentation URLs and save them to a file for review before crawling.
+
+ ```bash
+ # Discover links and save to auto-generated file (e.g., example_urls.txt)
+ poetry run docs-crawler --mode discover --base-url https://example.com
+
+ # Specify custom output file
+ poetry run docs-crawler --mode discover --base-url https://example.com --output-file my-urls.txt
+
+ # Start from a specific URL
+ poetry run docs-crawler --mode discover --start-url https://example.com/docs/intro
+
+ # Customize discovery settings
+ poetry run docs-crawler --mode discover --base-url https://example.com --path-filter /api/ --max-depth 50
+ ```
+
+ Discover mode will:
+ 1. Find all documentation links (using sitemap or recursive discovery)
+ 2. Display the first 10 URLs as a preview
+ 3. Ask for your confirmation before saving
+ 4. Save the URLs to a file named `{subdomain}_urls.txt` (e.g., `example_urls.txt`), as shown below
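+
+ The saved file is plain text with one URL per line, which is exactly the format that `--mode list --file` reads back in (blank lines are skipped). A hypothetical `example_urls.txt` might look like this:
+
+ ```text
+ https://example.com/docs/getting-started
+ https://example.com/docs/installation
+ https://example.com/docs/api/reference
+ ```
+
+ You can freely reorder, group, or prune entries in this file before crawling.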
+
+ #### 3. List Mode
+ Crawl from a list of URLs in a text file.
+
+ ```bash
+ # Crawl from URL list
+ poetry run docs-crawler --mode list --file urls.txt
+
+ # Specify custom output folder
+ poetry run docs-crawler --mode list --file urls.txt --folder my-docs
+ ```
+
+ #### Common Options
+
+ ```bash
+ # Custom output directory
+ --output-dir custom-output
+
+ # Custom folder name
+ --folder my-docs
+
+ # Path filter for link discovery (default: /docs/)
+ --path-filter /documentation/
+
+ # Maximum URLs to discover (default: 100)
+ --max-depth 500
+
+ # Starting URL for recursive discovery
+ --start-url https://example.com/docs/
+ ```
+
+ ### Python API
+
+ ```python
+ from docs_crawler import Crawler
+
+ # Create crawler instance
+ crawler = Crawler(
+     base_url="https://antigravity.google",
+     output_dir="output",
+     custom_folder="antigravity"
+ )
+
+ # Run with automatic link discovery (sitemap first, then recursive)
+ crawler.run()
+
+ # Discover links only
+ urls = crawler.discover_links(
+     start_url="https://example.com/docs/",
+     path_filter="/docs/",
+     max_depth=100
+ )
+ print(f"Found {len(urls)} URLs")
+
+ # Run with custom URLs
+ crawler.run(urls=[
+     "https://example.com/docs/page1",
+     "https://example.com/docs/page2"
+ ])
+
+ # Run with custom discovery settings
+ crawler.run(
+     start_url="https://example.com/docs/intro",
+     path_filter="/documentation/",
+     max_depth=200
+ )
+ ```
+
+ ## Output
+
+ - The downloaded Markdown files will be saved in the `output/` directory (or a custom directory).
+ - An index of all downloaded pages is available at `output/{folder}/index.md`.
+ - Files are organized by domain or custom folder name; the sketch below shows how URLs map to filenames.
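+
+ As a rough sketch of the naming scheme (mirroring the crawler's internal slug logic; the URL below is hypothetical), the folder comes from the second-level domain and the filename from the URL path with `/` replaced by `_`:
+
+ ```python
+ from urllib.parse import urlparse
+
+ url = "https://example.com/docs/getting-started/install"
+ parsed = urlparse(url)
+
+ folder = parsed.hostname.split('.')[-2]                       # 'example'
+ slug = parsed.path.strip('/').replace('/', '_') or "index"    # 'docs_getting-started_install'
+ print(f"output/{folder}/{slug}.md")                           # output/example/docs_getting-started_install.md
+ ```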
+
+ ## Development
+
+ ```bash
+ # Install development dependencies
+ poetry install --with dev
+
+ # Run tests
+ poetry run pytest
+
+ # Format code
+ poetry run black .
+
+ # Lint code
+ poetry run flake8
+
+ # Type checking
+ poetry run mypy docs_crawler
+ ```
+
+ ## Configuration
+
+ The crawler can be configured through:
+ - Command-line arguments
+ - Python API parameters
+ - Environment variables (coming soon)
+
+ ## How It Works
+
+ ### Link Discovery
+
+ The crawler uses a smart two-step approach (see the sketch after this list):
+
+ 1. **Sitemap First**: Attempts to fetch URLs from the sitemap.xml file
+ 2. **Recursive Discovery Fallback**: If the sitemap is unavailable or empty, automatically discovers links by:
+    - Starting from a base URL (e.g., `/docs/`)
+    - Extracting all internal links matching the path filter
+    - Recursively crawling pages to find more documentation links
+    - Respecting the max-depth limit to avoid excessive crawling
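+
+ In Python terms, calling `discover_links()` is roughly equivalent to the following sketch (the URLs are placeholders):
+
+ ```python
+ from docs_crawler import Crawler
+
+ crawler = Crawler(base_url="https://example.com")
+
+ # Step 1: try the sitemap; this returns an empty list if it is missing or unreachable.
+ urls = crawler.fetch_sitemap()
+
+ # Step 2: fall back to Playwright-based recursive discovery.
+ if not urls:
+     urls = crawler.discover_links_recursive(
+         start_url="https://example.com/docs/",
+         path_filter="/docs/",
+         max_depth=100,
+     )
+ ```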
+
+ ### Workflow Example
+
+ ```bash
+ # Step 1: Discover links and save them for review
+ poetry run docs-crawler --mode discover --base-url https://example.com
+ # Output: example_urls.txt
+
+ # Step 2: Review and edit example_urls.txt if needed
+ # (Remove unwanted URLs, add missing ones, etc.)
+
+ # Step 3: Crawl the URLs
+ poetry run docs-crawler --mode list --file example_urls.txt
+ ```
+
+ ## Notes
+
+ - The crawler uses Playwright to handle JavaScript-rendered content, making it suitable for modern SPAs.
+ - The default path filter is `/docs/` and can be customized with `--path-filter`.
+ - Respects retry limits and timeouts to be polite to servers.
+ - Auto-detects a domain-based folder structure or uses custom folder names.
+ - Recursive discovery avoids infinite loops by tracking visited URLs.
+ - URL files are named using the subdomain for easy identification (e.g., `github_urls.txt`, `example_urls.txt`).
+
+ ## License
+
+ MIT License - see the LICENSE file for details.
+
+ ## Contributing
+
+ Contributions are welcome! Please feel free to submit a Pull Request.
+
docs_crawler-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,8 @@
+ docs_crawler/__init__.py,sha256=9IfLYZnZk_SS5HeOlZ0REqviVItH2JJcsbwq1Sd2Yyc,170
+ docs_crawler/cli.py,sha256=3jdQZyZdsN9pNVoi0J1zRTmKttuB4ctEvbjNJqw4ghA,7682
+ docs_crawler/crawler.py,sha256=1Q1oX8gmWiorR0dUngezTNmV8jvTibfTc5aZQ7cBe88,14647
+ docs_crawler-0.1.0.dist-info/METADATA,sha256=cQ0d7tpUrOALw6RJfF1p9teGKAFIjzydG8hOF6BvFf0,7345
+ docs_crawler-0.1.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+ docs_crawler-0.1.0.dist-info/entry_points.txt,sha256=S5Z3NqFSSjBLd7yO1hF8pHYH7rYOlYHflvauHc7nRpM,54
+ docs_crawler-0.1.0.dist-info/licenses/LICENSE,sha256=igDghpYK4aLmOc5L2tYBYrHZuw1A3sFbZAv2nd3lSsg,1067
+ docs_crawler-0.1.0.dist-info/RECORD,,
docs_crawler-0.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: poetry-core 2.2.1
+ Root-Is-Purelib: true
+ Tag: py3-none-any
docs_crawler-0.1.0.dist-info/entry_points.txt ADDED
@@ -0,0 +1,3 @@
+ [console_scripts]
+ docs-crawler=docs_crawler.cli:main
+
docs_crawler-0.1.0.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 nev4rb14su
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.