scitex 2.11.0__py3-none-any.whl → 2.13.0__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148)
  1. scitex/__main__.py +24 -5
  2. scitex/__version__.py +1 -1
  3. scitex/_optional_deps.py +33 -0
  4. scitex/ai/classification/reporters/_ClassificationReporter.py +1 -1
  5. scitex/ai/classification/timeseries/_TimeSeriesBlockingSplit.py +2 -2
  6. scitex/ai/classification/timeseries/_TimeSeriesCalendarSplit.py +2 -2
  7. scitex/ai/classification/timeseries/_TimeSeriesSlidingWindowSplit.py +2 -2
  8. scitex/ai/classification/timeseries/_TimeSeriesSlidingWindowSplit_v01-not-using-n_splits.py +2 -2
  9. scitex/ai/classification/timeseries/_TimeSeriesStratifiedSplit.py +2 -2
  10. scitex/ai/classification/timeseries/_normalize_timestamp.py +1 -1
  11. scitex/ai/metrics/_calc_seizure_prediction_metrics.py +1 -1
  12. scitex/ai/plt/_plot_feature_importance.py +1 -1
  13. scitex/ai/plt/_plot_learning_curve.py +1 -1
  14. scitex/ai/plt/_plot_optuna_study.py +1 -1
  15. scitex/ai/plt/_plot_pre_rec_curve.py +1 -1
  16. scitex/ai/plt/_plot_roc_curve.py +1 -1
  17. scitex/ai/plt/_stx_conf_mat.py +1 -1
  18. scitex/ai/training/_LearningCurveLogger.py +1 -1
  19. scitex/audio/mcp_server.py +38 -8
  20. scitex/browser/automation/CookieHandler.py +1 -1
  21. scitex/browser/core/BrowserMixin.py +1 -1
  22. scitex/browser/core/ChromeProfileManager.py +1 -1
  23. scitex/browser/debugging/_browser_logger.py +1 -1
  24. scitex/browser/debugging/_highlight_element.py +1 -1
  25. scitex/browser/debugging/_show_grid.py +1 -1
  26. scitex/browser/interaction/click_center.py +1 -1
  27. scitex/browser/interaction/click_with_fallbacks.py +1 -1
  28. scitex/browser/interaction/close_popups.py +1 -1
  29. scitex/browser/interaction/fill_with_fallbacks.py +1 -1
  30. scitex/browser/pdf/click_download_for_chrome_pdf_viewer.py +1 -1
  31. scitex/browser/pdf/detect_chrome_pdf_viewer.py +1 -1
  32. scitex/browser/stealth/HumanBehavior.py +1 -1
  33. scitex/browser/stealth/StealthManager.py +1 -1
  34. scitex/canvas/_mcp_handlers.py +372 -0
  35. scitex/canvas/_mcp_tool_schemas.py +219 -0
  36. scitex/canvas/mcp_server.py +151 -0
  37. scitex/capture/mcp_server.py +41 -12
  38. scitex/cli/audio.py +233 -0
  39. scitex/cli/capture.py +307 -0
  40. scitex/cli/main.py +27 -4
  41. scitex/cli/repro.py +233 -0
  42. scitex/cli/resource.py +240 -0
  43. scitex/cli/stats.py +325 -0
  44. scitex/cli/template.py +236 -0
  45. scitex/cli/tex.py +286 -0
  46. scitex/cli/web.py +11 -12
  47. scitex/dev/__init__.py +3 -0
  48. scitex/dev/_pyproject.py +405 -0
  49. scitex/dev/plt/__init__.py +2 -2
  50. scitex/dev/plt/mpl/get_dir_ax.py +1 -1
  51. scitex/dev/plt/mpl/get_signatures.py +1 -1
  52. scitex/dev/plt/mpl/get_signatures_details.py +1 -1
  53. scitex/diagram/_mcp_handlers.py +400 -0
  54. scitex/diagram/_mcp_tool_schemas.py +157 -0
  55. scitex/diagram/mcp_server.py +151 -0
  56. scitex/dsp/_demo_sig.py +51 -5
  57. scitex/dsp/_mne.py +13 -2
  58. scitex/dsp/_modulation_index.py +15 -3
  59. scitex/dsp/_pac.py +23 -5
  60. scitex/dsp/_psd.py +16 -4
  61. scitex/dsp/_resample.py +24 -4
  62. scitex/dsp/_transform.py +16 -3
  63. scitex/dsp/add_noise.py +15 -1
  64. scitex/dsp/norm.py +17 -2
  65. scitex/dsp/reference.py +17 -1
  66. scitex/dsp/utils/_differential_bandpass_filters.py +20 -2
  67. scitex/dsp/utils/_zero_pad.py +18 -4
  68. scitex/dt/_normalize_timestamp.py +1 -1
  69. scitex/git/_session.py +1 -1
  70. scitex/io/_load_modules/_con.py +12 -1
  71. scitex/io/_load_modules/_eeg.py +12 -1
  72. scitex/io/_load_modules/_optuna.py +21 -63
  73. scitex/io/_load_modules/_torch.py +11 -3
  74. scitex/io/_save_modules/_optuna_study_as_csv_and_pngs.py +13 -2
  75. scitex/io/_save_modules/_torch.py +11 -3
  76. scitex/mcp_server.py +159 -0
  77. scitex/plt/_mcp_handlers.py +361 -0
  78. scitex/plt/_mcp_tool_schemas.py +169 -0
  79. scitex/plt/mcp_server.py +205 -0
  80. scitex/repro/README_RandomStateManager.md +3 -3
  81. scitex/repro/_RandomStateManager.py +14 -14
  82. scitex/repro/_gen_ID.py +1 -1
  83. scitex/repro/_gen_timestamp.py +1 -1
  84. scitex/repro/_hash_array.py +4 -4
  85. scitex/scholar/__main__.py +24 -2
  86. scitex/scholar/_mcp_handlers.py +685 -0
  87. scitex/scholar/_mcp_tool_schemas.py +339 -0
  88. scitex/scholar/docs/template.py +1 -1
  89. scitex/scholar/examples/07_storage_integration.py +1 -1
  90. scitex/scholar/impact_factor/jcr/ImpactFactorJCREngine.py +1 -1
  91. scitex/scholar/impact_factor/jcr/build_database.py +1 -1
  92. scitex/scholar/mcp_server.py +315 -0
  93. scitex/scholar/pdf_download/ScholarPDFDownloader.py +1 -1
  94. scitex/scholar/pipelines/ScholarPipelineBibTeX.py +1 -1
  95. scitex/scholar/pipelines/ScholarPipelineParallel.py +1 -1
  96. scitex/scholar/pipelines/ScholarPipelineSingle.py +1 -1
  97. scitex/scholar/storage/PaperIO.py +1 -1
  98. scitex/session/README.md +4 -4
  99. scitex/session/__init__.py +1 -1
  100. scitex/session/_decorator.py +9 -9
  101. scitex/session/_lifecycle.py +5 -5
  102. scitex/session/template.py +1 -1
  103. scitex/stats/__main__.py +281 -0
  104. scitex/stats/_mcp_handlers.py +1191 -0
  105. scitex/stats/_mcp_tool_schemas.py +384 -0
  106. scitex/stats/correct/_correct_bonferroni.py +1 -1
  107. scitex/stats/correct/_correct_fdr.py +1 -1
  108. scitex/stats/correct/_correct_fdr_.py +1 -1
  109. scitex/stats/correct/_correct_holm.py +1 -1
  110. scitex/stats/correct/_correct_sidak.py +1 -1
  111. scitex/stats/effect_sizes/_cliffs_delta.py +1 -1
  112. scitex/stats/effect_sizes/_cohens_d.py +1 -1
  113. scitex/stats/effect_sizes/_epsilon_squared.py +1 -1
  114. scitex/stats/effect_sizes/_eta_squared.py +1 -1
  115. scitex/stats/effect_sizes/_prob_superiority.py +1 -1
  116. scitex/stats/mcp_server.py +405 -0
  117. scitex/stats/posthoc/_dunnett.py +1 -1
  118. scitex/stats/posthoc/_games_howell.py +1 -1
  119. scitex/stats/posthoc/_tukey_hsd.py +1 -1
  120. scitex/stats/power/_power.py +1 -1
  121. scitex/stats/utils/_effect_size.py +1 -1
  122. scitex/stats/utils/_formatters.py +1 -1
  123. scitex/stats/utils/_power.py +1 -1
  124. scitex/template/_mcp_handlers.py +259 -0
  125. scitex/template/_mcp_tool_schemas.py +112 -0
  126. scitex/template/mcp_server.py +186 -0
  127. scitex/utils/_verify_scitex_format.py +2 -2
  128. scitex/utils/template.py +1 -1
  129. scitex/web/__init__.py +12 -11
  130. scitex/web/_scraping.py +26 -265
  131. scitex/web/download_images.py +316 -0
  132. scitex/writer/Writer.py +1 -1
  133. scitex/writer/_clone_writer_project.py +1 -1
  134. scitex/writer/_validate_tree_structures.py +1 -1
  135. scitex/writer/dataclasses/config/_WriterConfig.py +1 -1
  136. scitex/writer/dataclasses/contents/_ManuscriptContents.py +1 -1
  137. scitex/writer/dataclasses/core/_Document.py +1 -1
  138. scitex/writer/dataclasses/core/_DocumentSection.py +1 -1
  139. scitex/writer/dataclasses/results/_CompilationResult.py +1 -1
  140. scitex/writer/dataclasses/results/_LaTeXIssue.py +1 -1
  141. scitex/writer/utils/.legacy_git_retry.py +7 -5
  142. scitex/writer/utils/_parse_latex_logs.py +1 -1
  143. {scitex-2.11.0.dist-info → scitex-2.13.0.dist-info}/METADATA +431 -269
  144. {scitex-2.11.0.dist-info → scitex-2.13.0.dist-info}/RECORD +147 -118
  145. scitex-2.13.0.dist-info/entry_points.txt +11 -0
  146. scitex-2.11.0.dist-info/entry_points.txt +0 -2
  147. {scitex-2.11.0.dist-info → scitex-2.13.0.dist-info}/WHEEL +0 -0
  148. {scitex-2.11.0.dist-info → scitex-2.13.0.dist-info}/licenses/LICENSE +0 -0
scitex/web/_scraping.py CHANGED
@@ -1,40 +1,21 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 # File: ./src/scitex/web/_scraping.py
 
-"""Web scraping utilities for extracting URLs and downloading images."""
+"""Web scraping utilities for extracting URLs."""
 
-import os
 import re
 import urllib.parse
-from datetime import datetime
-from pathlib import Path
-from typing import List, Optional, Set, Tuple
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import List, Optional, Set
 
 import requests
 from bs4 import BeautifulSoup
-from tqdm import tqdm
-
-try:
-    from PIL import Image
-    from io import BytesIO
-
-    PILLOW_AVAILABLE = True
-except ImportError:
-    PILLOW_AVAILABLE = False
 
 from scitex.logging import getLogger
 
 logger = getLogger(__name__)
 
-
-def _get_default_download_dir() -> str:
-    """Get default download directory using SCITEX_DIR if available."""
-    scitex_root = os.environ.get("SCITEX_DIR")
-    if scitex_root is None:
-        scitex_root = os.path.expanduser("~/.scitex")
-    return os.path.join(scitex_root, "web", "downloads")
+DEFAULT_TIMEOUT = 10
+DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
 
 
 def get_urls(
@@ -49,7 +30,7 @@ def get_urls(
 
     Args:
         url: The URL of the webpage to scrape
-        pattern: Optional regex pattern to filter URLs (e.g., r'\.pdf$' for PDF files)
+        pattern: Optional regex pattern to filter URLs (e.g., r'\\.pdf$' for PDF files)
         absolute: If True, convert relative URLs to absolute URLs
         same_domain: If True, only return URLs from the same domain
         include_external: If True, include external links (only applies if same_domain=False)
@@ -58,12 +39,16 @@ def get_urls(
         List of URLs found on the page
 
     Example:
-        >>> urls = get_urls('https://example.com', pattern=r'\.pdf$')
+        >>> urls = get_urls('https://example.com', pattern=r'\\.pdf$')
        >>> urls = get_urls('https://example.com', same_domain=True)
    """
    try:
        logger.info(f"Fetching URLs from: {url}")
-        response = requests.get(url, timeout=30)
+        response = requests.get(
+            url,
+            timeout=DEFAULT_TIMEOUT,
+            headers={"User-Agent": DEFAULT_USER_AGENT},
+        )
        response.raise_for_status()
    except requests.RequestException as e:
        logger.error(f"Failed to fetch URL {url}: {e}")
@@ -72,19 +57,14 @@ def get_urls(
     soup = BeautifulSoup(response.text, "html.parser")
     urls_found: Set[str] = set()
 
-    # Parse the base domain
     parsed_base = urllib.parse.urlparse(url)
-    base_domain = f"{parsed_base.scheme}://{parsed_base.netloc}"
 
-    # Find all links
     for link in soup.find_all("a", href=True):
         href = link["href"]
 
-        # Convert to absolute URL if requested
         if absolute:
             href = urllib.parse.urljoin(url, href)
 
-        # Filter by domain if requested
         if same_domain:
             parsed_href = urllib.parse.urlparse(href)
             if parsed_href.netloc != parsed_base.netloc:
@@ -94,10 +74,8 @@ def get_urls(
             if parsed_href.netloc and parsed_href.netloc != parsed_base.netloc:
                 continue
 
-        # Filter by pattern if provided
-        if pattern:
-            if not re.search(pattern, href):
-                continue
+        if pattern and not re.search(pattern, href):
+            continue
 
         urls_found.add(href)
 
@@ -106,171 +84,6 @@ def get_urls(
     return result
 
 
-def download_images(
-    url: str,
-    output_dir: Optional[str] = None,
-    pattern: Optional[str] = None,
-    min_size: Optional[Tuple[int, int]] = None,
-    max_workers: int = 5,
-    same_domain: bool = False,
-) -> List[str]:
-    """
-    Download all images from a webpage.
-
-    Args:
-        url: The URL of the webpage to scrape
-        output_dir: Directory to save images. Priority:
-            1. This parameter if specified
-            2. $SCITEX_WEB_DOWNLOADS_DIR environment variable
-            3. $SCITEX_DIR/web/downloads (default)
-        pattern: Optional regex pattern to filter image URLs
-        min_size: Optional minimum size as (width, height) tuple to filter images
-        max_workers: Number of concurrent download threads
-        same_domain: If True, only download images from the same domain
-
-    Returns:
-        List of paths to downloaded images
-
-    Note:
-        - SVG files are automatically skipped (vector graphics)
-        - Images are saved in timestamped subdirectories: images-YYYYMMDD_HHMMSS/
-
-    Example:
-        >>> paths = download_images('https://example.com', output_dir='./downloads')
-        >>> paths = download_images('https://example.com', min_size=(100, 100))
-        >>> # Uses $SCITEX_WEB_DOWNLOADS_DIR or $SCITEX_DIR/web/downloads
-        >>> paths = download_images('https://example.com')
-    """
-    if not PILLOW_AVAILABLE:
-        logger.warning("Pillow is not available. min_size filtering will be disabled.")
-
-    # Set default output directory
-    if output_dir is None:
-        # Check SCITEX_WEB_DOWNLOADS_DIR first
-        output_dir = os.environ.get("SCITEX_WEB_DOWNLOADS_DIR")
-        if output_dir is None:
-            # Fall back to SCITEX_DIR/web/downloads
-            output_dir = _get_default_download_dir()
-
-    # Create timestamped subdirectory
-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    output_path = Path(output_dir).expanduser() / f"images-{timestamp}"
-    output_path.mkdir(parents=True, exist_ok=True)
-
-    logger.info(f"Saving images to: {output_path}")
-
-    try:
-        logger.info(f"Fetching page: {url}")
-        response = requests.get(url, timeout=30)
-        response.raise_for_status()
-    except requests.RequestException as e:
-        logger.error(f"Failed to fetch URL {url}: {e}")
-        return []
-
-    soup = BeautifulSoup(response.text, "html.parser")
-    image_urls: Set[str] = set()
-
-    # Parse the base domain
-    parsed_base = urllib.parse.urlparse(url)
-
-    # Find all image tags
-    for img in soup.find_all("img", src=True):
-        img_url = img["src"]
-
-        # Convert to absolute URL
-        img_url = urllib.parse.urljoin(url, img_url)
-
-        # Skip SVG files (vector graphics, not raster images)
-        if img_url.lower().endswith((".svg", ".svgz")):
-            continue
-
-        # Filter by domain if requested
-        if same_domain:
-            parsed_img = urllib.parse.urlparse(img_url)
-            if parsed_img.netloc != parsed_base.netloc:
-                continue
-
-        # Filter by pattern if provided
-        if pattern:
-            if not re.search(pattern, img_url):
-                continue
-
-        image_urls.add(img_url)
-
-    logger.info(f"Found {len(image_urls)} images")
-
-    # Download images
-    downloaded_paths = []
-
-    def download_image(img_url: str) -> Optional[str]:
-        try:
-            img_response = requests.get(img_url, timeout=30)
-            img_response.raise_for_status()
-
-            # Check image size if requested and Pillow is available
-            if min_size and PILLOW_AVAILABLE:
-                try:
-                    img = Image.open(BytesIO(img_response.content))
-                    if img.size[0] < min_size[0] or img.size[1] < min_size[1]:
-                        return None
-                except Exception:
-                    pass
-
-            # Generate filename from URL
-            parsed_url = urllib.parse.urlparse(img_url)
-            filename = Path(parsed_url.path).name
-
-            # If filename is empty or doesn't have extension, generate one
-            if not filename or "." not in filename:
-                ext = ".jpg"  # default extension
-                if "content-type" in img_response.headers:
-                    content_type = img_response.headers["content-type"]
-                    if "png" in content_type:
-                        ext = ".png"
-                    elif "gif" in content_type:
-                        ext = ".gif"
-                    elif "webp" in content_type:
-                        ext = ".webp"
-                filename = f"image_{hash(img_url)}{ext}"
-
-            # Save image
-            file_path = output_path / filename
-
-            # Handle duplicate filenames
-            counter = 1
-            original_stem = file_path.stem
-            while file_path.exists():
-                file_path = output_path / f"{original_stem}_{counter}{file_path.suffix}"
-                counter += 1
-
-            with open(file_path, "wb") as f:
-                f.write(img_response.content)
-
-            return str(file_path)
-
-        except Exception as e:
-            logger.warning(f"Failed to download image {img_url}: {e}")
-            return None
-
-    # Download images concurrently
-    with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        future_to_url = {
-            executor.submit(download_image, img_url): img_url for img_url in image_urls
-        }
-
-        for future in tqdm(
-            as_completed(future_to_url),
-            total=len(image_urls),
-            desc="Downloading images",
-        ):
-            result = future.result()
-            if result:
-                downloaded_paths.append(result)
-
-    logger.info(f"Downloaded {len(downloaded_paths)} images to {output_dir}")
-    return downloaded_paths
-
-
 def get_image_urls(
     url: str,
     pattern: Optional[str] = None,
@@ -289,14 +102,19 @@ def get_image_urls(
 
     Note:
         - SVG files are automatically skipped (vector graphics)
+        - Checks both 'src' and 'data-src' attributes for lazy-loaded images
 
     Example:
         >>> img_urls = get_image_urls('https://example.com')
-        >>> img_urls = get_image_urls('https://example.com', pattern=r'\.png$')
+        >>> img_urls = get_image_urls('https://example.com', pattern=r'\\.png$')
    """
    try:
        logger.info(f"Fetching image URLs from: {url}")
-        response = requests.get(url, timeout=30)
+        response = requests.get(
+            url,
+            timeout=DEFAULT_TIMEOUT,
+            headers={"User-Agent": DEFAULT_USER_AGENT},
+        )
        response.raise_for_status()
    except requests.RequestException as e:
        logger.error(f"Failed to fetch URL {url}: {e}")
@@ -305,85 +123,28 @@ def get_image_urls(
     soup = BeautifulSoup(response.text, "html.parser")
     image_urls: Set[str] = set()
 
-    # Parse the base domain
     parsed_base = urllib.parse.urlparse(url)
 
-    # Find all image tags
-    for img in soup.find_all("img", src=True):
-        img_url = img["src"]
+    for img in soup.find_all("img"):
+        img_url = img.get("src") or img.get("data-src")
+        if not img_url:
+            continue
 
-        # Convert to absolute URL
         img_url = urllib.parse.urljoin(url, img_url)
 
-        # Skip SVG files (vector graphics, not raster images)
         if img_url.lower().endswith((".svg", ".svgz")):
             continue
 
-        # Filter by domain if requested
         if same_domain:
             parsed_img = urllib.parse.urlparse(img_url)
             if parsed_img.netloc != parsed_base.netloc:
                 continue
 
-        # Filter by pattern if provided
-        if pattern:
-            if not re.search(pattern, img_url):
-                continue
+        if pattern and not re.search(pattern, img_url):
+            continue
 
         image_urls.add(img_url)
 
     result = sorted(list(image_urls))
     logger.info(f"Found {len(result)} image URLs")
     return result
-
-
-if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser(description="Web scraping utilities")
-    parser.add_argument("url", type=str, help="URL to scrape")
-    parser.add_argument(
-        "--mode",
-        "-m",
-        choices=["urls", "images", "image_urls"],
-        default="urls",
-        help="Scraping mode",
-    )
-    parser.add_argument("--output", "-o", type=str, help="Output directory for images")
-    parser.add_argument(
-        "--pattern", "-p", type=str, help="Regex pattern to filter URLs"
-    )
-    parser.add_argument(
-        "--same-domain", action="store_true", help="Only include URLs from same domain"
-    )
-    parser.add_argument(
-        "--min-size", type=str, help="Minimum image size as WIDTHxHEIGHT"
-    )
-
-    args = parser.parse_args()
-
-    if args.mode == "urls":
-        urls = get_urls(args.url, pattern=args.pattern, same_domain=args.same_domain)
-        for url in urls:
-            print(url)
-    elif args.mode == "images":
-        min_size = None
-        if args.min_size:
-            width, height = map(int, args.min_size.split("x"))
-            min_size = (width, height)
-
-        paths = download_images(
-            args.url,
-            output_dir=args.output,
-            pattern=args.pattern,
-            min_size=min_size,
-            same_domain=args.same_domain,
-        )
-        for path in paths:
-            print(path)
-    elif args.mode == "image_urls":
-        img_urls = get_image_urls(
-            args.url, pattern=args.pattern, same_domain=args.same_domain
-        )
-        for img_url in img_urls:
-            print(img_url)
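Taken together, these changes strip _scraping.py down to URL extraction only; downloading moves to the new scitex/web/download_images.py shown next. A minimal usage sketch based on the docstring examples above (importing from the private module path given in the file header; the public re-export via scitex.web may differ):

    # Hypothetical usage sketch; mirrors the docstring examples in the diff above.
    from scitex.web._scraping import get_urls, get_image_urls

    # Links matching a regex, restricted to the page's own domain
    pdf_links = get_urls("https://example.com", pattern=r"\.pdf$", same_domain=True)

    # Image URLs, now also picking up lazy-loaded images declared via data-src
    image_links = get_image_urls("https://example.com", pattern=r"\.png$")

    for link in pdf_links + image_links:
        print(link)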
scitex/web/download_images.py ADDED
@@ -0,0 +1,316 @@
+#!/usr/bin/env python3
+# File: ./src/scitex/web/download_images.py
+
+"""
+Image Downloader for SciTeX.
+
+Downloads images from URLs with minimum size filtering.
+
+Usage:
+    python -m scitex.web.download_images https://example.com
+    python -m scitex.web.download_images https://example.com -o ./downloads
+    python -m scitex.web.download_images https://example.com --min-size 800x600
+"""
+
+import os
+import re
+import urllib.parse
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from datetime import datetime
+from pathlib import Path
+from typing import List, Optional, Tuple
+
+import requests
+from bs4 import BeautifulSoup
+from tqdm import tqdm
+
+try:
+    from io import BytesIO
+
+    from PIL import Image
+
+    PILLOW_AVAILABLE = True
+except ImportError:
+    PILLOW_AVAILABLE = False
+
+from scitex.logging import getLogger
+
+logger = getLogger(__name__)
+
+# Configuration
+DEFAULT_MIN_WIDTH = 400
+DEFAULT_MIN_HEIGHT = 300
+DEFAULT_TIMEOUT = 10
+DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+
+
+def _get_default_download_dir() -> str:
+    """Get default download directory using SCITEX_DIR if available."""
+    scitex_root = os.environ.get("SCITEX_DIR", os.path.expanduser("~/.scitex"))
+    return os.path.join(scitex_root, "web", "downloads")
+
+
+def _normalize_url_for_directory(url: str) -> str:
+    """Convert URL to a safe directory name."""
+    parsed = urllib.parse.urlparse(url)
+    domain = parsed.netloc.replace("www.", "")
+    path = parsed.path.strip("/").replace("/", "-")
+
+    normalized = f"{domain}-{path}" if path else domain
+    normalized = re.sub(r"[^\w\-.]", "-", normalized)
+    normalized = re.sub(r"-+", "-", normalized)
+    normalized = normalized[:100].strip("-")
+
+    return normalized
+
+
+def _is_direct_image_url(url: str) -> bool:
+    """Check if URL appears to be a direct image link."""
+    extensions = [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"]
+    path = urllib.parse.urlparse(url.lower()).path
+    return any(path.endswith(ext) for ext in extensions)
+
+
+def _extract_image_urls(url: str, same_domain: bool = False) -> List[str]:
+    """Extract image URLs from a webpage."""
+    try:
+        logger.info(f"Fetching page: {url}")
+        response = requests.get(
+            url,
+            timeout=DEFAULT_TIMEOUT,
+            headers={"User-Agent": DEFAULT_USER_AGENT},
+        )
+        response.raise_for_status()
+    except requests.RequestException as e:
+        logger.error(f"Failed to fetch page: {e}")
+        return []
+
+    soup = BeautifulSoup(response.content, "html.parser")
+    parsed_base = urllib.parse.urlparse(url)
+    image_urls = set()
+
+    for img in soup.find_all("img"):
+        img_url = img.get("src") or img.get("data-src")
+        if not img_url:
+            continue
+
+        img_url = urllib.parse.urljoin(url, img_url)
+
+        if img_url.lower().endswith((".svg", ".svgz")):
+            continue
+
+        if same_domain:
+            parsed_img = urllib.parse.urlparse(img_url)
+            if parsed_img.netloc != parsed_base.netloc:
+                continue
+
+        image_urls.add(img_url)
+
+    logger.info(f"Found {len(image_urls)} images on page")
+    return list(image_urls)
+
+
+def _download_single_image(
+    img_url: str,
+    output_dir: Path,
+    counter: int,
+    min_size: Optional[Tuple[int, int]],
+) -> Optional[str]:
+    """Download a single image."""
+    try:
+        response = requests.get(
+            img_url,
+            timeout=DEFAULT_TIMEOUT,
+            headers={"User-Agent": DEFAULT_USER_AGENT},
+        )
+        response.raise_for_status()
+
+        # Validate content-type
+        content_type = response.headers.get("content-type", "")
+        if not content_type.startswith("image/"):
+            logger.debug(f"Skipping non-image: {content_type}")
+            return None
+
+        # Check dimensions
+        if min_size and PILLOW_AVAILABLE:
+            try:
+                img = Image.open(BytesIO(response.content))
+                width, height = img.size
+                if width < min_size[0] or height < min_size[1]:
+                    logger.debug(
+                        f"Skipping small image: {width}x{height} "
+                        f"(min: {min_size[0]}x{min_size[1]})"
+                    )
+                    return None
+            except Exception:
+                pass
+
+        # Determine extension
+        ext = "jpg"
+        if PILLOW_AVAILABLE:
+            try:
+                img = Image.open(BytesIO(response.content))
+                fmt = img.format.lower() if img.format else "jpeg"
+                ext = "jpg" if fmt == "jpeg" else fmt
+            except Exception:
+                pass
+        elif "png" in content_type:
+            ext = "png"
+        elif "gif" in content_type:
+            ext = "gif"
+        elif "webp" in content_type:
+            ext = "webp"
+
+        filename = f"{counter:04d}.{ext}"
+        filepath = output_dir / filename
+
+        with open(filepath, "wb") as f:
+            f.write(response.content)
+
+        logger.info(f"Downloaded: {filename}")
+        return str(filepath)
+
+    except Exception as e:
+        logger.warning(f"Error downloading {img_url}: {e}")
+        return None
+
+
+def download_images(
+    url: str,
+    output_dir: Optional[str] = None,
+    min_size: Optional[Tuple[int, int]] = None,
+    max_workers: int = 5,
+    same_domain: bool = False,
+) -> List[str]:
+    """
+    Download images from a URL.
+
+    Args:
+        url: Webpage URL or direct image URL
+        output_dir: Output directory (default: $SCITEX_DIR/web/downloads)
+        min_size: Minimum (width, height) to filter small images (default: 400x300)
+        max_workers: Concurrent download threads
+        same_domain: Only download images from the same domain
+
+    Returns:
+        List of downloaded file paths
+
+    Example:
+        >>> paths = download_images("https://example.com")
+        >>> paths = download_images("https://example.com/photo.jpg")
+        >>> paths = download_images("https://example.com", min_size=(800, 600))
+    """
+    if not PILLOW_AVAILABLE:
+        logger.warning("Pillow not available. Size filtering disabled.")
+        min_size = None
+    elif min_size is None:
+        min_size = (DEFAULT_MIN_WIDTH, DEFAULT_MIN_HEIGHT)
+
+    # Setup output directory
+    if output_dir is None:
+        output_dir = os.environ.get("SCITEX_WEB_DOWNLOADS_DIR")
+        if output_dir is None:
+            output_dir = _get_default_download_dir()
+
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    normalized = _normalize_url_for_directory(url)
+    output_path = Path(output_dir).expanduser() / f"{timestamp}-{normalized}-images"
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    logger.info(f"Output directory: {output_path}")
+
+    # Get image URLs
+    if _is_direct_image_url(url):
+        image_urls = [url]
+        logger.info("Direct image URL detected")
+    else:
+        image_urls = _extract_image_urls(url, same_domain=same_domain)
+
+    if not image_urls:
+        logger.warning("No images found")
+        return []
+
+    # Download concurrently
+    downloaded = []
+    counter = [1]
+
+    def download_with_counter(img_url: str) -> Optional[str]:
+        idx = counter[0]
+        counter[0] += 1
+        return _download_single_image(img_url, output_path, idx, min_size)
+
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = {executor.submit(download_with_counter, u): u for u in image_urls}
+
+        for future in tqdm(
+            as_completed(futures), total=len(image_urls), desc="Downloading"
+        ):
+            result = future.result()
+            if result:
+                downloaded.append(result)
+
+    logger.info(f"Downloaded {len(downloaded)} images to {output_path}")
+    return downloaded
+
+
+def main():
+    """CLI entry point."""
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="Download images from URL",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+    python -m scitex.web.download_images https://example.com
+    python -m scitex.web.download_images https://example.com -o ./downloads
+    python -m scitex.web.download_images https://example.com --min-size 800x600
+    python -m scitex.web.download_images https://example.com --no-min-size
+""",
+    )
+    parser.add_argument("url", help="URL to download images from")
+    parser.add_argument("-o", "--output", help="Output directory")
+    parser.add_argument(
+        "--min-size",
+        default="400x300",
+        help="Minimum size WIDTHxHEIGHT (default: 400x300)",
+    )
+    parser.add_argument(
+        "--no-min-size",
+        action="store_true",
+        help="Disable size filtering",
+    )
+    parser.add_argument(
+        "--same-domain",
+        action="store_true",
+        help="Only download from same domain",
+    )
+    parser.add_argument(
+        "--workers",
+        type=int,
+        default=5,
+        help="Concurrent downloads (default: 5)",
+    )
+
+    args = parser.parse_args()
+
+    min_size = None
+    if not args.no_min_size and args.min_size:
+        w, h = map(int, args.min_size.split("x"))
+        min_size = (w, h)
+
+    paths = download_images(
+        args.url,
+        output_dir=args.output,
+        min_size=min_size,
+        max_workers=args.workers,
+        same_domain=args.same_domain,
+    )
+
+    print(f"\nDownloaded {len(paths)} images:")
+    for p in paths:
+        print(f"  {p}")
+
+
+if __name__ == "__main__":
+    main()
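For reference, a minimal sketch of calling the new module programmatically, using only the download_images() signature shown in the diff above (the URL and output directory are illustrative):

    # Illustrative values; the keyword arguments match the signature in the diff above.
    from scitex.web.download_images import download_images

    paths = download_images(
        "https://example.com",      # webpage or direct image URL
        output_dir="./downloads",   # defaults to $SCITEX_WEB_DOWNLOADS_DIR, then $SCITEX_DIR/web/downloads
        min_size=(800, 600),        # forced to None when Pillow is unavailable
        max_workers=5,
        same_domain=True,
    )
    print(f"Saved {len(paths)} images")

The same behaviour is available on the command line via python -m scitex.web.download_images, as documented in the module docstring.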