rust-crate-pipeline 1.3.2-py3-none-any.whl → 1.3.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rust_crate_pipeline/network.py +321 -327
- rust_crate_pipeline/progress_monitor.py +334 -0
- rust_crate_pipeline/version.py +74 -74
- {rust_crate_pipeline-1.3.2.dist-info → rust_crate_pipeline-1.3.4.dist-info}/METADATA +1 -1
- {rust_crate_pipeline-1.3.2.dist-info → rust_crate_pipeline-1.3.4.dist-info}/RECORD +9 -8
- {rust_crate_pipeline-1.3.2.dist-info → rust_crate_pipeline-1.3.4.dist-info}/WHEEL +0 -0
- {rust_crate_pipeline-1.3.2.dist-info → rust_crate_pipeline-1.3.4.dist-info}/entry_points.txt +0 -0
- {rust_crate_pipeline-1.3.2.dist-info → rust_crate_pipeline-1.3.4.dist-info}/licenses/LICENSE +0 -0
- {rust_crate_pipeline-1.3.2.dist-info → rust_crate_pipeline-1.3.4.dist-info}/top_level.txt +0 -0
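A listing like the one above can be reproduced locally by unpacking both wheels and comparing members. The sketch below is one way to do it with only the standard library; the wheel filenames come from the header above and are assumed to sit in the working directory:

import zipfile

def wheel_members(path: str) -> dict[str, int]:
    # Map each archive member to its CRC32 so equal files compare
    # equal without extracting anything.
    with zipfile.ZipFile(path) as zf:
        return {info.filename: info.CRC for info in zf.infolist()}

old = wheel_members("rust_crate_pipeline-1.3.2-py3-none-any.whl")
new = wheel_members("rust_crate_pipeline-1.3.4-py3-none-any.whl")

added = sorted(new.keys() - old.keys())      # e.g. progress_monitor.py
removed = sorted(old.keys() - new.keys())    # e.g. the 1.3.2 dist-info files
changed = sorted(n for n in old.keys() & new.keys() if old[n] != new[n])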
rust_crate_pipeline/network.py
CHANGED
@@ -1,327 +1,321 @@
 # network.py
 import os
 import sys
 import re
 import time
 import logging
 import requests
 from bs4 import BeautifulSoup, Tag
 from typing import Any, Union
 from .config import PipelineConfig

 # Import utilities
 # Add the parent directory to the path to import utils
 sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))


 class GitHubBatchClient:
     def __init__(self, config: PipelineConfig) -> None:
         self.config = config
         # Simple headers without dependency on HTTPClientUtils
         self.headers = {
             "Accept": "application/vnd.github.v3+json",
             "User-Agent": "SigilDERG-Data-Production/1.3.2",
         }
         if config.github_token:
             self.headers["Authorization"] = f"token {config.github_token}"

         # Simple session without dependency on HTTPClientUtils
         self.session = requests.Session()
         self.session.headers.update(self.headers)
         self.remaining_calls = 5000
         self.reset_time = 0

     def check_rate_limit(self) -> None:
         """Check and update current rate limit status"""
         try:
             response = self.session.get(
                 "https://api.github.com/rate_limit", headers=self.headers
             )
             if response.ok:
                 data = response.json()
                 self.remaining_calls = data["resources"]["core"]["remaining"]
                 self.reset_time = data["resources"]["core"]["reset"]

                 if self.remaining_calls < 100:
                     reset_in = self.reset_time - time.time()
                     logging.warning(
-                        f"GitHub API rate limit low: {self.remaining_calls} remaining. "
- [old lines 49-322 are blank or reduced to stray fragments ("response", "self.", quote marks, closing braces) in the source rendering and are not recoverable]
-        except Exception:
-            pass
-
-        # If all sources fail
-        return None
+                        f"GitHub API rate limit low: {self.remaining_calls} remaining. Resets in {reset_in / 60:.1f} minutes"
+                    )
+        except Exception:
+            pass
+
+    def get_repo_stats(self, owner: str, repo: str) -> "dict[str, Any]":
+        """Get repository statistics"""
+        try:
+            url = f"https://api.github.com/repos/{owner}/{repo}"
+            response = self.session.get(url, headers=self.headers)
+            if response.ok:
+                return response.json()
+            else:
+                logging.warning(
+                    f"Failed to get repo stats for {owner}/{repo}: {response.status_code}"
+                )
+                return {}
+        except Exception as e:
+            logging.error(f"Error fetching repo stats: {str(e)}")
+            return {}
+
+    def batch_get_repo_stats(self, repo_list: "list[str]") -> "dict[str, dict[str, Any]]":
+        """Get statistics for multiple repositories in a batch"""
+        self.check_rate_limit()
+
+        results: "dict[str, dict[str, Any]]" = {}
+        for repo_url in repo_list:
+            # Extract owner/repo from URL
+            match = re.search(r"github\.com/([^/]+)/([^/\.]+)", repo_url)
+            if not match:
+                continue
+
+            owner, repo = match.groups()
+            repo = repo.split(".")[0]  # Remove .git extension if present
+
+            # Get stats
+            stats = self.get_repo_stats(owner, repo)
+            results[repo_url] = stats
+
+            # Be nice to GitHub API
+            time.sleep(0.1)
+        return results
+
+
+class CrateAPIClient:
+    def __init__(self, config: PipelineConfig) -> None:
+        self.config = config
+        # Simple session without dependency on HTTPClientUtils
+        self.session = requests.Session()
+        self.session.headers.update({"User-Agent": "SigilDERG-Data-Production/1.3.2"})
+
+    def fetch_crate_metadata(self, crate_name: str) -> "dict[str, Any] | None":
+        """Fetch metadata with retry logic"""
+        for attempt in range(self.config.max_retries):
+            try:
+                return self._fetch_metadata(crate_name)
+            except Exception as e:
+                logging.warning(
+                    f"Attempt {attempt + 1} failed for {crate_name}: {str(e)}"
+                )
+                wait = 2**attempt
+                time.sleep(wait)
+        return None
+
+    def _fetch_metadata(self, crate_name: str) -> "dict[str, Any] | None":
+        """Enhanced metadata fetching that tries multiple sources"""
+        # First try crates.io (primary source)
+        try:
+            r = self.session.get(f"https://crates.io/api/v1/crates/{crate_name}")
+            if r.ok:
+                data = r.json()
+                crate_data = data["crate"]
+                latest = crate_data["newest_version"]
+
+                # Get readme
+                readme_response = self.session.get(
+                    f"https://crates.io/api/v1/crates/{crate_name}/readme"
+                )
+                readme = readme_response.text if readme_response.ok else ""
+
+                # Get dependencies
+                deps_url = (
+                    f"https://crates.io/api/v1/crates/{crate_name}/"
+                    f"{latest}/dependencies"
+                )
+                deps_response = self.session.get(deps_url)
+                deps: list[dict[str, Any]] = (
+                    deps_response.json().get("dependencies", [])
+                    if deps_response.ok
+                    else []
+                )
+
+                # Get features - using the versions endpoint
+                features = []
+                versions_response = self.session.get(
+                    f"https://crates.io/api/v1/crates/{crate_name}/{latest}"
+                )
+                if versions_response.ok:
+                    version_data = versions_response.json().get("version", {})
+                    features_dict = version_data.get("features", {})
+                    features = [
+                        {"name": k, "dependencies": v} for k, v in features_dict.items()
+                    ]
+
+                # Repository info and GitHub stars
+                repo = crate_data.get("repository", "")
+                gh_stars = 0
+
+                # Check if it's a GitHub repo
+                if "github.com" in repo and self.config.github_token:
+                    match = re.search(r"github.com/([^/]+)/([^/]+)", repo)
+                    if match:
+                        owner, repo_name = match.groups()
+                        repo_name = repo_name.split(".")[0]  # Handle .git extensions
+                        gh_url = f"https://api.github.com/repos/{owner}/{repo_name}"
+                        gh_headers: dict[str, str] = {}
+                        if self.config.github_token:
+                            gh_headers["Authorization"] = (
+                                f"token {self.config.github_token}"
+                            )
+
+                        gh = self.session.get(gh_url, headers=gh_headers)
+                        if gh.ok:
+                            gh_data = gh.json()
+                            gh_stars = gh_data.get("stargazers_count", 0)
+
+                # Check if it's hosted on lib.rs
+                lib_rs_data = {}
+                if "lib.rs" in repo:
+                    lib_rs_url = f"https://lib.rs/crates/{crate_name}"
+                    lib_rs_response = self.session.get(lib_rs_url)
+                    if lib_rs_response.ok:
+                        soup = BeautifulSoup(lib_rs_response.text, "html.parser")
+                        # Get README from lib.rs if not already available
+                        if not readme:
+                            readme_div = soup.find("div", class_="readme")
+                            if readme_div:
+                                readme = readme_div.get_text(
+                                    strip=True
+                                )  # Get lib.rs specific stats
+                        stats_div = soup.find("div", class_="crate-stats")
+                        if isinstance(stats_div, Tag):
+                            downloads_text = stats_div.find(
+                                string=re.compile(r"[\d,]+ downloads")
+                            )
+                            if downloads_text:
+                                lib_rs_data["librs_downloads"] = int(
+                                    re.sub(r"[^\d]", "", str(downloads_text))
+                                )
+
+                # Extract code snippets and sections (simplified)
+                code_snippets: list[str] = (
+                    []
+                )  # Simplified - would normally extract from readme
+                readme_sections: dict[str, str] = (
+                    {}
+                )  # Simplified - would normally parse sections
+
+                result: dict[str, Any] = {
+                    "name": crate_name,
+                    "version": latest,
+                    "description": crate_data.get("description", ""),
+                    "repository": repo,
+                    "keywords": crate_data.get("keywords", []),
+                    "categories": crate_data.get("categories", []),
+                    "readme": readme,
+                    "downloads": crate_data.get("downloads", 0),
+                    "github_stars": gh_stars,
+                    "dependencies": deps,
+                    "code_snippets": code_snippets,
+                    "features": features,
+                    "readme_sections": readme_sections,
+                    **lib_rs_data,
+                }
+
+                return result
+
+        except Exception as e:
+            logging.error(
+                f"Failed fetching metadata for {crate_name}: {str(e)}"
+            )
+            raise
+
+        # If crates.io fails, try lib.rs
+        try:
+            r = self.session.get(f"https://lib.rs/crates/{crate_name}")
+            if r.ok:
+                soup = BeautifulSoup(r.text, "html.parser")
+
+                # Extract metadata from lib.rs page
+                h1 = soup.select_one("h1")
+                name = h1.text.strip() if h1 else crate_name
+
+                # Find description
+                desc_elem = soup.select_one(".description")
+                description = desc_elem.text.strip() if desc_elem else ""
+
+                # Find repository link
+                repo_link: Union[str, None] = None
+                for a in soup.select("a"):
+                    href = a.get("href")
+                    if href and isinstance(href, str) and "github.com" in href:
+                        repo_link = href
+                        break
+
+                # Find keywords
+                keywords_elem = soup.select_one(".keywords")
+                keywords = (
+                    [k.text.strip() for k in keywords_elem.find_all("a")]
+                    if keywords_elem
+                    else []
+                )
+
+                # Basic metadata from lib.rs
+                return {
+                    "name": name,
+                    "version": "latest",  # lib.rs doesn't easily expose version
+                    "description": description,
+                    "repository": repo_link or "",
+                    "keywords": keywords,
+                    "categories": [],
+                    "readme": "",
+                    "downloads": 0,
+                    "github_stars": 0,
+                    "dependencies": [],
+                    "code_snippets": [],
+                    "features": [],
+                    "readme_sections": {},
+                    "source": "lib.rs",
+                }
+        except Exception:
+            pass
+
+        # Finally, try GitHub search
+        try:
+            # This is a simplification - GitHub's search API requires
+            # authentication
+            gh_search_headers: dict[str, str] = {}
+            if self.config.github_token:
+                gh_search_headers["Authorization"] = f"token {self.config.github_token}"
+
+            search_url = (
+                f"https://api.github.com/search/repositories?"
+                f"q={crate_name}+language:rust"
+            )
+            r = requests.get(search_url, headers=gh_search_headers)
+
+            if r.ok:
+                results = r.json().get("items", [])
+                if results:
+                    repo = results[0]  # Take first match
+
+                    # Basic metadata from GitHub
+                    return {
+                        "name": crate_name,
+                        "version": "unknown",
+                        "description": repo.get("description", ""),
+                        "repository": repo.get("html_url", ""),
+                        "keywords": [],
+                        "categories": [],
+                        "readme": "",
+                        "downloads": 0,
+                        "github_stars": repo.get("stargazers_count", 0),
+                        "dependencies": [],
+                        "code_snippets": [],
+                        "features": [],
+                        "readme_sections": {},
+                        "source": "github",
+                    }
+        except Exception:
+            pass
+
+        # If all sources fail
+        return None
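The retry policy in CrateAPIClient.fetch_crate_metadata above is worth isolating: each failed attempt sleeps 2**attempt seconds before the next try, and exhausting the attempts yields None rather than an exception. A minimal standalone sketch of that pattern, where fetch, name, and max_retries stand in for the client's private method, the crate name, and the PipelineConfig value:

import time
import logging
from typing import Any, Callable, Optional

def fetch_with_retry(
    fetch: "Callable[[str], dict[str, Any]]",
    name: str,
    max_retries: int = 3,
) -> "Optional[dict[str, Any]]":
    for attempt in range(max_retries):
        try:
            return fetch(name)
        except Exception as e:
            # Exponential backoff: 1s, 2s, 4s, ... between attempts.
            logging.warning("Attempt %d failed for %s: %s", attempt + 1, name, e)
            time.sleep(2 ** attempt)
    return None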
rust_crate_pipeline/progress_monitor.py
ADDED
@@ -0,0 +1,334 @@
+# progress_monitor.py
+"""
+Real-time progress monitoring for the Rust Crate Pipeline (CLI-only).
+
+This module provides:
+- Live progress bars with ETA
+- Real-time statistics and metrics
+- Status printouts
+- Performance monitoring
+- Error tracking and reporting
+- Status JSON file for external tools/scripts
+"""
+
+import time
+import threading
+import json
+import os
+from datetime import datetime, timedelta
+from typing import Dict, List, Optional, Any, Union
+from dataclasses import dataclass, field
+from collections import deque
+import logging
+
+try:
+    from tqdm import tqdm
+    TQDM_AVAILABLE = True
+except ImportError:
+    TQDM_AVAILABLE = False
+
+try:
+    import psutil
+    PSUTIL_AVAILABLE = True
+except ImportError:
+    PSUTIL_AVAILABLE = False
+
+
+@dataclass
+class PipelineMetrics:
+    """Real-time pipeline metrics and statistics."""
+    total_crates: int = 0
+    processed_crates: int = 0
+    successful_crates: int = 0
+    failed_crates: int = 0
+    skipped_crates: int = 0
+    current_batch: int = 0
+    total_batches: int = 0
+    start_time: Optional[datetime] = None
+    current_operation: str = "Initializing"
+    errors: List[Dict[str, Any]] = field(default_factory=list)
+    warnings: List[Dict[str, Any]] = field(default_factory=list)
+    performance_stats: Dict[str, Any] = field(default_factory=dict)
+
+    @property
+    def progress_percentage(self) -> float:
+        """Calculate progress percentage."""
+        if self.total_crates == 0:
+            return 0.0
+        return (self.processed_crates / self.total_crates) * 100
+
+    @property
+    def success_rate(self) -> float:
+        """Calculate success rate percentage."""
+        if self.processed_crates == 0:
+            return 0.0
+        return (self.successful_crates / self.processed_crates) * 100
+
+    @property
+    def elapsed_time(self) -> timedelta:
+        """Calculate elapsed time."""
+        if not self.start_time:
+            return timedelta(0)
+        return datetime.now() - self.start_time
+
+    @property
+    def estimated_completion(self) -> Optional[datetime]:
+        """Estimate completion time."""
+        if self.processed_crates == 0 or not self.start_time:
+            return None
+
+        avg_time_per_crate = self.elapsed_time / self.processed_crates
+        remaining_crates = self.total_crates - self.processed_crates
+        estimated_remaining = avg_time_per_crate * remaining_crates
+
+        return datetime.now() + estimated_remaining
+
+
+class ProgressMonitor:
+    """Real-time progress monitoring with live dashboard."""
+
+    def __init__(self, total_crates: int, output_dir: str = "output"):
+        self.metrics = PipelineMetrics(total_crates=total_crates)
+        self.output_dir = output_dir
+        self.logger = logging.getLogger(__name__)
+
+        # Performance tracking
+        self.crate_times: deque = deque(maxlen=100)  # Last 100 crate processing times
+        self.batch_times: deque = deque(maxlen=50)  # Last 50 batch processing times
+
+        # Status tracking
+        self.current_crate: Optional[str] = None
+        self.current_operation: str = "Initializing"
+        self.status_file = os.path.join(output_dir, "pipeline_status.json")
+
+        # Thread safety
+        self._lock = threading.Lock()
+
+        # Initialize
+        self.metrics.start_time = datetime.now()
+        self._save_status()
+
+        # Create output directory if it doesn't exist
+        os.makedirs(output_dir, exist_ok=True)
+
+    def start_crate(self, crate_name: str) -> None:
+        """Mark the start of processing a crate."""
+        with self._lock:
+            self.current_crate = crate_name
+            self.current_operation = f"Processing {crate_name}"
+            self.metrics.current_operation = self.current_operation
+            self._save_status()
+
+    def complete_crate(self, crate_name: str, success: bool = True,
+                       processing_time: Optional[float] = None) -> None:
+        """Mark the completion of processing a crate."""
+        with self._lock:
+            self.metrics.processed_crates += 1
+
+            if success:
+                self.metrics.successful_crates += 1
+            else:
+                self.metrics.failed_crates += 1
+
+            if processing_time:
+                self.crate_times.append(processing_time)
+
+            self.current_crate = None
+            self.current_operation = "Waiting for next crate"
+            self.metrics.current_operation = self.current_operation
+
+            # Update performance stats
+            self._update_performance_stats()
+            self._save_status()
+
+    def skip_crate(self, crate_name: str, reason: str = "Unknown") -> None:
+        """Mark a crate as skipped."""
+        with self._lock:
+            self.metrics.processed_crates += 1
+            self.metrics.skipped_crates += 1
+
+            self.metrics.warnings.append({
+                "crate": crate_name,
+                "reason": reason,
+                "timestamp": datetime.now().isoformat()
+            })
+
+            self._save_status()
+
+    def start_batch(self, batch_num: int, batch_size: int) -> None:
+        """Mark the start of processing a batch."""
+        with self._lock:
+            self.metrics.current_batch = batch_num
+            self.current_operation = f"Processing batch {batch_num}"
+            self.metrics.current_operation = self.current_operation
+            self._save_status()
+
+    def complete_batch(self, batch_num: int, processing_time: Optional[float] = None) -> None:
+        """Mark the completion of processing a batch."""
+        with self._lock:
+            if processing_time:
+                self.batch_times.append(processing_time)
+
+            self.current_operation = "Batch completed, preparing next batch"
+            self.metrics.current_operation = self.current_operation
+            self._save_status()
+
+    def add_error(self, crate_name: str, error: str, error_type: str = "Processing") -> None:
+        """Add an error to the metrics."""
+        with self._lock:
+            self.metrics.errors.append({
+                "crate": crate_name,
+                "error": error,
+                "type": error_type,
+                "timestamp": datetime.now().isoformat()
+            })
+            self._save_status()
+
+    def add_warning(self, crate_name: str, warning: str) -> None:
+        """Add a warning to the metrics."""
+        with self._lock:
+            self.metrics.warnings.append({
+                "crate": crate_name,
+                "warning": warning,
+                "timestamp": datetime.now().isoformat()
+            })
+            self._save_status()
+
+    def update_operation(self, operation: str) -> None:
+        """Update the current operation description."""
+        with self._lock:
+            self.current_operation = operation
+            self.metrics.current_operation = operation
+            self._save_status()
+
+    def _update_performance_stats(self) -> None:
+        """Update performance statistics."""
+        if self.crate_times:
+            self.metrics.performance_stats.update({
+                "avg_crate_time": sum(self.crate_times) / len(self.crate_times),
+                "min_crate_time": min(self.crate_times),
+                "max_crate_time": max(self.crate_times),
+                "crates_per_minute": len(self.crate_times) / (sum(self.crate_times) / 60)
+            })
+
+        if self.batch_times:
+            self.metrics.performance_stats.update({
+                "avg_batch_time": sum(self.batch_times) / len(self.batch_times),
+                "min_batch_time": min(self.batch_times),
+                "max_batch_time": max(self.batch_times)
+            })
+
+        # System stats if available
+        if PSUTIL_AVAILABLE:
+            try:
+                cpu_percent = psutil.cpu_percent()
+                memory = psutil.virtual_memory()
+                disk = psutil.disk_usage(self.output_dir)
+
+                self.metrics.performance_stats.update({
+                    "system_cpu_percent": cpu_percent,
+                    "system_memory_percent": memory.percent,
+                    "system_disk_percent": disk.percent,
+                    "system_memory_available": memory.available,
+                    "system_disk_free": disk.free
+                })
+            except Exception as e:
+                self.logger.warning(f"Failed to get system stats: {e}")
+
+    def _save_status(self) -> None:
+        """Save current status to file."""
+        try:
+            status_data = {
+                "metrics": {
+                    "total_crates": self.metrics.total_crates,
+                    "processed_crates": self.metrics.processed_crates,
+                    "successful_crates": self.metrics.successful_crates,
+                    "failed_crates": self.metrics.failed_crates,
+                    "skipped_crates": self.metrics.skipped_crates,
+                    "progress_percentage": self.metrics.progress_percentage,
+                    "success_rate": self.metrics.success_rate,
+                    "current_batch": self.metrics.current_batch,
+                    "total_batches": self.metrics.total_batches,
+                    "start_time": self.metrics.start_time.isoformat() if self.metrics.start_time else None,
+                    "elapsed_time": str(self.metrics.elapsed_time),
+                    "estimated_completion": self.metrics.estimated_completion.isoformat() if self.metrics.estimated_completion else None,
+                    "current_operation": self.metrics.current_operation
+                },
+                "current_crate": self.current_crate,
+                "performance_stats": self.metrics.performance_stats,
+                "errors": self.metrics.errors[-10:],  # Last 10 errors
+                "warnings": self.metrics.warnings[-10:],  # Last 10 warnings
+                "last_updated": datetime.now().isoformat()
+            }
+
+            with open(self.status_file, 'w') as f:
+                json.dump(status_data, f, indent=2)
+
+        except Exception as e:
+            self.logger.error(f"Failed to save status: {e}")
+
+    def get_status_summary(self) -> Dict[str, Any]:
+        """Get a summary of current status."""
+        with self._lock:
+            return {
+                "progress": f"{self.metrics.progress_percentage:.1f}%",
+                "processed": f"{self.metrics.processed_crates}/{self.metrics.total_crates}",
+                "success_rate": f"{self.metrics.success_rate:.1f}%",
+                "elapsed_time": str(self.metrics.elapsed_time),
+                "estimated_completion": self.metrics.estimated_completion.isoformat() if self.metrics.estimated_completion else None,
+                "current_operation": self.current_operation,
+                "current_crate": self.current_crate,
+                "errors_count": len(self.metrics.errors),
+                "warnings_count": len(self.metrics.warnings)
+            }
+
+    def print_status(self) -> None:
+        """Print current status to console."""
+        summary = self.get_status_summary()
+
+        print("\n" + "="*80)
+        print("🚀 RUST CRATE PIPELINE - REAL-TIME STATUS")
+        print("="*80)
+        print(f"📊 Progress: {summary['progress']} ({summary['processed']} crates)")
+        print(f"✅ Success Rate: {summary['success_rate']}")
+        print(f"⏱️ Elapsed Time: {summary['elapsed_time']}")
+        if summary['estimated_completion']:
+            print(f"🎯 Estimated Completion: {summary['estimated_completion']}")
+        print(f"🔄 Current Operation: {summary['current_operation']}")
+        if summary['current_crate']:
+            print(f"📦 Current Crate: {summary['current_crate']}")
+        print(f"❌ Errors: {summary['errors_count']}")
+        print(f"⚠️ Warnings: {summary['warnings_count']}")
+
+        # Performance stats
+        if self.metrics.performance_stats:
+            stats = self.metrics.performance_stats
+            if 'avg_crate_time' in stats:
+                print(f"⚡ Avg Crate Time: {stats['avg_crate_time']:.2f}s")
+            if 'crates_per_minute' in stats:
+                print(f"🚀 Processing Rate: {stats['crates_per_minute']:.1f} crates/min")
+            if 'system_cpu_percent' in stats:
+                print(f"💻 System CPU: {stats['system_cpu_percent']:.1f}%")
+            if 'system_memory_percent' in stats:
+                print(f"🧠 System Memory: {stats['system_memory_percent']:.1f}%")
+
+        print("="*80)
+
+    def create_progress_bar(self, desc: str = "Processing crates") -> Optional[Any]:
+        """Create a progress bar if tqdm is available."""
+        if not TQDM_AVAILABLE:
+            return None
+
+        return tqdm(
+            total=self.metrics.total_crates,
+            desc=desc,
+            unit="crate",
+            bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]"
+        )
+
+
+def create_monitor(total_crates: int, output_dir: str = "output") -> ProgressMonitor:
+    """Create and configure a CLI-only progress monitor."""
+    monitor = ProgressMonitor(total_crates, output_dir)
+    print("✅ Real-time CLI progress monitoring enabled")
+    return monitor
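Based on the API added above, a typical caller would look roughly like this. This is a sketch, assuming the module is importable as rust_crate_pipeline.progress_monitor (which matches the RECORD entry below); the crate names and timing value are placeholders:

from rust_crate_pipeline.progress_monitor import create_monitor

monitor = create_monitor(total_crates=3, output_dir="output")
bar = monitor.create_progress_bar()  # None when tqdm is not installed

for name in ["serde", "tokio", "rand"]:  # placeholder crate names
    monitor.start_crate(name)
    try:
        # ... process the crate ...
        monitor.complete_crate(name, success=True, processing_time=1.2)
    except Exception as e:
        monitor.add_error(name, str(e))
        monitor.complete_crate(name, success=False)
    if bar is not None:
        bar.update(1)

monitor.print_status()
# External tools can poll output/pipeline_status.json for the same data.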
rust_crate_pipeline/version.py
CHANGED
@@ -1,74 +1,74 @@
 from typing import Dict, List, Tuple, Optional, Any
 """Version information for rust-crate-pipeline."""

-__version__ = "1.3.2"
+__version__ = "1.3.4"
 __version_info__ = tuple(int(x) for x in __version__.split("-")[0].split("."))

 # Version history
 # 1.2.5-dev.20250621 - Dev branch: experimental, not a formal
 # release. Originated from v1.2.5.
 # 1.2.5 - Last official release.
 # 1.5.1 - Configuration Standardization Release: Model Path Consistency
 # - Standardized all configuration to use GGUF model paths
 # - Updated CLI defaults for --crawl4ai-model to
 # ~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf
 # - Enhanced Rule Zero alignment with transparent configuration practices
 # - Updated all test files to use consistent GGUF model path references
 # - Comprehensive documentation updates for proper model configuration
 # - Removed inconsistent Ollama references in favor of llama-cpp-python
 # - Ensured CLI help text and JSON examples reflect correct model paths
 # 1.5.0 - Major Release: Enhanced Web Scraping with Crawl4AI Integration
 # - Integrated Crawl4AI for advanced web scraping capabilities
 # - Added JavaScript-rendered content extraction via Playwright
 # - Enhanced README parsing with LLM-powered content analysis
 # - New CLI options: --enable-crawl4ai, --disable-crawl4ai, --crawl4ai-model
 # - Enhanced configuration with local GGUF model paths and crawl4ai_timeout
 # - Comprehensive test coverage for all Crawl4AI features
 # - Rule Zero compliant with full transparency and audit trails
 # 1.4.0 - Major Release: Rule Zero Compliance Audit Complete
 # - Completed comprehensive Rule Zero alignment audit
 # - Eliminated all code redundancy and dead code
 # - Achieved 100% test coverage (22/22 tests passing)
 # - Refactored to pure asyncio architecture (thread-free)
 # - Suppressed Pydantic deprecation warnings
 # - Full production readiness with Docker support
 # - Enhanced documentation with PyPI cross-references
 # - Certified Rule Zero compliance across all four principles
 # 1.3.1 - Bug Fix Release: Crawl4AI Integration Cleanup
 # - Fixed CSS selector syntax errors in Crawl4AI integration
 # - Cleaned up duplicate and obsolete test files
 # - Resolved import conflicts between workspace and integration configs
 # - Improved error handling in enhanced scraping module
 # - Standardized on direct llama.cpp approach (removed Ollama dependencies)
 # - Enhanced Rule Zero compliance with transparent cleanup process
 # - Fixed type annotation compatibility issues
 # - Fixed Python 3.9 compatibility for type annotations
 # - Updated dict[str, Any] to "dict[str, Any]" format
 # - Fixed Union type expressions in conditional imports
 # - Resolved IDE linter errors in network.py, pipeline.py, and production_config.py
 # - Improved code quality and maintainability
 # 1.3.0 - Quality & Integration Release: Comprehensive code quality improvements
 # - Fixed all critical PEP 8 violations (F821, F811, E114)
 # - Enhanced error handling with graceful dependency fallbacks
 # - Improved module integration and import path resolution
 # - Added comprehensive test validation (21/21 tests passing)
 # - Enhanced async support and Unicode handling
 # - Production-ready CLI interfaces with robust error handling
 # - Full Rule Zero compliance validation
 # 1.2.0 - Major release: Production-ready, cleaned codebase
 # - Unified documentation into single comprehensive README
 # - Removed all non-essential development and test files
 # - Optimized for PyPI distribution and Docker deployment
 # - Enhanced GitHub token integration and setup
 # 1.1.2 - Production release: Cleaned up non-essential files
 # - Unified documentation into single README
 # - Optimized for PyPI distribution
 # 1.1.1 - Bug fix: Added missing python-dateutil dependency
 # - Fixed relativedelta import error
 # 1.1.0 - Updated author and contact information
 # - Enhanced package configuration
 # 0.1.0 - Initial release
 # - Core pipeline functionality
 # - AI-powered metadata enrichment
 # - Dependency analysis
 # - PyPI package setup
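The split("-")[0] in __version_info__ exists so that pre-release strings such as the 1.2.5-dev.20250621 entry in the history still parse to a numeric tuple; for example:

version = "1.3.4"
assert tuple(int(x) for x in version.split("-")[0].split(".")) == (1, 3, 4)

# A dev tag after "-" is discarded before the numeric split:
dev = "1.2.5-dev.20250621"
assert tuple(int(x) for x in dev.split("-")[0].split(".")) == (1, 2, 5)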
{rust_crate_pipeline-1.3.2.dist-info → rust_crate_pipeline-1.3.4.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rust-crate-pipeline
-Version: 1.3.2
+Version: 1.3.4
 Summary: A comprehensive system for gathering, enriching, and analyzing metadata for Rust crates using AI-powered insights
 Home-page: https://github.com/Superuser666-Sigil/SigilDERG-Data_Production
 Author: SuperUser666-Sigil
{rust_crate_pipeline-1.3.2.dist-info → rust_crate_pipeline-1.3.4.dist-info}/RECORD
CHANGED
@@ -8,12 +8,13 @@ rust_crate_pipeline/crate_analysis.py,sha256=GsoXemJ9VFyAbb4Sm5gY5ToTqNtOA4pI38A
 rust_crate_pipeline/crate_list.txt,sha256=W3NxDtxvihyKp9SN85FYXX6p8Hh49IFih1M4-c-CynM,4334
 rust_crate_pipeline/github_token_checker.py,sha256=COXXS9uoLV9WYIcT02C-bV5uH3fa9D9HJImc07vMjLs,3766
 rust_crate_pipeline/main.py,sha256=iGYEAYvXkoFFvaA6DIVGiUL3wLhiCzatB6Fvf-Yrj2A,18858
-rust_crate_pipeline/network.py,sha256=…
+rust_crate_pipeline/network.py,sha256=khyjfOplaDvMxLWGB-JbPQnc27ZfozKGYBFw2b3BScM,12834
 rust_crate_pipeline/pipeline.py,sha256=YN6PEhg0Si_oo6-Wtm_PviytzWzpQupTPC2e4L1F7XE,16349
 rust_crate_pipeline/production_config.py,sha256=uWylP9AIZZx7-9aT4sFmAKEEW9miJDxaiek8VE6WP-0,2372
+rust_crate_pipeline/progress_monitor.py,sha256=5K9KP-Xggi1JEINfRmq2W-wGUHtNIBTcocpDtB1t8iM,13743
 rust_crate_pipeline/unified_llm_processor.py,sha256=eo7KotNuqwc7_hgpFm18QLokFoufFslnvi8TnDsSYEg,25064
 rust_crate_pipeline/unified_pipeline.py,sha256=2yglmXVlQfSkVq0HVTPonDee6VxWaQWZw0X2l4lLBGw,23704
-rust_crate_pipeline/version.py,sha256=…
+rust_crate_pipeline/version.py,sha256=tEnYb8C6sG13xp0Nfedzjn8sJo94OXAq0bfReOnGYDY,4415
 rust_crate_pipeline/core/__init__.py,sha256=Sq4HWdANGqoYln7JdCog7m3BsGeR3tHdseeflvNetoQ,509
 rust_crate_pipeline/core/canon_registry.py,sha256=36tmt_wU6-kSyZnGfh53N64C7E3G-QR7GFbr9epj4zg,4700
 rust_crate_pipeline/core/irl_engine.py,sha256=QRZUdkN24W9XutLkj8JDplEz6FmnquUrwKsl0s2zRr4,10491
@@ -22,9 +23,9 @@ rust_crate_pipeline/scraping/__init__.py,sha256=ySkTRg7nIxgcbHJQ3L1XzcrOo281NZu0
 rust_crate_pipeline/scraping/unified_scraper.py,sha256=ZE2gkc0vQ3BOLdSX_IV-kMe8QAm2Av4M7VqpkxEKyT4,9965
 rust_crate_pipeline/utils/file_utils.py,sha256=tMaCPy7ghs9x4Hxu_sviX8MXU2sBjNvohUrvt4MejoM,2853
 rust_crate_pipeline/utils/logging_utils.py,sha256=e5jG0Yd6k3exgAdbVca46kWADJ_Qz8UJ3yEJzwTqPyI,2452
-rust_crate_pipeline-1.3.2.dist-info/licenses/LICENSE,…
-rust_crate_pipeline-1.3.2.dist-info/METADATA,…
-rust_crate_pipeline-1.3.2.dist-info/WHEEL,…
-rust_crate_pipeline-1.3.2.dist-info/entry_points.txt,…
-rust_crate_pipeline-1.3.2.dist-info/top_level.txt,…
-rust_crate_pipeline-1.3.2.dist-info/RECORD,,
+rust_crate_pipeline-1.3.4.dist-info/licenses/LICENSE,sha256=tpd4XNpbssrSx9-iErATOLrOh0ivNPfO2I5MAPUpats,1088
+rust_crate_pipeline-1.3.4.dist-info/METADATA,sha256=HS2WqdbGdgq5XaNm2RlIXXk0qLsy-H7MwtiKJyz7hsc,11254
+rust_crate_pipeline-1.3.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+rust_crate_pipeline-1.3.4.dist-info/entry_points.txt,sha256=9Rr_IRuFRIridXxUSdEJbB3ba0NnpEfKmknZXFdYRC0,70
+rust_crate_pipeline-1.3.4.dist-info/top_level.txt,sha256=GUdB7RyxHLhijQxui_KTy3B8p_L2APui9C6RYa0FuaE,20
+rust_crate_pipeline-1.3.4.dist-info/RECORD,,
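Each RECORD row has the form path,sha256=<digest>,<size>, where the digest is the urlsafe-base64 encoding of the file's SHA-256 with trailing "=" padding stripped, per the wheel RECORD format. A sketch for recomputing one entry, e.g. to check the network.py line above against a local checkout:

import base64
import hashlib

def record_entry(path: str) -> str:
    with open(path, "rb") as f:
        data = f.read()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest())
    # Padding is stripped, matching the hashes shown in RECORD.
    return f"{path},sha256={digest.rstrip(b'=').decode()},{len(data)}"

# record_entry("rust_crate_pipeline/network.py") should reproduce the
# "rust_crate_pipeline/network.py,sha256=khyjf...,12834" line above.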
{rust_crate_pipeline-1.3.2.dist-info → rust_crate_pipeline-1.3.4.dist-info}/WHEEL
RENAMED
File without changes
{rust_crate_pipeline-1.3.2.dist-info → rust_crate_pipeline-1.3.4.dist-info}/entry_points.txt
RENAMED
File without changes
{rust_crate_pipeline-1.3.2.dist-info → rust_crate_pipeline-1.3.4.dist-info}/licenses/LICENSE
RENAMED
File without changes
{rust_crate_pipeline-1.3.2.dist-info → rust_crate_pipeline-1.3.4.dist-info}/top_level.txt
RENAMED
File without changes