rust-crate-pipeline 1.4.0-py3-none-any.whl → 1.4.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. rust_crate_pipeline/__init__.py +18 -27
  2. rust_crate_pipeline/__main__.py +1 -0
  3. rust_crate_pipeline/ai_processing.py +718 -596
  4. rust_crate_pipeline/analysis.py +330 -363
  5. rust_crate_pipeline/azure_ai_processing.py +462 -0
  6. rust_crate_pipeline/config.py +46 -28
  7. rust_crate_pipeline/core/__init__.py +19 -0
  8. rust_crate_pipeline/core/canon_registry.py +133 -0
  9. rust_crate_pipeline/core/irl_engine.py +256 -0
  10. rust_crate_pipeline/core/sacred_chain.py +117 -0
  11. rust_crate_pipeline/crate_analysis.py +54 -0
  12. rust_crate_pipeline/crate_list.txt +424 -0
  13. rust_crate_pipeline/github_token_checker.py +108 -112
  14. rust_crate_pipeline/main.py +329 -109
  15. rust_crate_pipeline/network.py +317 -308
  16. rust_crate_pipeline/pipeline.py +300 -375
  17. rust_crate_pipeline/production_config.py +24 -27
  18. rust_crate_pipeline/progress_monitor.py +334 -0
  19. rust_crate_pipeline/scraping/__init__.py +13 -0
  20. rust_crate_pipeline/scraping/unified_scraper.py +259 -0
  21. rust_crate_pipeline/unified_llm_processor.py +637 -0
  22. rust_crate_pipeline/unified_pipeline.py +548 -0
  23. rust_crate_pipeline/utils/file_utils.py +32 -5
  24. rust_crate_pipeline/utils/logging_utils.py +21 -16
  25. rust_crate_pipeline/version.py +76 -47
  26. rust_crate_pipeline-1.4.1.dist-info/METADATA +515 -0
  27. rust_crate_pipeline-1.4.1.dist-info/RECORD +31 -0
  28. rust_crate_pipeline-1.4.0.dist-info/METADATA +0 -585
  29. rust_crate_pipeline-1.4.0.dist-info/RECORD +0 -19
  30. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/WHEEL +0 -0
  31. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/entry_points.txt +0 -0
  32. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/licenses/LICENSE +0 -0
  33. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/top_level.txt +0 -0
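
The new rust_crate_pipeline/crate_list.txt replaces the hardcoded crate list that pipeline.py previously carried (see the removed get_crate_list body in the diff below); the new _get_crate_list loader simply reads one crate name per non-empty line. A minimal sketch of that parsing and of the expected file layout, assuming only what the diff shows (the example names are illustrative, not the shipped 424-entry list):

    # Sketch of the loader logic added in 1.4.1; mirrors _get_crate_list in the diff below.
    import os

    crate_list_path = os.path.join(os.path.dirname(__file__), "crate_list.txt")
    with open(crate_list_path) as f:
        crates = [line.strip() for line in f if line.strip()]

    # crate_list.txt is expected to hold one crate name per line, e.g.:
    #   tokio
    #   serde
    #   clap
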
rust_crate_pipeline/pipeline.py
@@ -4,462 +4,387 @@ import time
4
4
  import logging
5
5
  import json
6
6
  import asyncio
7
- from typing import List, Dict, Optional
7
+ from typing import Any, Union, TYPE_CHECKING
8
+
9
+ if TYPE_CHECKING:
10
+ from typing import Dict, List, Optional
11
+
8
12
  from .config import PipelineConfig, CrateMetadata, EnrichedCrate
9
13
  from .network import CrateAPIClient, GitHubBatchClient
10
14
  from .ai_processing import LLMEnricher
11
- from .analysis import SourceAnalyzer, SecurityAnalyzer, UserBehaviorAnalyzer, DependencyAnalyzer
15
+ from .analysis import DependencyAnalyzer
16
+ from .crate_analysis import CrateAnalyzer
17
+
18
+ # Import Azure OpenAI enricher
19
+ try:
20
+ from .azure_ai_processing import AzureOpenAIEnricher
21
+ AZURE_OPENAI_AVAILABLE = True
22
+ except ImportError:
23
+ AZURE_OPENAI_AVAILABLE = False
24
+ AzureOpenAIEnricher = None
12
25
 
13
26
  # Import enhanced scraping capabilities
14
27
  try:
15
- import sys
16
- sys.path.append(os.path.dirname(os.path.dirname(__file__)))
17
- from enhanced_scraping import CrateDocumentationScraper, EnhancedScrapingResult
18
- enhanced_scraping_available = True
28
+ from .scraping.unified_scraper import UnifiedScraper, ScrapingResult
29
+ ENHANCED_SCRAPING_AVAILABLE = True
19
30
  except ImportError:
20
- enhanced_scraping_available = False
21
- CrateDocumentationScraper = None
22
- EnhancedScrapingResult = None
31
+ ENHANCED_SCRAPING_AVAILABLE = False
32
+ UnifiedScraper = None # type: ignore[assignment,misc]
33
+ ScrapingResult = None # type: ignore[assignment,misc]
23
34
  logging.warning("Enhanced scraping not available - using basic methods")
24
35
 
25
36
 
26
37
  class CrateDataPipeline:
27
- def __init__(self, config: PipelineConfig):
38
+ """Orchestrates the entire data collection, enrichment, and analysis pipeline."""
39
+
40
+ def __init__(self, config: PipelineConfig) -> None:
28
41
  self.config = config
29
42
  self.api_client = CrateAPIClient(config)
30
43
  self.github_client = GitHubBatchClient(config)
31
- self.enricher = LLMEnricher(config)
32
- self.crates = self.get_crate_list()
33
- self.output_dir = self._create_output_dir() # Initialize enhanced scraping if available
34
- self.enhanced_scraper = None
35
- if enhanced_scraping_available and CrateDocumentationScraper is not None and hasattr(config, 'enable_crawl4ai'):
36
- try:
37
- self.enhanced_scraper = CrateDocumentationScraper(
38
- enable_crawl4ai=config.enable_crawl4ai)
39
- logging.info("✅ Enhanced scraping with Crawl4AI enabled")
40
- except Exception as e:
41
- logging.warning(
42
- f"❌ Failed to initialize enhanced scraping: {e}")
43
- elif enhanced_scraping_available and CrateDocumentationScraper is not None:
44
+
45
+ # Initialize the appropriate AI enricher based on configuration
46
+ if config.use_azure_openai and AZURE_OPENAI_AVAILABLE and AzureOpenAIEnricher is not None:
44
47
  try:
45
- self.enhanced_scraper = CrateDocumentationScraper(
46
- enable_crawl4ai=True)
47
- logging.info(
48
- "✅ Enhanced scraping with Crawl4AI enabled (default)")
48
+ self.enricher = AzureOpenAIEnricher(config)
49
+ logging.info("[OK] Using Azure OpenAI enricher")
49
50
  except Exception as e:
50
- logging.warning(
51
- f" Failed to initialize enhanced scraping: {e}")
51
+ logging.warning(f"[WARN] Failed to initialize Azure OpenAI enricher: {e}")
52
+ logging.info("[INFO] Falling back to local LLM enricher")
53
+ self.enricher = LLMEnricher(config)
54
+ else:
55
+ if config.use_azure_openai and not AZURE_OPENAI_AVAILABLE:
56
+ logging.warning("[WARN] Azure OpenAI requested but not available")
57
+ self.enricher = LLMEnricher(config)
58
+ logging.info("[OK] Using local LLM enricher")
59
+
60
+ # Initialize cargo analyzer
61
+ self.cargo_analyzer = CrateAnalyzer(".")
62
+
63
+ self.crates = self._get_crate_list()
64
+ self.output_dir = self._create_output_dir()
65
+ self.enhanced_scraper: Any = (
66
+ self._initialize_enhanced_scraper()
67
+ )
68
+
69
+ def _initialize_enhanced_scraper(self) -> Any:
70
+ """Initializes the CrateDocumentationScraper if available and enabled."""
71
+ if (
72
+ not ENHANCED_SCRAPING_AVAILABLE
73
+ or not self.config.enable_crawl4ai
74
+ or UnifiedScraper is None
75
+ ):
76
+ return None
77
+ try:
78
+ scraper = UnifiedScraper()
79
+ logging.info("[OK] Enhanced scraping with Crawl4AI enabled")
80
+ return scraper
81
+ except Exception as e:
82
+ logging.warning(f"[ERROR] Failed to initialize enhanced scraping: {e}")
83
+ return None
52
84
 
53
85
  def _create_output_dir(self) -> str:
86
+ """Creates a timestamped output directory for pipeline results."""
54
87
  timestamp = time.strftime("%Y%m%d-%H%M%S")
55
- output_dir = f"crate_data_{timestamp}"
88
+ output_dir = os.path.join(self.config.output_path, f"crate_data_{timestamp}")
56
89
  os.makedirs(output_dir, exist_ok=True)
57
90
  return output_dir
58
91
 
59
- def get_crate_list(self, limit: Optional[int] = None) -> List[str]:
60
- """Return a comprehensive list of all high-value crates to process"""
61
- crates = [
62
- # Web frameworks and servers
63
- "actix-web", "rocket", "axum", "warp", "tower", "tide", "gotham", "iron",
64
- "nickel", "rouille", "thruster", "poem", "salvo", "viz", "ntex", "may-minihttp",
65
- "tiny_http", "httptest", "mockito", "wiremock",
66
-
67
- # Async runtimes and utilities
68
- "tokio", "tokio-stream", "async-trait", "futures", "async-std", "smol",
69
- "embassy", "embassy-executor", "embassy-time", "embassy-sync", "async-channel",
70
- "async-broadcast", "async-lock", "async-once", "async-recursion", "futures-util",
71
- "futures-channel", "futures-timer", "futures-test", "pin-project", "pin-project-lite",
72
-
73
- # Serialization/deserialization
74
- "serde", "serde_json", "serde_yaml", "bincode", "toml", "ron", "postcard",
75
- "ciborium", "rmp-serde", "quick-xml", "roxmltree", "serde_cbor", "serde_derive",
76
- "serde_repr", "serde_with", "serde_bytes", "flexbuffers", "bson", "avro-rs",
77
-
78
- # Error handling and debugging
79
- "anyhow", "thiserror", "eyre", "color-eyre", "miette", "fehler", "snafu",
80
- "failure", "quick-error", "derive_more", "displaydoc", "backtrace", "better-panic",
81
- # Command line and terminal
82
- "clap", "structopt", "argh", "gumdrop", "docopt", "getopts", "pico-args",
83
- "crossterm", "termion", "console", "indicati", "dialoguer", "termcolor",
84
- "colored", "yansi", "owo-colors", "nu-ansi-term", "terminal_size",
85
- # Utilities and general purpose
86
- "rand", "uuid", "itertools", "num", "cfg-i", "bytes", "mime",
87
- "form_urlencoded", "csv", "once_cell", "base64", "flate2", "tar", "dirs",
88
- "walkdir", "glob", "bitflags", "indexmap", "smallvec", "arrayvec", "tinyvec",
89
- "ahash", "fxhash", "rustc-hash", "seahash", "siphasher", "wyhash", "xxhash-rust",
90
- "getrandom", "fastrand", "nanorand", "url", "percent-encoding", "unicode-segmentation",
91
- "unicode-normalization", "unicode-width", "memchr", "aho-corasick", "bstr",
92
- # HTTP clients and servers
93
- "reqwest", "hyper", "sur", "ureq", "attohttpc", "isahc", "curl", "libcurl-sys",
94
- "http", "http-body", "httparse", "hyper-tls", "hyper-rustls", "native-tls",
95
- "webpki", "webpki-roots",
96
-
97
- # Database and storage
98
- "sqlx", "diesel", "postgres", "rusqlite", "mysql", "mongodb", "redis",
99
- "tokio-postgres", "deadpool-postgres", "bb8", "r2d2", "sea-orm", "rbatis",
100
- "sled", "rocksdb", "lmdb", "redb", "pickledb", "persy", "heed", "fjall",
101
- # Concurrency and parallelism
102
- "rayon", "crossbeam", "crossbeam-channel", "crossbeam-utils", "crossbeam-epoch",
103
- "crossbeam-deque", "parking_lot", "spin", "atomic", "arc-swap", "dashmap",
104
- "flume", "kanal", "tokio-util", "futures-concurrency",
105
- # Protocol buffers, gRPC, and messaging
106
- "prost", "tonic", "protobu", "grpcio", "tarpc", "capnp", "rmp",
107
- "zmq", "nanomsg", "nats", "rdkafka", "pulsar", "lapin", "amqp", "rumqttc",
108
- # Procedural macros and metaprogramming
109
- "syn", "quote", "proc-macro2", "proc-macro-crate", "proc-macro-error",
110
- "darling", "derive_builder", "strum", "strum_macros",
111
- "enum-iterator", "num-derive", "num-traits", "paste", "lazy_static",
112
-
113
- # Cryptography and security
114
- "ring", "rustls", "openssl", "sha2", "sha3", "blake2", "blake3", "md5",
115
- "hmac", "pbkdf2", "scrypt", "argon2", "bcrypt", "chacha20poly1305",
116
- "aes-gcm", "rsa", "ed25519-dalek", "x25519-dalek", "curve25519-dalek",
117
- "secp256k1", "k256", "p256", "ecdsa", "signature", "rand_core",
118
-
119
- # Game development and graphics
120
- "bevy", "macroquad", "ggez", "piston", "winit", "wgpu", "vulkano", "glium",
121
- "three-d", "kiss3d", "nalgebra", "cgmath", "glam", "ultraviolet", "mint",
122
- "image", "imageproc", "resvg", "tiny-skia", "lyon", "femtovg", "skulpin",
123
- # Networking and protocols
124
- "socket2", "mio", "polling", "async-io", "calloop", "quinn",
125
- "rustls-pemfile", "trust-dns", "hickory-dns", "async-h1", "h2", "h3",
126
- "websocket", "tokio-tungstenite", "tungstenite", "ws", "warp-ws",
127
-
128
- # Text processing and parsing
129
- "regex", "regex-syntax", "pest", "pest_derive", "nom", "combine", "winnow",
130
- "lalrpop", "chumsky", "logos", "lex", "yacc", "tree-sitter", "syntect",
131
- "pulldown-cmark", "comrak", "markdown", "ammonia", "scraper", "kuchiki",
132
-
133
- # System programming and OS interfaces
134
- "libc", "winapi", "windows", "nix", "users", "sysinfo", "procfs", "psutil",
135
- "notify", "inotify", "hotwatch", "signal-hook", "ctrlc", "daemonize",
136
- "fork", "shared_memory", "memmap2", "mlock", "caps", "uzers",
137
- # Testing and development tools
138
- "criterion", "proptest", "quickcheck", "rstest", "serial_test", "mockall",
139
- "httpmock", "assert_cmd", "assert_fs", "predicates", "tempfile",
140
- "insta", "goldenfile", "similar", "difference", "pretty_assertions",
141
-
142
- # Configuration and environment
143
- "config", "figment", "envy", "dotenv", "confy", "directories", "app_dirs",
144
- "etcetera", "platform-dirs", "home", "which", "dunce", "normpath",
145
-
146
- # Logging and observability
147
- "log", "env_logger", "tracing", "tracing-subscriber", "tracing-futures",
148
- "tracing-actix-web", "tracing-log", "slog", "fern", "flexi_logger",
149
- "log4rs", "simplelog", "stderrlog", "pretty_env_logger", "fast_log",
150
-
151
- # Time and date
152
- "chrono", "time", "humantime", "chrono-tz", "chrono-english", "ical",
153
- "cron", "tokio-cron-scheduler", "job_scheduler", "delay_timer",
154
-
155
- # Machine Learning & AI
156
- "tokenizers", "safetensors", "linfa", "ndarray", "smartcore", "burn",
157
- "tract-core", "tract-onnx", "tract-hir", "tract-linalg", "tract-data",
158
- "tract-nne", "tract-onnx-opl", "tract-pulse", "tract-pulse-opl",
159
- "tract-nnef-resources", "tch", "torch-sys", "ort", "ort-sys", "candle-core",
160
- "candle-nn", "candle-transformers", "candle-kernels", "candle-onnx",
161
- "candle-metal-kernels", "tiktoken-rs", "tensorflow", "tensorflow-sys",
162
- "onnxruntime", "onnxruntime-sys", "onnx-protobu", "llama-cpp-2",
163
- "llama-cpp-sys-2", "llm", "llm-samplers", "llm-chain", "llm-chain-openai", "llama-core", "llamaedge", "openai", "openai-api-rs", "openai_dive",
164
- "genai", "aleph-alpha-client", "llm_api_access", "ollama-rs",
165
- "rust-bert", "fastembed", "hf-hub", "whisper-rs-sys", "toktrie",
166
- "toktrie_hf_tokenizers", "toktrie_hf_downloader", "rust_tokenizers",
167
- ]
168
-
169
- if limit is not None:
170
- return crates[:limit]
171
- return crates
172
-
173
- async def fetch_metadata_batch(
174
- self,
175
- crate_names: List[str]) -> List[CrateMetadata]:
176
- """Fetch metadata for a batch of crates using asyncio-based parallel processing
92
+ def _get_crate_list(self) -> "List[str]":
93
+ """
94
+ Loads the list of crates to process from an external file.
95
+ This approach is more modular and easier to maintain than a hardcoded list.
96
+ """
97
+ crate_list_path = os.path.join(os.path.dirname(__file__), "crate_list.txt")
98
+ try:
99
+ with open(crate_list_path) as f:
100
+ crates = [line.strip() for line in f if line.strip()]
101
+ logging.info(f"Loaded {len(crates)} crates from {crate_list_path}")
102
+ if not crates:
103
+ logging.warning(f"Crate list at {crate_list_path} is empty.")
104
+ return crates
105
+ except FileNotFoundError:
106
+ logging.error(f"Crate list file not found at: {crate_list_path}")
107
+ return []
108
+
109
+ def get_crate_list(self) -> "List[str]":
110
+ """
111
+ Public method to get the list of crates.
112
+ Returns the already loaded crate list or loads it if not available.
113
+ """
114
+ if hasattr(self, "crates") and self.crates:
115
+ return self.crates
116
+ else:
117
+ return self._get_crate_list()
177
118
 
178
- Each coroutine processes completely independent crate data, ensuring safety.
179
- No shared state is modified - each coroutine only reads from self.api_client and
180
- returns independent results.
119
+ async def fetch_metadata_batch(self, crate_names: "List[str]") -> "List[CrateMetadata]":
120
+ """
121
+ Fetches metadata for a batch of crates using asyncio-based parallel processing.
181
122
  """
182
- results = []
183
123
 
184
- async def fetch_single_crate_safe(crate_name: str) -> Optional[CrateMetadata]:
124
+ async def fetch_single_crate_safe(
125
+ crate_name: str,
126
+ ) -> Union[CrateMetadata, None]:
185
127
  try:
186
- # If api_client has an async method, use it; otherwise, run in executor
187
- if hasattr(self.api_client, 'fetch_crate_metadata_async'):
188
- data = await self.api_client.fetch_crate_metadata_async(crate_name)
189
- else:
190
- loop = asyncio.get_running_loop()
191
- data = await loop.run_in_executor(None, self.api_client.fetch_crate_metadata, crate_name)
192
- if data:
193
- return CrateMetadata(
194
- name=data.get("name", ""),
195
- version=data.get("version", ""),
196
- description=data.get("description", ""),
197
- repository=data.get("repository", ""),
198
- keywords=data.get("keywords", []),
199
- categories=data.get("categories", []),
200
- readme=data.get("readme", ""),
201
- downloads=data.get("downloads", 0),
202
- github_stars=data.get("github_stars", 0),
203
- dependencies=data.get("dependencies", []),
204
- features=data.get("features", []),
205
- code_snippets=data.get("code_snippets", []),
206
- readme_sections=data.get("readme_sections", {}),
207
- librs_downloads=data.get("librs_downloads"),
208
- source=data.get("source", "crates.io")
209
- )
210
- return None
128
+ loop = asyncio.get_running_loop()
129
+ data = await loop.run_in_executor(
130
+ None, self.api_client.fetch_crate_metadata, crate_name
131
+ )
132
+ if not data:
133
+ return None
134
+
135
+ return CrateMetadata(
136
+ name=data.get("name", ""),
137
+ version=data.get("version", ""),
138
+ description=data.get("description", ""),
139
+ repository=data.get("repository", ""),
140
+ keywords=data.get("keywords", []),
141
+ categories=data.get("categories", []),
142
+ readme=data.get("readme", ""),
143
+ downloads=data.get("downloads", 0),
144
+ github_stars=data.get("github_stars", 0),
145
+ dependencies=data.get("dependencies", []),
146
+ features=data.get("features", {}),
147
+ code_snippets=data.get("code_snippets", []),
148
+ readme_sections=data.get("readme_sections", {}),
149
+ librs_downloads=data.get("librs_downloads"),
150
+ source=data.get("source", "crates.io"),
151
+ )
152
+
211
153
  except Exception as e:
212
- logging.error(f"Error fetching {crate_name}: {e}")
154
+ logging.error(f"Error fetching metadata for {crate_name}: {e}")
213
155
  return None
214
156
 
215
- # Use asyncio.gather for parallel async processing
216
157
  tasks = [fetch_single_crate_safe(name) for name in crate_names]
217
158
  results_raw = await asyncio.gather(*tasks)
218
- results = [r for r in results_raw if r is not None]
219
- for crate in results:
220
- logging.info(f"Fetched metadata for {crate.name}")
159
+ results = [r for r in results_raw if r]
160
+ logging.info(
161
+ f"Fetched metadata for {len(results)} out of "
162
+ f"{len(crate_names)} requested crates."
163
+ )
221
164
  return results
222
165
 
223
- # Remove the async methods that are no longer needed
224
- # async def _fetch_single_crate_async(self, crate_name: str) ->
225
- # Optional[Dict]:
226
-
227
- async def enrich_batch(
228
- self,
229
- batch: List[CrateMetadata]) -> List[EnrichedCrate]:
230
- """Enrich a batch of crates with GitHub stats, enhanced scraping, and AI"""
231
- # Add GitHub stats first
166
+ async def enrich_batch(self, batch: "List[CrateMetadata]") -> "List[EnrichedCrate]":
167
+ """Enriches a batch of crates with GitHub stats, enhanced scraping, and AI."""
168
+ # Update GitHub stats
232
169
  github_repos = [
233
- c.repository for c in batch if "github.com" in c.repository]
234
- repo_stats = self.github_client.batch_get_repo_stats(github_repos)
235
-
236
- # Update crates with GitHub info
237
- for crate in batch:
238
- repo_url = crate.repository
239
- if repo_url in repo_stats:
240
- stats = repo_stats[repo_url]
241
- crate.github_stars = stats.get("stargazers_count", 0)
242
-
243
- # Enhanced scraping if available
244
- if self.enhanced_scraper:
245
- batch = asyncio.run(self._enhance_with_scraping(batch))
246
-
247
- # Now enrich with AI
248
- enriched_batch = []
249
- for crate in batch:
250
- try:
251
- enriched = self.enricher.enrich_crate(crate)
252
- enriched_batch.append(enriched)
253
- logging.info(f"Enriched {crate.name}")
254
- except Exception as e:
255
- logging.error(f"Failed to enrich {crate.name}: {str(e)}")
256
- # Add the crate with just the fields we have
257
- enriched_dict = crate.__dict__.copy()
258
- enriched_batch.append(EnrichedCrate(**enriched_dict))
259
-
260
- return enriched_batch
261
-
262
- async def _enhance_with_scraping(
263
- self, batch: List[CrateMetadata]) -> List[CrateMetadata]:
264
- """Enhance crates with advanced web scraping data"""
265
- enhanced_batch = []
266
-
267
- for crate in batch:
268
- try: # Scrape comprehensive documentation
269
- scraping_results = await self.enhanced_scraper.scrape_crate_info(crate.name)
270
-
271
- # Integrate scraping results into crate metadata
272
- enhanced_crate = self._integrate_scraping_results(
273
- crate, scraping_results)
274
- enhanced_batch.append(enhanced_crate)
170
+ c.repository for c in batch if c.repository and "github.com" in c.repository
171
+ ]
172
+ if github_repos:
173
+ repo_stats = self.github_client.batch_get_repo_stats(github_repos)
174
+ for crate in batch:
175
+ if crate.repository in repo_stats:
176
+ stats = repo_stats[crate.repository]
177
+ crate.github_stars = stats.get("stargazers_count", 0)
178
+
179
+ # Asynchronously enhance with scraping and AI
180
+ enrichment_tasks = [self._enrich_single_crate(crate) for crate in batch]
181
+ enriched_results = await asyncio.gather(*enrichment_tasks)
182
+ return [result for result in enriched_results if result]
183
+
184
+ async def _enrich_single_crate(self, crate: CrateMetadata) -> Union[EnrichedCrate, None]:
185
+ """Helper to enrich a single crate with scraping, AI analysis, and cargo analysis."""
186
+ try:
187
+ # Enhanced scraping if available
188
+ if self.enhanced_scraper:
189
+ await self._enhance_with_scraping(crate)
190
+
191
+ # Now enrich with AI
192
+ enriched = self.enricher.enrich_crate(crate)
193
+
194
+ # Add cargo analysis if we have a local crate directory
195
+ # Note: This would require downloading/cloning the crate first
196
+ # For now, we'll add a placeholder for cargo analysis
197
+ enriched.source_analysis = {
198
+ "cargo_analysis_available": False,
199
+ "note": "Cargo analysis requires local crate source code"
200
+ }
201
+
202
+ logging.info(f"Enriched {crate.name}")
203
+ return enriched
204
+ except Exception as e:
205
+ logging.error(f"Failed to enrich {crate.name}: {e}")
206
+ # Return a partially enriched crate to avoid data loss
207
+ enriched_dict = crate.to_dict()
208
+ return EnrichedCrate(**enriched_dict)
209
+
210
+ async def _enhance_with_scraping(self, crate: CrateMetadata) -> None:
211
+ """
212
+ Enhances a single crate with advanced web scraping data.
213
+ Modifies the crate object in place.
214
+ """
215
+ if not self.enhanced_scraper:
216
+ return
275
217
 
218
+ try:
219
+ scraping_results = await self.enhanced_scraper.scrape_crate_documentation(crate.name)
220
+ if scraping_results:
221
+ self._integrate_scraping_results(crate, scraping_results)
276
222
  logging.info(
277
- f"Enhanced scraping for {crate.name}: {len(scraping_results)} sources")
278
-
279
- except Exception as e:
280
- logging.warning(
281
- f"Enhanced scraping failed for {crate.name}: {e}")
282
- enhanced_batch.append(crate)
283
-
284
- return enhanced_batch
285
-
286
- def _integrate_scraping_results(self,
287
- crate: CrateMetadata,
288
- scraping_results: Dict[str,
289
- EnhancedScrapingResult]) -> CrateMetadata:
290
- """Integrate enhanced scraping results into crate metadata"""
291
- # Create a copy of the crate to avoid modifying the original
292
- enhanced_crate = CrateMetadata(**crate.__dict__)
293
-
294
- # Add enhanced scraping data
295
- enhanced_crate.enhanced_scraping = {}
223
+ f"Enhanced scraping for {crate.name}: "
224
+ f"{len(scraping_results)} sources"
225
+ )
226
+ except Exception as e:
227
+ logging.warning(f"Enhanced scraping failed for {crate.name}: {e}")
228
+
229
+ def _integrate_scraping_results(
230
+ self,
231
+ crate: CrateMetadata,
232
+ scraping_results: "Dict[str, Any]",
233
+ ) -> None:
234
+ """
235
+ Integrates enhanced scraping results into the crate metadata.
236
+ Modifies the crate object in place.
237
+ """
238
+ crate.enhanced_scraping = {}
296
239
 
297
240
  for source, result in scraping_results.items():
298
- if result.error:
241
+ if not result or result.error:
299
242
  continue
300
243
 
301
- enhanced_crate.enhanced_scraping[source] = {
302
- 'title': result.title,
303
- 'quality_score': result.quality_score,
304
- 'extraction_method': result.extraction_method,
305
- 'structured_data': result.structured_data,
306
- 'content_length': len(result.content)
307
- } # Update README if we got better content
308
- if source == 'docs_rs' and result.quality_score > 0.7:
309
- if not enhanced_crate.readme or len(
310
- result.content) > len(
311
- enhanced_crate.readme):
312
- enhanced_crate.readme = result.content
313
- logging.info(
314
- f"Updated README for {crate.name} from {source}")
244
+ crate.enhanced_scraping[source] = {
245
+ "title": result.title,
246
+ "quality_score": result.quality_score,
247
+ "extraction_method": result.extraction_method,
248
+ "structured_data": result.structured_data,
249
+ "content_length": len(result.content),
250
+ }
251
+ # Update README if we got better content
252
+ if source == "docs_rs" and result.quality_score > 0.7:
253
+ if not crate.readme or len(result.content) > len(crate.readme):
254
+ crate.readme = result.content
255
+ logging.info(f"Updated README for {crate.name} from {source}")
315
256
 
316
257
  # Extract additional metadata from structured data
317
- if result.structured_data:
318
- if 'features' in result.structured_data and isinstance(
319
- result.structured_data['features'], list):
320
- enhanced_crate.enhanced_features = result.structured_data['features']
321
-
322
- if 'dependencies' in result.structured_data and isinstance(
323
- result.structured_data['dependencies'], list):
324
- enhanced_crate.enhanced_dependencies = result.structured_data['dependencies']
325
-
326
- if 'examples' in result.structured_data and isinstance(
327
- result.structured_data['examples'], list):
328
- enhanced_crate.code_snippets.extend(
329
- result.structured_data['examples'])
330
-
331
- return enhanced_crate
332
-
333
- def analyze_dependencies(self, crates: List[EnrichedCrate]) -> Dict:
334
- """Analyze dependencies between crates"""
258
+ structured_data = result.structured_data or {}
259
+ if "features" in structured_data and isinstance(
260
+ structured_data["features"], list
261
+ ):
262
+ crate.enhanced_features = structured_data["features"]
263
+ if "dependencies" in structured_data and isinstance(
264
+ structured_data["dependencies"], list
265
+ ):
266
+ crate.enhanced_dependencies = structured_data["dependencies"]
267
+ if "examples" in structured_data and isinstance(
268
+ structured_data["examples"], list
269
+ ):
270
+ crate.code_snippets.extend(structured_data["examples"])
271
+
272
+ def analyze_dependencies(self, crates: "List[EnrichedCrate]") -> "Dict[str, Any]":
273
+ """Analyze dependencies between crates."""
335
274
  return DependencyAnalyzer.analyze_dependencies(crates)
336
275
 
337
- def save_checkpoint(self, data: List[EnrichedCrate], prefix: str):
338
- """Save processing checkpoint with status metadata"""
276
+ def save_checkpoint(self, data: "List[EnrichedCrate]", prefix: str) -> str:
277
+ """Saves a processing checkpoint to a file."""
339
278
  timestamp = time.strftime("%Y%m%d-%H%M%S")
340
279
  filename = os.path.join(self.output_dir, f"{prefix}_{timestamp}.jsonl")
341
280
 
342
281
  with open(filename, "w") as f:
343
282
  for item in data:
344
- # Convert to dict for serialization
345
- item_dict = item.__dict__.copy()
346
- f.write(json.dumps(item_dict) + "\n")
347
-
348
- # Save status metadata
349
- status = {
350
- "timestamp": timestamp,
351
- "total_crates": len(data),
352
- "processed_crates": sum(
353
- 1 for c in data if c.use_case is not None),
354
- "advanced_analysis": sum(
355
- 1 for c in data if c.source_analysis is not None),
356
- "checkpoint_file": filename}
357
-
358
- status_file = os.path.join(
359
- self.output_dir,
360
- f"{prefix}_status_{timestamp}.json")
361
- with open(status_file, "w") as f:
362
- json.dump(status, f, indent=2)
283
+ f.write(json.dumps(item.to_dict()) + "\n")
363
284
 
364
285
  logging.info(f"Saved checkpoint to {filename}")
365
286
  return filename
366
287
 
367
288
  def save_final_output(
368
- self,
369
- data: List[EnrichedCrate],
370
- dependency_data: Dict):
371
- """Save final enriched data and analysis"""
289
+ self, data: "List[EnrichedCrate]", dependency_data: "Dict[str, Any]"
290
+ ) -> None:
291
+ """Saves the final enriched data and analysis reports."""
372
292
  timestamp = time.strftime("%Y%m%d-%H%M%S")
373
293
 
374
294
  # Save main enriched data
375
- final_output = os.path.join(
376
- self.output_dir,
377
- f"enriched_crate_metadata_{timestamp}.jsonl")
378
- with open(final_output, "w") as f:
295
+ final_output_path = os.path.join(
296
+ self.output_dir, f"enriched_crate_metadata_{timestamp}.jsonl"
297
+ )
298
+ with open(final_output_path, "w") as f:
379
299
  for item in data:
380
- item_dict = item.__dict__.copy()
381
- f.write(json.dumps(item_dict) + "\n")
300
+ f.write(json.dumps(item.to_dict()) + "\n")
382
301
 
383
302
  # Save dependency analysis
384
- dep_file = os.path.join(
385
- self.output_dir,
386
- f"dependency_analysis_{timestamp}.json")
387
- with open(dep_file, "w") as f:
303
+ dep_file_path = os.path.join(
304
+ self.output_dir, f"dependency_analysis_{timestamp}.json"
305
+ )
306
+ with open(dep_file_path, "w") as f:
388
307
  json.dump(dependency_data, f, indent=2)
389
308
 
390
- # Generate summary report
309
+ # Generate and save summary report
310
+ self._generate_summary_report(data, dependency_data, timestamp)
311
+
312
+ logging.info(f"Results saved to {self.output_dir}/")
313
+
314
+ def _generate_summary_report(
315
+ self,
316
+ data: "List[EnrichedCrate]",
317
+ dependency_data: "Dict[str, Any]",
318
+ timestamp: str,
319
+ ) -> None:
320
+ """Generates a summary report of the pipeline run."""
391
321
  summary = {
392
322
  "total_crates": len(data),
393
323
  "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
394
- "most_popular": sorted([{
395
- "name": c.name,
396
- "score": c.score or 0,
397
- "downloads": c.downloads,
398
- "github_stars": c.github_stars
399
- } for c in data], key=lambda x: x["score"], reverse=True)[:5],
400
- "most_depended_upon": dependency_data.get("most_depended", [])[:5]
324
+ "most_popular": sorted(
325
+ [
326
+ {
327
+ "name": c.name,
328
+ "score": c.score or 0,
329
+ "downloads": c.downloads,
330
+ "github_stars": c.github_stars,
331
+ }
332
+ for c in data
333
+ ],
334
+ key=lambda x: x.get("score", 0),
335
+ reverse=True,
336
+ )[:10],
337
+ "most_depended_upon": dependency_data.get("most_depended", [])[:10],
401
338
  }
402
339
 
403
- summary_file = os.path.join(
404
- self.output_dir,
405
- f"summary_report_{timestamp}.json")
406
- with open(summary_file, "w") as f:
340
+ summary_path = os.path.join(self.output_dir, f"summary_report_{timestamp}.json")
341
+ with open(summary_path, "w") as f:
407
342
  json.dump(summary, f, indent=2)
408
343
 
409
- logging.info(f"Results saved to {self.output_dir}/")
410
-
411
- async def run(self):
412
- """Main pipeline execution flow (async)"""
344
+ async def run(self) -> Union["tuple[List[EnrichedCrate], Dict[str, Any]]", None]:
345
+ """Main pipeline execution flow."""
413
346
  start_time = time.time()
347
+ if not self.crates:
348
+ logging.error("No crates to process. Exiting.")
349
+ return None
350
+
414
351
  logging.info(f"Processing {len(self.crates)} crates...")
415
352
 
416
- # Process in batches
417
- all_enriched = []
418
- crate_batches = [self.crates[i:i + self.config.batch_size]
419
- for i in range(0, len(self.crates), self.config.batch_size)]
353
+ all_enriched: "List[EnrichedCrate]" = []
354
+ batch_size = self.config.batch_size
355
+ crate_batches = [
356
+ self.crates[i : i + batch_size]
357
+ for i in range(0, len(self.crates), batch_size)
358
+ ]
420
359
 
421
- for batch_num, batch in enumerate(crate_batches):
360
+ for i, batch_names in enumerate(crate_batches):
422
361
  logging.info(
423
- f"Processing batch {batch_num + 1}/{len(crate_batches)} ({len(batch)} crates)")
424
-
425
- # Fetch metadata (async)
426
- batch_data = await self.fetch_metadata_batch(batch)
362
+ f"Processing batch {i + 1}/{len(crate_batches)} "
363
+ f"({len(batch_names)} crates)"
364
+ )
365
+
366
+ # Fetch metadata
367
+ metadata_batch = await self.fetch_metadata_batch(batch_names)
368
+ if not metadata_batch:
369
+ logging.warning(f"Batch {i+1} yielded no metadata. Skipping.")
370
+ continue
427
371
 
428
- # Enrich the batch (async)
429
- enriched_batch = await self.enrich_batch(batch_data)
372
+ # Enrich the batch
373
+ enriched_batch = await self.enrich_batch(metadata_batch)
430
374
  all_enriched.extend(enriched_batch)
431
375
 
432
- # Save checkpoint after each batch
433
- self.save_checkpoint(all_enriched, "batch_checkpoint")
376
+ # Save checkpoint
377
+ self.save_checkpoint(all_enriched, f"checkpoint_batch_{i + 1}")
434
378
  logging.info(
435
- f"Completed batch {batch_num + 1}, processed {len(all_enriched)}/{len(self.crates)} crates so far")
436
-
437
- # Optional: Add source analysis for some crates
438
- if batch_num < 2: # Only do detailed analysis for first 2 batches
439
- for crate in enriched_batch:
440
- try:
441
- crate.source_analysis = SourceAnalyzer.analyze_crate_source(
442
- crate)
443
- crate.security = SecurityAnalyzer.check_security_metrics(
444
- crate)
445
- crate.user_behavior = UserBehaviorAnalyzer.fetch_user_behavior_data(
446
- crate)
447
- logging.info(
448
- f"Advanced analysis completed for {crate.name}")
449
- except Exception as e:
450
- logging.warning(
451
- f"Advanced analysis failed for {crate.name}: {str(e)}")
452
-
453
- # Step 3: Perform dependency analysis
379
+ f"Completed batch {i + 1}, "
380
+ f"processed {len(all_enriched)}/{len(self.crates)} crates"
381
+ )
382
+
383
+ # Final analysis and saving
454
384
  logging.info("Analyzing crate dependencies...")
455
385
  dependency_analysis = self.analyze_dependencies(all_enriched)
456
-
457
- # Save final results
458
386
  self.save_final_output(all_enriched, dependency_analysis)
459
387
 
460
- # Final summary
461
388
  duration = time.time() - start_time
462
- logging.info(
463
- f"✅ Done. Enriched {len(all_enriched)} crates in {duration:.2f}s")
464
-
389
+ logging.info(f"[OK] Done. Enriched {len(all_enriched)} crates in {duration:.2f}s")
465
390
  return all_enriched, dependency_analysis
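
Taken together, the 1.4.1 pipeline is driven through the async run() method, which now returns the enriched crates and the dependency analysis (or None when the crate list is empty). A hedged driver sketch, assuming PipelineConfig accepts the fields referenced in this diff (batch_size, output_path, enable_crawl4ai, use_azure_openai) as keyword arguments; the values shown are placeholders, not documented defaults:

    # Hypothetical end-to-end driver for the refactored pipeline shown above.
    import asyncio

    from rust_crate_pipeline.config import PipelineConfig
    from rust_crate_pipeline.pipeline import CrateDataPipeline


    async def main() -> None:
        # Keyword arguments are assumptions based on attributes used in pipeline.py.
        config = PipelineConfig(
            batch_size=10,
            output_path="./output",
            enable_crawl4ai=True,
            use_azure_openai=False,
        )
        pipeline = CrateDataPipeline(config)
        result = await pipeline.run()  # (enriched_crates, dependency_analysis) or None
        if result:
            enriched, dependency_analysis = result
            print(f"Enriched {len(enriched)} crates")


    if __name__ == "__main__":
        asyncio.run(main())
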