rust-crate-pipeline 1.2.6-py3-none-any.whl → 1.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. rust_crate_pipeline/__init__.py +25 -25
  2. rust_crate_pipeline/__main__.py +1 -0
  3. rust_crate_pipeline/ai_processing.py +309 -200
  4. rust_crate_pipeline/analysis.py +304 -368
  5. rust_crate_pipeline/azure_ai_processing.py +453 -0
  6. rust_crate_pipeline/config.py +57 -19
  7. rust_crate_pipeline/core/__init__.py +19 -0
  8. rust_crate_pipeline/core/canon_registry.py +133 -0
  9. rust_crate_pipeline/core/irl_engine.py +256 -0
  10. rust_crate_pipeline/core/sacred_chain.py +117 -0
  11. rust_crate_pipeline/crate_analysis.py +54 -0
  12. rust_crate_pipeline/crate_list.txt +424 -0
  13. rust_crate_pipeline/github_token_checker.py +42 -36
  14. rust_crate_pipeline/main.py +386 -102
  15. rust_crate_pipeline/network.py +153 -133
  16. rust_crate_pipeline/pipeline.py +340 -264
  17. rust_crate_pipeline/production_config.py +35 -32
  18. rust_crate_pipeline/scraping/__init__.py +13 -0
  19. rust_crate_pipeline/scraping/unified_scraper.py +259 -0
  20. rust_crate_pipeline/unified_llm_processor.py +637 -0
  21. rust_crate_pipeline/unified_pipeline.py +548 -0
  22. rust_crate_pipeline/utils/file_utils.py +45 -14
  23. rust_crate_pipeline/utils/logging_utils.py +34 -17
  24. rust_crate_pipeline/version.py +47 -2
  25. rust_crate_pipeline-1.3.0.dist-info/METADATA +331 -0
  26. rust_crate_pipeline-1.3.0.dist-info/RECORD +30 -0
  27. rust_crate_pipeline-1.2.6.dist-info/METADATA +0 -573
  28. rust_crate_pipeline-1.2.6.dist-info/RECORD +0 -19
  29. {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.3.0.dist-info}/WHEEL +0 -0
  30. {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.3.0.dist-info}/entry_points.txt +0 -0
  31. {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.3.0.dist-info}/licenses/LICENSE +0 -0
  32. {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.3.0.dist-info}/top_level.txt +0 -0
rust_crate_pipeline/pipeline.py
@@ -3,319 +3,395 @@ import os
  import time
  import logging
  import json
- from concurrent.futures import ThreadPoolExecutor, as_completed
- from tqdm import tqdm
- from typing import List, Dict, Optional
+ import asyncio
+ from typing import Any, Union, TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     from typing import Dict, List, Optional
+
  from .config import PipelineConfig, CrateMetadata, EnrichedCrate
  from .network import CrateAPIClient, GitHubBatchClient
  from .ai_processing import LLMEnricher
- from .analysis import SourceAnalyzer, SecurityAnalyzer, UserBehaviorAnalyzer, DependencyAnalyzer
+ from .analysis import DependencyAnalyzer
+ from .crate_analysis import CrateAnalyzer
+
+ # Import Azure OpenAI enricher
+ try:
+     from .azure_ai_processing import AzureOpenAIEnricher
+     AZURE_OPENAI_AVAILABLE = True
+ except ImportError:
+     AZURE_OPENAI_AVAILABLE = False
+     AzureOpenAIEnricher = None
+
+ # Import enhanced scraping capabilities
+ try:
+     import sys
+
+     sys.path.append(os.path.dirname(os.path.dirname(__file__)))
+     from enhanced_scraping import (
+         CrateDocumentationScraper,
+         EnhancedScrapingResult,
+     )
+
+     ENHANCED_SCRAPING_AVAILABLE = True
+ except ImportError:
+     ENHANCED_SCRAPING_AVAILABLE = False
+     CrateDocumentationScraper = None  # type: ignore[assignment,misc]
+     EnhancedScrapingResult = None  # type: ignore[assignment,misc]
+     logging.warning("Enhanced scraping not available - using basic methods")
+

  class CrateDataPipeline:
-     def __init__(self, config: PipelineConfig):
+     """Orchestrates the entire data collection, enrichment, and analysis pipeline."""
+
+     def __init__(self, config: PipelineConfig) -> None:
          self.config = config
          self.api_client = CrateAPIClient(config)
          self.github_client = GitHubBatchClient(config)
-         self.enricher = LLMEnricher(config)
-         self.crates = self.get_crate_list()
-         self.output_dir = self._create_output_dir()

+         # Initialize the appropriate AI enricher based on configuration
+         if config.use_azure_openai and AZURE_OPENAI_AVAILABLE and AzureOpenAIEnricher is not None:
+             try:
+                 self.enricher = AzureOpenAIEnricher(config)
+                 logging.info("✅ Using Azure OpenAI enricher")
+             except Exception as e:
+                 logging.warning(f"⚠️ Failed to initialize Azure OpenAI enricher: {e}")
+                 logging.info("🔄 Falling back to local LLM enricher")
+                 self.enricher = LLMEnricher(config)
+         else:
+             if config.use_azure_openai and not AZURE_OPENAI_AVAILABLE:
+                 logging.warning("⚠️ Azure OpenAI requested but not available")
+             self.enricher = LLMEnricher(config)
+             logging.info("✅ Using local LLM enricher")
+
+         # Initialize cargo analyzer
+         self.cargo_analyzer = CrateAnalyzer(".")
+
+         self.crates = self._get_crate_list()
+         self.output_dir = self._create_output_dir()
+         self.enhanced_scraper: Union[CrateDocumentationScraper, None] = (
+             self._initialize_enhanced_scraper()
+         )
+
+     def _initialize_enhanced_scraper(self) -> Union[CrateDocumentationScraper, None]:
+         """Initializes the CrateDocumentationScraper if available and enabled."""
+         if (
+             not ENHANCED_SCRAPING_AVAILABLE
+             or not self.config.enable_crawl4ai
+             or CrateDocumentationScraper is None
+         ):
+             return None
+         try:
+             scraper = CrateDocumentationScraper()
+             logging.info("✅ Enhanced scraping with Crawl4AI enabled")
+             return scraper
+         except Exception as e:
+             logging.warning(f"❌ Failed to initialize enhanced scraping: {e}")
+             return None
+
      def _create_output_dir(self) -> str:
+         """Creates a timestamped output directory for pipeline results."""
          timestamp = time.strftime("%Y%m%d-%H%M%S")
-         output_dir = f"crate_data_{timestamp}"
+         output_dir = os.path.join(self.config.output_path, f"crate_data_{timestamp}")
          os.makedirs(output_dir, exist_ok=True)
          return output_dir

-     def get_crate_list(self, limit: Optional[int] = None) -> List[str]:
-         """Return a comprehensive list of all high-value crates to process"""
-         crates = [
-             # Web frameworks and servers
-             "actix-web", "rocket", "axum", "warp", "tower", "tide", "gotham", "iron",
-             "nickel", "rouille", "thruster", "poem", "salvo", "viz", "ntex", "may-minihttp",
-             "tiny_http", "httptest", "mockito", "wiremock",
-
-             # Async runtimes and utilities
-             "tokio", "tokio-stream", "async-trait", "futures", "async-std", "smol",
-             "embassy", "embassy-executor", "embassy-time", "embassy-sync", "async-channel",
-             "async-broadcast", "async-lock", "async-once", "async-recursion", "futures-util",
-             "futures-channel", "futures-timer", "futures-test", "pin-project", "pin-project-lite",
-
-             # Serialization/deserialization
-             "serde", "serde_json", "serde_yaml", "bincode", "toml", "ron", "postcard",
-             "ciborium", "rmp-serde", "quick-xml", "roxmltree", "serde_cbor", "serde_derive",
-             "serde_repr", "serde_with", "serde_bytes", "flexbuffers", "bson", "avro-rs",
-
-             # Error handling and debugging
-             "anyhow", "thiserror", "eyre", "color-eyre", "miette", "fehler", "snafu",
-             "failure", "quick-error", "derive_more", "displaydoc", "backtrace", "better-panic",
-             # Command line and terminal
-             "clap", "structopt", "argh", "gumdrop", "docopt", "getopts", "pico-args",
-             "crossterm", "termion", "console", "indicatif", "dialoguer", "termcolor",
-             "colored", "yansi", "owo-colors", "nu-ansi-term", "terminal_size",
-             # Utilities and general purpose
-             "rand", "uuid", "itertools", "num", "cfg-if", "bytes", "mime",
-             "form_urlencoded", "csv", "once_cell", "base64", "flate2", "tar", "dirs",
-             "walkdir", "glob", "bitflags", "indexmap", "smallvec", "arrayvec", "tinyvec",
-             "ahash", "fxhash", "rustc-hash", "seahash", "siphasher", "wyhash", "xxhash-rust",
-             "getrandom", "fastrand", "nanorand", "url", "percent-encoding", "unicode-segmentation",
-             "unicode-normalization", "unicode-width", "memchr", "aho-corasick", "bstr",
-             # HTTP clients and servers
-             "reqwest", "hyper", "surf", "ureq", "attohttpc", "isahc", "curl", "libcurl-sys",
-             "http", "http-body", "httparse", "hyper-tls", "hyper-rustls", "native-tls",
-             "webpki", "webpki-roots",
-
-             # Database and storage
-             "sqlx", "diesel", "postgres", "rusqlite", "mysql", "mongodb", "redis",
-             "tokio-postgres", "deadpool-postgres", "bb8", "r2d2", "sea-orm", "rbatis",
-             "sled", "rocksdb", "lmdb", "redb", "pickledb", "persy", "heed", "fjall",
-             # Concurrency and parallelism
-             "rayon", "crossbeam", "crossbeam-channel", "crossbeam-utils", "crossbeam-epoch",
-             "crossbeam-deque", "parking_lot", "spin", "atomic", "arc-swap", "dashmap",
-             "flume", "kanal", "tokio-util", "futures-concurrency",
-             # Protocol buffers, gRPC, and messaging
-             "prost", "tonic", "protobuf", "grpcio", "tarpc", "capnp", "rmp",
-             "zmq", "nanomsg", "nats", "rdkafka", "pulsar", "lapin", "amqp", "rumqttc",
-             # Procedural macros and metaprogramming
-             "syn", "quote", "proc-macro2", "proc-macro-crate", "proc-macro-error",
-             "darling", "derive_builder", "strum", "strum_macros",
-             "enum-iterator", "num-derive", "num-traits", "paste", "lazy_static",
-
-             # Cryptography and security
-             "ring", "rustls", "openssl", "sha2", "sha3", "blake2", "blake3", "md5",
-             "hmac", "pbkdf2", "scrypt", "argon2", "bcrypt", "chacha20poly1305",
-             "aes-gcm", "rsa", "ed25519-dalek", "x25519-dalek", "curve25519-dalek",
-             "secp256k1", "k256", "p256", "ecdsa", "signature", "rand_core",
-
-             # Game development and graphics
-             "bevy", "macroquad", "ggez", "piston", "winit", "wgpu", "vulkano", "glium",
-             "three-d", "kiss3d", "nalgebra", "cgmath", "glam", "ultraviolet", "mint",
-             "image", "imageproc", "resvg", "tiny-skia", "lyon", "femtovg", "skulpin",
-             # Networking and protocols
-             "socket2", "mio", "polling", "async-io", "calloop", "quinn",
-             "rustls-pemfile", "trust-dns", "hickory-dns", "async-h1", "h2", "h3",
-             "websocket", "tokio-tungstenite", "tungstenite", "ws", "warp-ws",
-
-             # Text processing and parsing
-             "regex", "regex-syntax", "pest", "pest_derive", "nom", "combine", "winnow",
-             "lalrpop", "chumsky", "logos", "lex", "yacc", "tree-sitter", "syntect",
-             "pulldown-cmark", "comrak", "markdown", "ammonia", "scraper", "kuchiki",
-
-             # System programming and OS interfaces
-             "libc", "winapi", "windows", "nix", "users", "sysinfo", "procfs", "psutil",
-             "notify", "inotify", "hotwatch", "signal-hook", "ctrlc", "daemonize",
-             "fork", "shared_memory", "memmap2", "mlock", "caps", "uzers",
-             # Testing and development tools
-             "criterion", "proptest", "quickcheck", "rstest", "serial_test", "mockall",
-             "httpmock", "assert_cmd", "assert_fs", "predicates", "tempfile",
-             "insta", "goldenfile", "similar", "difference", "pretty_assertions",
-
-             # Configuration and environment
-             "config", "figment", "envy", "dotenv", "confy", "directories", "app_dirs",
-             "etcetera", "platform-dirs", "home", "which", "dunce", "normpath",
-
-             # Logging and observability
-             "log", "env_logger", "tracing", "tracing-subscriber", "tracing-futures",
-             "tracing-actix-web", "tracing-log", "slog", "fern", "flexi_logger",
-             "log4rs", "simplelog", "stderrlog", "pretty_env_logger", "fast_log",
-
-             # Time and date
-             "chrono", "time", "humantime", "chrono-tz", "chrono-english", "ical",
-             "cron", "tokio-cron-scheduler", "job_scheduler", "delay_timer",
-
-             # Machine Learning & AI
-             "tokenizers", "safetensors", "linfa", "ndarray", "smartcore", "burn",
-             "tract-core", "tract-onnx", "tract-hir", "tract-linalg", "tract-data",
-             "tract-nnef", "tract-onnx-opl", "tract-pulse", "tract-pulse-opl",
-             "tract-nnef-resources", "tch", "torch-sys", "ort", "ort-sys", "candle-core",
-             "candle-nn", "candle-transformers", "candle-kernels", "candle-onnx",
-             "candle-metal-kernels", "tiktoken-rs", "tensorflow", "tensorflow-sys",
-             "onnxruntime", "onnxruntime-sys", "onnx-protobuf", "llama-cpp-2",
-             "llama-cpp-sys-2", "llm", "llm-samplers", "llm-chain", "llm-chain-openai",
-             "llama-core", "llamaedge", "openai", "openai-api-rs", "openai_dive",
-             "genai", "aleph-alpha-client", "llm_api_access", "ollama-rs",
-             "rust-bert", "fastembed", "hf-hub", "whisper-rs-sys", "toktrie",
-             "toktrie_hf_tokenizers", "toktrie_hf_downloader", "rust_tokenizers",
+     def _get_crate_list(self) -> "List[str]":
+         """
+         Loads the list of crates to process from an external file.
+         This approach is more modular and easier to maintain than a hardcoded list.
+         """
+         crate_list_path = os.path.join(os.path.dirname(__file__), "crate_list.txt")
+         try:
+             with open(crate_list_path) as f:
+                 crates = [line.strip() for line in f if line.strip()]
+             logging.info(f"Loaded {len(crates)} crates from {crate_list_path}")
+             if not crates:
+                 logging.warning(f"Crate list at {crate_list_path} is empty.")
+             return crates
+         except FileNotFoundError:
+             logging.error(f"Crate list file not found at: {crate_list_path}")
+             return []
+
+     def get_crate_list(self) -> "List[str]":
+         """
+         Public method to get the list of crates.
+         Returns the already loaded crate list or loads it if not available.
+         """
+         if hasattr(self, "crates") and self.crates:
+             return self.crates
+         else:
+             return self._get_crate_list()
+
+     async def fetch_metadata_batch(self, crate_names: "List[str]") -> "List[CrateMetadata]":
+         """
+         Fetches metadata for a batch of crates using asyncio-based parallel processing.
+         """
+
+         async def fetch_single_crate_safe(
+             crate_name: str,
+         ) -> Union[CrateMetadata, None]:
+             try:
+                 loop = asyncio.get_running_loop()
+                 data = await loop.run_in_executor(
+                     None, self.api_client.fetch_crate_metadata, crate_name
+                 )
+                 if not data:
+                     return None
+
+                 return CrateMetadata(
+                     name=data.get("name", ""),
+                     version=data.get("version", ""),
+                     description=data.get("description", ""),
+                     repository=data.get("repository", ""),
+                     keywords=data.get("keywords", []),
+                     categories=data.get("categories", []),
+                     readme=data.get("readme", ""),
+                     downloads=data.get("downloads", 0),
+                     github_stars=data.get("github_stars", 0),
+                     dependencies=data.get("dependencies", []),
+                     features=data.get("features", {}),
+                     code_snippets=data.get("code_snippets", []),
+                     readme_sections=data.get("readme_sections", {}),
+                     librs_downloads=data.get("librs_downloads"),
+                     source=data.get("source", "crates.io"),
+                 )
+
+             except Exception as e:
+                 logging.error(f"Error fetching metadata for {crate_name}: {e}")
+                 return None
+
+         tasks = [fetch_single_crate_safe(name) for name in crate_names]
+         results_raw = await asyncio.gather(*tasks)
+         results = [r for r in results_raw if r]
+         logging.info(
+             f"Fetched metadata for {len(results)} out of "
+             f"{len(crate_names)} requested crates."
+         )
+         return results
+
+     async def enrich_batch(self, batch: "List[CrateMetadata]") -> "List[EnrichedCrate]":
+         """Enriches a batch of crates with GitHub stats, enhanced scraping, and AI."""
+         # Update GitHub stats
+         github_repos = [
+             c.repository for c in batch if c.repository and "github.com" in c.repository
          ]
-
-         if limit is not None:
-             return crates[:limit]
-         return crates
-
-     def fetch_metadata_batch(self, crate_names: List[str]) -> List[CrateMetadata]:
-         """Fetch metadata for a batch of crates in parallel"""
-         with ThreadPoolExecutor(max_workers=self.config.n_workers) as executor:
-             futures = {executor.submit(self.api_client.fetch_crate_metadata, name): name
-                        for name in crate_names}
+         if github_repos:
+             repo_stats = self.github_client.batch_get_repo_stats(github_repos)
+             for crate in batch:
+                 if crate.repository in repo_stats:
+                     stats = repo_stats[crate.repository]
+                     crate.github_stars = stats.get("stargazers_count", 0)
+
+         # Asynchronously enhance with scraping and AI
+         enrichment_tasks = [self._enrich_single_crate(crate) for crate in batch]
+         enriched_results = await asyncio.gather(*enrichment_tasks)
+         return [result for result in enriched_results if result]
+
+     async def _enrich_single_crate(self, crate: CrateMetadata) -> Union[EnrichedCrate, None]:
+         """Helper to enrich a single crate with scraping, AI analysis, and cargo analysis."""
+         try:
+             # Enhanced scraping if available
+             if self.enhanced_scraper:
+                 await self._enhance_with_scraping(crate)
+
+             # Now enrich with AI
+             enriched = self.enricher.enrich_crate(crate)

-             results = []
-             for future in as_completed(futures):
-                 crate_name = futures[future]
-                 try:
-                     data = future.result()
-                     if data:
-                         # Convert dict to CrateMetadata
-                         crate_metadata = CrateMetadata(
-                             name=data.get("name", ""),
-                             version=data.get("version", ""),
-                             description=data.get("description", ""),
-                             repository=data.get("repository", ""),
-                             keywords=data.get("keywords", []),
-                             categories=data.get("categories", []),
-                             readme=data.get("readme", ""),
-                             downloads=data.get("downloads", 0),
-                             github_stars=data.get("github_stars", 0),
-                             dependencies=data.get("dependencies", []),
-                             features=data.get("features", []),
-                             code_snippets=data.get("code_snippets", []),
-                             readme_sections=data.get("readme_sections", {}),
-                             librs_downloads=data.get("librs_downloads"),
-                             source=data.get("source", "crates.io")
-                         )
-                         results.append(crate_metadata)
-                         logging.info(f"Fetched metadata for {crate_name}")
-                 except Exception as e:
-                     logging.error(f"Failed to fetch metadata for {crate_name}: {str(e)}")
+             # Add cargo analysis if we have a local crate directory
+             # Note: This would require downloading/cloning the crate first
+             # For now, we'll add a placeholder for cargo analysis
+             enriched.source_analysis = {
+                 "cargo_analysis_available": False,
+                 "note": "Cargo analysis requires local crate source code"
+             }

-             return results
+             logging.info(f"Enriched {crate.name}")
+             return enriched
+         except Exception as e:
+             logging.error(f"Failed to enrich {crate.name}: {e}")
+             # Return a partially enriched crate to avoid data loss
+             enriched_dict = crate.to_dict()
+             return EnrichedCrate(**enriched_dict)

-     def enrich_batch(self, batch: List[CrateMetadata]) -> List[EnrichedCrate]:
-         """Enrich a batch of crates with GitHub stats and AI"""
-         # Add GitHub stats first
-         github_repos = [c.repository for c in batch if "github.com" in c.repository]
-         repo_stats = self.github_client.batch_get_repo_stats(github_repos)
-
-         # Update crates with GitHub info
-         for crate in batch:
-             repo_url = crate.repository
-             if repo_url in repo_stats:
-                 stats = repo_stats[repo_url]
-                 crate.github_stars = stats.get("stargazers_count", 0)
-
-         # Now enrich with AI
-         enriched_batch = []
-         for crate in batch:
-             try:
-                 enriched = self.enricher.enrich_crate(crate)
-                 enriched_batch.append(enriched)
-                 logging.info(f"Enriched {crate.name}")
-             except Exception as e:
-                 logging.error(f"Failed to enrich {crate.name}: {str(e)}")
-                 # Add the crate with just the fields we have
-                 enriched_dict = crate.__dict__.copy()
-                 enriched_batch.append(EnrichedCrate(**enriched_dict))
-
-         return enriched_batch
-
-     def analyze_dependencies(self, crates: List[EnrichedCrate]) -> Dict:
-         """Analyze dependencies between crates"""
+     async def _enhance_with_scraping(self, crate: CrateMetadata) -> None:
+         """
+         Enhances a single crate with advanced web scraping data.
+         Modifies the crate object in place.
+         """
+         if not self.enhanced_scraper:
+             return
+
+         try:
+             scraping_results = await self.enhanced_scraper.scrape_crate_info(crate.name)
+             if scraping_results:
+                 self._integrate_scraping_results(crate, scraping_results)
+                 logging.info(
+                     f"Enhanced scraping for {crate.name}: "
+                     f"{len(scraping_results)} sources"
+                 )
+         except Exception as e:
+             logging.warning(f"Enhanced scraping failed for {crate.name}: {e}")
+
+     def _integrate_scraping_results(
+         self,
+         crate: CrateMetadata,
+         scraping_results: "Dict[str, EnhancedScrapingResult]",
+     ) -> None:
+         """
+         Integrates enhanced scraping results into the crate metadata.
+         Modifies the crate object in place.
+         """
+         crate.enhanced_scraping = {}
+
+         for source, result in scraping_results.items():
+             if not result or result.error:
+                 continue
+
+             crate.enhanced_scraping[source] = {
+                 "title": result.title,
+                 "quality_score": result.quality_score,
+                 "extraction_method": result.extraction_method,
+                 "structured_data": result.structured_data,
+                 "content_length": len(result.content),
+             }
+             # Update README if we got better content
+             if source == "docs_rs" and result.quality_score > 0.7:
+                 if not crate.readme or len(result.content) > len(crate.readme):
+                     crate.readme = result.content
+                     logging.info(f"Updated README for {crate.name} from {source}")
+
+             # Extract additional metadata from structured data
+             structured_data = result.structured_data or {}
+             if "features" in structured_data and isinstance(
+                 structured_data["features"], list
+             ):
+                 crate.enhanced_features = structured_data["features"]
+             if "dependencies" in structured_data and isinstance(
+                 structured_data["dependencies"], list
+             ):
+                 crate.enhanced_dependencies = structured_data["dependencies"]
+             if "examples" in structured_data and isinstance(
+                 structured_data["examples"], list
+             ):
+                 crate.code_snippets.extend(structured_data["examples"])
+
+     def analyze_dependencies(self, crates: "List[EnrichedCrate]") -> "Dict[str, Any]":
+         """Analyze dependencies between crates."""
          return DependencyAnalyzer.analyze_dependencies(crates)

-     def save_checkpoint(self, data: List[EnrichedCrate], prefix: str):
-         """Save processing checkpoint with status metadata"""
+     def save_checkpoint(self, data: "List[EnrichedCrate]", prefix: str) -> str:
+         """Saves a processing checkpoint to a file."""
          timestamp = time.strftime("%Y%m%d-%H%M%S")
          filename = os.path.join(self.output_dir, f"{prefix}_{timestamp}.jsonl")
-
+
          with open(filename, "w") as f:
              for item in data:
-                 # Convert to dict for serialization
-                 item_dict = item.__dict__.copy()
-                 f.write(json.dumps(item_dict) + "\n")
-
-         # Save status metadata
-         status = {
-             "timestamp": timestamp,
-             "total_crates": len(data),
-             "processed_crates": sum(1 for c in data if c.use_case is not None),
-             "advanced_analysis": sum(1 for c in data if c.source_analysis is not None),
-             "checkpoint_file": filename
-         }
-
-         status_file = os.path.join(self.output_dir, f"{prefix}_status_{timestamp}.json")
-         with open(status_file, "w") as f:
-             json.dump(status, f, indent=2)
-
+                 f.write(json.dumps(item.to_dict()) + "\n")
+
          logging.info(f"Saved checkpoint to {filename}")
          return filename

-     def save_final_output(self, data: List[EnrichedCrate], dependency_data: Dict):
-         """Save final enriched data and analysis"""
+     def save_final_output(
+         self, data: "List[EnrichedCrate]", dependency_data: "Dict[str, Any]"
+     ) -> None:
+         """Saves the final enriched data and analysis reports."""
          timestamp = time.strftime("%Y%m%d-%H%M%S")
-
+
          # Save main enriched data
-         final_output = os.path.join(self.output_dir, f"enriched_crate_metadata_{timestamp}.jsonl")
-         with open(final_output, "w") as f:
+         final_output_path = os.path.join(
+             self.output_dir, f"enriched_crate_metadata_{timestamp}.jsonl"
+         )
+         with open(final_output_path, "w") as f:
              for item in data:
-                 item_dict = item.__dict__.copy()
-                 f.write(json.dumps(item_dict) + "\n")
-
+                 f.write(json.dumps(item.to_dict()) + "\n")
+
          # Save dependency analysis
-         dep_file = os.path.join(self.output_dir, f"dependency_analysis_{timestamp}.json")
-         with open(dep_file, "w") as f:
+         dep_file_path = os.path.join(
+             self.output_dir, f"dependency_analysis_{timestamp}.json"
+         )
+         with open(dep_file_path, "w") as f:
              json.dump(dependency_data, f, indent=2)

-         # Generate summary report
+         # Generate and save summary report
+         self._generate_summary_report(data, dependency_data, timestamp)
+
+         logging.info(f"Results saved to {self.output_dir}/")
+
+     def _generate_summary_report(
+         self,
+         data: "List[EnrichedCrate]",
+         dependency_data: "Dict[str, Any]",
+         timestamp: str,
+     ) -> None:
+         """Generates a summary report of the pipeline run."""
          summary = {
              "total_crates": len(data),
              "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
-             "most_popular": sorted([{
-                 "name": c.name,
-                 "score": c.score or 0,
-                 "downloads": c.downloads,
-                 "github_stars": c.github_stars
-             } for c in data], key=lambda x: x["score"], reverse=True)[:5],
-             "most_depended_upon": dependency_data.get("most_depended", [])[:5]
+             "most_popular": sorted(
+                 [
+                     {
+                         "name": c.name,
+                         "score": c.score or 0,
+                         "downloads": c.downloads,
+                         "github_stars": c.github_stars,
+                     }
+                     for c in data
+                 ],
+                 key=lambda x: x.get("score", 0),
+                 reverse=True,
+             )[:10],
+             "most_depended_upon": dependency_data.get("most_depended", [])[:10],
          }
-
-         summary_file = os.path.join(self.output_dir, f"summary_report_{timestamp}.json")
-         with open(summary_file, "w") as f:
-             json.dump(summary, f, indent=2)

-         logging.info(f"Results saved to {self.output_dir}/")
+         summary_path = os.path.join(self.output_dir, f"summary_report_{timestamp}.json")
+         with open(summary_path, "w") as f:
+             json.dump(summary, f, indent=2)

-     def run(self):
-         """Main pipeline execution flow"""
+     async def run(self) -> Union["tuple[List[EnrichedCrate], Dict[str, Any]]", None]:
+         """Main pipeline execution flow."""
          start_time = time.time()
+         if not self.crates:
+             logging.error("No crates to process. Exiting.")
+             return None
+
          logging.info(f"Processing {len(self.crates)} crates...")
-
-         # Process in batches
-         all_enriched = []
-         crate_batches = [self.crates[i:i+self.config.batch_size]
-                          for i in range(0, len(self.crates), self.config.batch_size)]

-         for batch_num, batch in enumerate(crate_batches):
-             logging.info(f"Processing batch {batch_num+1}/{len(crate_batches)} ({len(batch)} crates)")
-
+         all_enriched: "List[EnrichedCrate]" = []
+         batch_size = self.config.batch_size
+         crate_batches = [
+             self.crates[i : i + batch_size]
+             for i in range(0, len(self.crates), batch_size)
+         ]
+
+         for i, batch_names in enumerate(crate_batches):
+             logging.info(
+                 f"Processing batch {i + 1}/{len(crate_batches)} "
+                 f"({len(batch_names)} crates)"
+             )
+
              # Fetch metadata
-             batch_data = self.fetch_metadata_batch(batch)
-
+             metadata_batch = await self.fetch_metadata_batch(batch_names)
+             if not metadata_batch:
+                 logging.warning(f"Batch {i+1} yielded no metadata. Skipping.")
+                 continue
+
              # Enrich the batch
-             enriched_batch = self.enrich_batch(batch_data)
+             enriched_batch = await self.enrich_batch(metadata_batch)
              all_enriched.extend(enriched_batch)
-
-             # Save checkpoint after each batch
-             self.save_checkpoint(all_enriched, "batch_checkpoint")
-             logging.info(f"Completed batch {batch_num+1}, processed {len(all_enriched)}/{len(self.crates)} crates so far")
-
-             # Optional: Add source analysis for some crates
-             if batch_num < 2:  # Only do detailed analysis for first 2 batches
-                 for crate in enriched_batch:
-                     try:
-                         crate.source_analysis = SourceAnalyzer.analyze_crate_source(crate)
-                         crate.security = SecurityAnalyzer.check_security_metrics(crate)
-                         crate.user_behavior = UserBehaviorAnalyzer.fetch_user_behavior_data(crate)
-                         logging.info(f"Advanced analysis completed for {crate.name}")
-                     except Exception as e:
-                         logging.warning(f"Advanced analysis failed for {crate.name}: {str(e)}")
-
-         # Step 3: Perform dependency analysis
+
+             # Save checkpoint
+             self.save_checkpoint(all_enriched, f"checkpoint_batch_{i + 1}")
+             logging.info(
+                 f"Completed batch {i + 1}, "
+                 f"processed {len(all_enriched)}/{len(self.crates)} crates"
+             )
+
+         # Final analysis and saving
          logging.info("Analyzing crate dependencies...")
          dependency_analysis = self.analyze_dependencies(all_enriched)
-
-         # Save final results
          self.save_final_output(all_enriched, dependency_analysis)

-         # Final summary
          duration = time.time() - start_time
          logging.info(f"✅ Done. Enriched {len(all_enriched)} crates in {duration:.2f}s")
-
          return all_enriched, dependency_analysis
  return all_enriched, dependency_analysis