rust-crate-pipeline 1.2.6-py3-none-any.whl → 1.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. rust_crate_pipeline/__init__.py +25 -25
  2. rust_crate_pipeline/__main__.py +1 -0
  3. rust_crate_pipeline/ai_processing.py +309 -200
  4. rust_crate_pipeline/analysis.py +304 -368
  5. rust_crate_pipeline/azure_ai_processing.py +453 -0
  6. rust_crate_pipeline/config.py +57 -19
  7. rust_crate_pipeline/core/__init__.py +19 -0
  8. rust_crate_pipeline/core/canon_registry.py +133 -0
  9. rust_crate_pipeline/core/irl_engine.py +256 -0
  10. rust_crate_pipeline/core/sacred_chain.py +117 -0
  11. rust_crate_pipeline/crate_analysis.py +54 -0
  12. rust_crate_pipeline/crate_list.txt +424 -0
  13. rust_crate_pipeline/github_token_checker.py +42 -36
  14. rust_crate_pipeline/main.py +386 -102
  15. rust_crate_pipeline/network.py +153 -133
  16. rust_crate_pipeline/pipeline.py +340 -264
  17. rust_crate_pipeline/production_config.py +35 -32
  18. rust_crate_pipeline/scraping/__init__.py +13 -0
  19. rust_crate_pipeline/scraping/unified_scraper.py +259 -0
  20. rust_crate_pipeline/unified_llm_processor.py +637 -0
  21. rust_crate_pipeline/unified_pipeline.py +548 -0
  22. rust_crate_pipeline/utils/file_utils.py +45 -14
  23. rust_crate_pipeline/utils/logging_utils.py +34 -17
  24. rust_crate_pipeline/version.py +47 -2
  25. rust_crate_pipeline-1.3.0.dist-info/METADATA +331 -0
  26. rust_crate_pipeline-1.3.0.dist-info/RECORD +30 -0
  27. rust_crate_pipeline-1.2.6.dist-info/METADATA +0 -573
  28. rust_crate_pipeline-1.2.6.dist-info/RECORD +0 -19
  29. {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.3.0.dist-info}/WHEEL +0 -0
  30. {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.3.0.dist-info}/entry_points.txt +0 -0
  31. {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.3.0.dist-info}/licenses/LICENSE +0 -0
  32. {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.3.0.dist-info}/top_level.txt +0 -0
rust_crate_pipeline/pipeline.py
@@ -3,319 +3,395 @@ import os
  import time
  import logging
  import json
- from concurrent.futures import ThreadPoolExecutor, as_completed
- from tqdm import tqdm
- from typing import List, Dict, Optional
+ import asyncio
+ from typing import Any, Union, TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     from typing import Dict, List, Optional
+
  from .config import PipelineConfig, CrateMetadata, EnrichedCrate
  from .network import CrateAPIClient, GitHubBatchClient
  from .ai_processing import LLMEnricher
- from .analysis import SourceAnalyzer, SecurityAnalyzer, UserBehaviorAnalyzer, DependencyAnalyzer
+ from .analysis import DependencyAnalyzer
+ from .crate_analysis import CrateAnalyzer
+
+ # Import Azure OpenAI enricher
+ try:
+     from .azure_ai_processing import AzureOpenAIEnricher
+     AZURE_OPENAI_AVAILABLE = True
+ except ImportError:
+     AZURE_OPENAI_AVAILABLE = False
+     AzureOpenAIEnricher = None
+
+ # Import enhanced scraping capabilities
+ try:
+     import sys
+
+     sys.path.append(os.path.dirname(os.path.dirname(__file__)))
+     from enhanced_scraping import (
+         CrateDocumentationScraper,
+         EnhancedScrapingResult,
+     )
+
+     ENHANCED_SCRAPING_AVAILABLE = True
+ except ImportError:
+     ENHANCED_SCRAPING_AVAILABLE = False
+     CrateDocumentationScraper = None  # type: ignore[assignment,misc]
+     EnhancedScrapingResult = None  # type: ignore[assignment,misc]
+     logging.warning("Enhanced scraping not available - using basic methods")
+

  class CrateDataPipeline:
-     def __init__(self, config: PipelineConfig):
+     """Orchestrates the entire data collection, enrichment, and analysis pipeline."""
+
+     def __init__(self, config: PipelineConfig) -> None:
          self.config = config
          self.api_client = CrateAPIClient(config)
          self.github_client = GitHubBatchClient(config)
-         self.enricher = LLMEnricher(config)
-         self.crates = self.get_crate_list()
-         self.output_dir = self._create_output_dir()

+         # Initialize the appropriate AI enricher based on configuration
+         if config.use_azure_openai and AZURE_OPENAI_AVAILABLE and AzureOpenAIEnricher is not None:
+             try:
+                 self.enricher = AzureOpenAIEnricher(config)
+                 logging.info("✅ Using Azure OpenAI enricher")
+             except Exception as e:
+                 logging.warning(f"⚠️ Failed to initialize Azure OpenAI enricher: {e}")
+                 logging.info("🔄 Falling back to local LLM enricher")
+                 self.enricher = LLMEnricher(config)
+         else:
+             if config.use_azure_openai and not AZURE_OPENAI_AVAILABLE:
+                 logging.warning("⚠️ Azure OpenAI requested but not available")
+             self.enricher = LLMEnricher(config)
+             logging.info("✅ Using local LLM enricher")
+
+         # Initialize cargo analyzer
+         self.cargo_analyzer = CrateAnalyzer(".")
+
+         self.crates = self._get_crate_list()
+         self.output_dir = self._create_output_dir()
+         self.enhanced_scraper: Union[CrateDocumentationScraper, None] = (
+             self._initialize_enhanced_scraper()
+         )
+
+     def _initialize_enhanced_scraper(self) -> Union[CrateDocumentationScraper, None]:
+         """Initializes the CrateDocumentationScraper if available and enabled."""
+         if (
+             not ENHANCED_SCRAPING_AVAILABLE
+             or not self.config.enable_crawl4ai
+             or CrateDocumentationScraper is None
+         ):
+             return None
+         try:
+             scraper = CrateDocumentationScraper()
+             logging.info("✅ Enhanced scraping with Crawl4AI enabled")
+             return scraper
+         except Exception as e:
+             logging.warning(f"❌ Failed to initialize enhanced scraping: {e}")
+             return None
+
      def _create_output_dir(self) -> str:
+         """Creates a timestamped output directory for pipeline results."""
          timestamp = time.strftime("%Y%m%d-%H%M%S")
-         output_dir = f"crate_data_{timestamp}"
+         output_dir = os.path.join(self.config.output_path, f"crate_data_{timestamp}")
          os.makedirs(output_dir, exist_ok=True)
          return output_dir

-     def get_crate_list(self, limit: Optional[int] = None) -> List[str]:
-         """Return a comprehensive list of all high-value crates to process"""
-         crates = [
-             # Web frameworks and servers
-             "actix-web", "rocket", "axum", "warp", "tower", "tide", "gotham", "iron",
-             "nickel", "rouille", "thruster", "poem", "salvo", "viz", "ntex", "may-minihttp",
-             "tiny_http", "httptest", "mockito", "wiremock",
-
-             # Async runtimes and utilities
-             "tokio", "tokio-stream", "async-trait", "futures", "async-std", "smol",
-             "embassy", "embassy-executor", "embassy-time", "embassy-sync", "async-channel",
-             "async-broadcast", "async-lock", "async-once", "async-recursion", "futures-util",
-             "futures-channel", "futures-timer", "futures-test", "pin-project", "pin-project-lite",
-
-             # Serialization/deserialization
-             "serde", "serde_json", "serde_yaml", "bincode", "toml", "ron", "postcard",
-             "ciborium", "rmp-serde", "quick-xml", "roxmltree", "serde_cbor", "serde_derive",
-             "serde_repr", "serde_with", "serde_bytes", "flexbuffers", "bson", "avro-rs",
-
-             # Error handling and debugging
-             "anyhow", "thiserror", "eyre", "color-eyre", "miette", "fehler", "snafu",
-             "failure", "quick-error", "derive_more", "displaydoc", "backtrace", "better-panic",
-             # Command line and terminal
-             "clap", "structopt", "argh", "gumdrop", "docopt", "getopts", "pico-args",
-             "crossterm", "termion", "console", "indicatif", "dialoguer", "termcolor",
-             "colored", "yansi", "owo-colors", "nu-ansi-term", "terminal_size",
-             # Utilities and general purpose
-             "rand", "uuid", "itertools", "num", "cfg-if", "bytes", "mime",
-             "form_urlencoded", "csv", "once_cell", "base64", "flate2", "tar", "dirs",
-             "walkdir", "glob", "bitflags", "indexmap", "smallvec", "arrayvec", "tinyvec",
-             "ahash", "fxhash", "rustc-hash", "seahash", "siphasher", "wyhash", "xxhash-rust",
-             "getrandom", "fastrand", "nanorand", "url", "percent-encoding", "unicode-segmentation",
-             "unicode-normalization", "unicode-width", "memchr", "aho-corasick", "bstr",
-             # HTTP clients and servers
-             "reqwest", "hyper", "surf", "ureq", "attohttpc", "isahc", "curl", "libcurl-sys",
-             "http", "http-body", "httparse", "hyper-tls", "hyper-rustls", "native-tls",
-             "webpki", "webpki-roots",
-
-             # Database and storage
-             "sqlx", "diesel", "postgres", "rusqlite", "mysql", "mongodb", "redis",
-             "tokio-postgres", "deadpool-postgres", "bb8", "r2d2", "sea-orm", "rbatis",
-             "sled", "rocksdb", "lmdb", "redb", "pickledb", "persy", "heed", "fjall",
-             # Concurrency and parallelism
-             "rayon", "crossbeam", "crossbeam-channel", "crossbeam-utils", "crossbeam-epoch",
-             "crossbeam-deque", "parking_lot", "spin", "atomic", "arc-swap", "dashmap",
-             "flume", "kanal", "tokio-util", "futures-concurrency",
-             # Protocol buffers, gRPC, and messaging
-             "prost", "tonic", "protobuf", "grpcio", "tarpc", "capnp", "rmp",
-             "zmq", "nanomsg", "nats", "rdkafka", "pulsar", "lapin", "amqp", "rumqttc",
-             # Procedural macros and metaprogramming
-             "syn", "quote", "proc-macro2", "proc-macro-crate", "proc-macro-error",
-             "darling", "derive_builder", "strum", "strum_macros",
-             "enum-iterator", "num-derive", "num-traits", "paste", "lazy_static",
-
-             # Cryptography and security
-             "ring", "rustls", "openssl", "sha2", "sha3", "blake2", "blake3", "md5",
-             "hmac", "pbkdf2", "scrypt", "argon2", "bcrypt", "chacha20poly1305",
-             "aes-gcm", "rsa", "ed25519-dalek", "x25519-dalek", "curve25519-dalek",
-             "secp256k1", "k256", "p256", "ecdsa", "signature", "rand_core",
-
-             # Game development and graphics
-             "bevy", "macroquad", "ggez", "piston", "winit", "wgpu", "vulkano", "glium",
-             "three-d", "kiss3d", "nalgebra", "cgmath", "glam", "ultraviolet", "mint",
-             "image", "imageproc", "resvg", "tiny-skia", "lyon", "femtovg", "skulpin",
-             # Networking and protocols
-             "socket2", "mio", "polling", "async-io", "calloop", "quinn",
-             "rustls-pemfile", "trust-dns", "hickory-dns", "async-h1", "h2", "h3",
-             "websocket", "tokio-tungstenite", "tungstenite", "ws", "warp-ws",
-
-             # Text processing and parsing
-             "regex", "regex-syntax", "pest", "pest_derive", "nom", "combine", "winnow",
-             "lalrpop", "chumsky", "logos", "lex", "yacc", "tree-sitter", "syntect",
-             "pulldown-cmark", "comrak", "markdown", "ammonia", "scraper", "kuchiki",
-
-             # System programming and OS interfaces
-             "libc", "winapi", "windows", "nix", "users", "sysinfo", "procfs", "psutil",
-             "notify", "inotify", "hotwatch", "signal-hook", "ctrlc", "daemonize",
-             "fork", "shared_memory", "memmap2", "mlock", "caps", "uzers",
-             # Testing and development tools
-             "criterion", "proptest", "quickcheck", "rstest", "serial_test", "mockall",
-             "httpmock", "assert_cmd", "assert_fs", "predicates", "tempfile",
-             "insta", "goldenfile", "similar", "difference", "pretty_assertions",
-
-             # Configuration and environment
-             "config", "figment", "envy", "dotenv", "confy", "directories", "app_dirs",
-             "etcetera", "platform-dirs", "home", "which", "dunce", "normpath",
-
-             # Logging and observability
-             "log", "env_logger", "tracing", "tracing-subscriber", "tracing-futures",
-             "tracing-actix-web", "tracing-log", "slog", "fern", "flexi_logger",
-             "log4rs", "simplelog", "stderrlog", "pretty_env_logger", "fast_log",
-
-             # Time and date
-             "chrono", "time", "humantime", "chrono-tz", "chrono-english", "ical",
-             "cron", "tokio-cron-scheduler", "job_scheduler", "delay_timer",
-
-             # Machine Learning & AI
-             "tokenizers", "safetensors", "linfa", "ndarray", "smartcore", "burn",
-             "tract-core", "tract-onnx", "tract-hir", "tract-linalg", "tract-data",
-             "tract-nnef", "tract-onnx-opl", "tract-pulse", "tract-pulse-opl",
-             "tract-nnef-resources", "tch", "torch-sys", "ort", "ort-sys", "candle-core",
-             "candle-nn", "candle-transformers", "candle-kernels", "candle-onnx",
-             "candle-metal-kernels", "tiktoken-rs", "tensorflow", "tensorflow-sys",
-             "onnxruntime", "onnxruntime-sys", "onnx-protobuf", "llama-cpp-2",
-             "llama-cpp-sys-2", "llm", "llm-samplers", "llm-chain", "llm-chain-openai",
-             "llama-core", "llamaedge", "openai", "openai-api-rs", "openai_dive",
-             "genai", "aleph-alpha-client", "llm_api_access", "ollama-rs",
-             "rust-bert", "fastembed", "hf-hub", "whisper-rs-sys", "toktrie",
-             "toktrie_hf_tokenizers", "toktrie_hf_downloader", "rust_tokenizers",
+     def _get_crate_list(self) -> "List[str]":
+         """
+         Loads the list of crates to process from an external file.
+         This approach is more modular and easier to maintain than a hardcoded list.
+         """
+         crate_list_path = os.path.join(os.path.dirname(__file__), "crate_list.txt")
+         try:
+             with open(crate_list_path) as f:
+                 crates = [line.strip() for line in f if line.strip()]
+             logging.info(f"Loaded {len(crates)} crates from {crate_list_path}")
+             if not crates:
+                 logging.warning(f"Crate list at {crate_list_path} is empty.")
+             return crates
+         except FileNotFoundError:
+             logging.error(f"Crate list file not found at: {crate_list_path}")
+             return []
+
+     def get_crate_list(self) -> "List[str]":
+         """
+         Public method to get the list of crates.
+         Returns the already loaded crate list or loads it if not available.
+         """
+         if hasattr(self, "crates") and self.crates:
+             return self.crates
+         else:
+             return self._get_crate_list()
+
+     async def fetch_metadata_batch(self, crate_names: "List[str]") -> "List[CrateMetadata]":
+         """
+         Fetches metadata for a batch of crates using asyncio-based parallel processing.
+         """
+
+         async def fetch_single_crate_safe(
+             crate_name: str,
+         ) -> Union[CrateMetadata, None]:
+             try:
+                 loop = asyncio.get_running_loop()
+                 data = await loop.run_in_executor(
+                     None, self.api_client.fetch_crate_metadata, crate_name
+                 )
+                 if not data:
+                     return None
+
+                 return CrateMetadata(
+                     name=data.get("name", ""),
+                     version=data.get("version", ""),
+                     description=data.get("description", ""),
+                     repository=data.get("repository", ""),
+                     keywords=data.get("keywords", []),
+                     categories=data.get("categories", []),
+                     readme=data.get("readme", ""),
+                     downloads=data.get("downloads", 0),
+                     github_stars=data.get("github_stars", 0),
+                     dependencies=data.get("dependencies", []),
+                     features=data.get("features", {}),
+                     code_snippets=data.get("code_snippets", []),
+                     readme_sections=data.get("readme_sections", {}),
+                     librs_downloads=data.get("librs_downloads"),
+                     source=data.get("source", "crates.io"),
+                 )
+
+             except Exception as e:
+                 logging.error(f"Error fetching metadata for {crate_name}: {e}")
+                 return None
+
+         tasks = [fetch_single_crate_safe(name) for name in crate_names]
+         results_raw = await asyncio.gather(*tasks)
+         results = [r for r in results_raw if r]
+         logging.info(
+             f"Fetched metadata for {len(results)} out of "
+             f"{len(crate_names)} requested crates."
+         )
+         return results
+
+     async def enrich_batch(self, batch: "List[CrateMetadata]") -> "List[EnrichedCrate]":
+         """Enriches a batch of crates with GitHub stats, enhanced scraping, and AI."""
+         # Update GitHub stats
+         github_repos = [
+             c.repository for c in batch if c.repository and "github.com" in c.repository
          ]
-
-         if limit is not None:
-             return crates[:limit]
-         return crates
-
-     def fetch_metadata_batch(self, crate_names: List[str]) -> List[CrateMetadata]:
-         """Fetch metadata for a batch of crates in parallel"""
-         with ThreadPoolExecutor(max_workers=self.config.n_workers) as executor:
-             futures = {executor.submit(self.api_client.fetch_crate_metadata, name): name
-                        for name in crate_names}
+         if github_repos:
+             repo_stats = self.github_client.batch_get_repo_stats(github_repos)
+             for crate in batch:
+                 if crate.repository in repo_stats:
+                     stats = repo_stats[crate.repository]
+                     crate.github_stars = stats.get("stargazers_count", 0)
+
+         # Asynchronously enhance with scraping and AI
+         enrichment_tasks = [self._enrich_single_crate(crate) for crate in batch]
+         enriched_results = await asyncio.gather(*enrichment_tasks)
+         return [result for result in enriched_results if result]
+
+     async def _enrich_single_crate(self, crate: CrateMetadata) -> Union[EnrichedCrate, None]:
+         """Helper to enrich a single crate with scraping, AI analysis, and cargo analysis."""
+         try:
+             # Enhanced scraping if available
+             if self.enhanced_scraper:
+                 await self._enhance_with_scraping(crate)
+
+             # Now enrich with AI
+             enriched = self.enricher.enrich_crate(crate)

-             results = []
-             for future in as_completed(futures):
-                 crate_name = futures[future]
-                 try:
-                     data = future.result()
-                     if data:
-                         # Convert dict to CrateMetadata
-                         crate_metadata = CrateMetadata(
-                             name=data.get("name", ""),
-                             version=data.get("version", ""),
-                             description=data.get("description", ""),
-                             repository=data.get("repository", ""),
-                             keywords=data.get("keywords", []),
-                             categories=data.get("categories", []),
-                             readme=data.get("readme", ""),
-                             downloads=data.get("downloads", 0),
-                             github_stars=data.get("github_stars", 0),
-                             dependencies=data.get("dependencies", []),
-                             features=data.get("features", []),
-                             code_snippets=data.get("code_snippets", []),
-                             readme_sections=data.get("readme_sections", {}),
-                             librs_downloads=data.get("librs_downloads"),
-                             source=data.get("source", "crates.io")
-                         )
-                         results.append(crate_metadata)
-                         logging.info(f"Fetched metadata for {crate_name}")
-                 except Exception as e:
-                     logging.error(f"Failed to fetch metadata for {crate_name}: {str(e)}")
+             # Add cargo analysis if we have a local crate directory
+             # Note: This would require downloading/cloning the crate first
+             # For now, we'll add a placeholder for cargo analysis
+             enriched.source_analysis = {
+                 "cargo_analysis_available": False,
+                 "note": "Cargo analysis requires local crate source code"
+             }

-             return results
+             logging.info(f"Enriched {crate.name}")
+             return enriched
+         except Exception as e:
+             logging.error(f"Failed to enrich {crate.name}: {e}")
+             # Return a partially enriched crate to avoid data loss
+             enriched_dict = crate.to_dict()
+             return EnrichedCrate(**enriched_dict)

-     def enrich_batch(self, batch: List[CrateMetadata]) -> List[EnrichedCrate]:
-         """Enrich a batch of crates with GitHub stats and AI"""
-         # Add GitHub stats first
-         github_repos = [c.repository for c in batch if "github.com" in c.repository]
-         repo_stats = self.github_client.batch_get_repo_stats(github_repos)
-
-         # Update crates with GitHub info
-         for crate in batch:
-             repo_url = crate.repository
-             if repo_url in repo_stats:
-                 stats = repo_stats[repo_url]
-                 crate.github_stars = stats.get("stargazers_count", 0)
-
-         # Now enrich with AI
-         enriched_batch = []
-         for crate in batch:
-             try:
-                 enriched = self.enricher.enrich_crate(crate)
-                 enriched_batch.append(enriched)
-                 logging.info(f"Enriched {crate.name}")
-             except Exception as e:
-                 logging.error(f"Failed to enrich {crate.name}: {str(e)}")
-                 # Add the crate with just the fields we have
-                 enriched_dict = crate.__dict__.copy()
-                 enriched_batch.append(EnrichedCrate(**enriched_dict))
-
-         return enriched_batch
-
-     def analyze_dependencies(self, crates: List[EnrichedCrate]) -> Dict:
-         """Analyze dependencies between crates"""
+     async def _enhance_with_scraping(self, crate: CrateMetadata) -> None:
+         """
+         Enhances a single crate with advanced web scraping data.
+         Modifies the crate object in place.
+         """
+         if not self.enhanced_scraper:
+             return
+
+         try:
+             scraping_results = await self.enhanced_scraper.scrape_crate_info(crate.name)
+             if scraping_results:
+                 self._integrate_scraping_results(crate, scraping_results)
+                 logging.info(
+                     f"Enhanced scraping for {crate.name}: "
+                     f"{len(scraping_results)} sources"
+                 )
+         except Exception as e:
+             logging.warning(f"Enhanced scraping failed for {crate.name}: {e}")
+
+     def _integrate_scraping_results(
+         self,
+         crate: CrateMetadata,
+         scraping_results: "Dict[str, EnhancedScrapingResult]",
+     ) -> None:
+         """
+         Integrates enhanced scraping results into the crate metadata.
+         Modifies the crate object in place.
+         """
+         crate.enhanced_scraping = {}
+
+         for source, result in scraping_results.items():
+             if not result or result.error:
+                 continue
+
+             crate.enhanced_scraping[source] = {
+                 "title": result.title,
+                 "quality_score": result.quality_score,
+                 "extraction_method": result.extraction_method,
+                 "structured_data": result.structured_data,
+                 "content_length": len(result.content),
+             }
+             # Update README if we got better content
+             if source == "docs_rs" and result.quality_score > 0.7:
+                 if not crate.readme or len(result.content) > len(crate.readme):
+                     crate.readme = result.content
+                     logging.info(f"Updated README for {crate.name} from {source}")
+
+             # Extract additional metadata from structured data
+             structured_data = result.structured_data or {}
+             if "features" in structured_data and isinstance(
+                 structured_data["features"], list
+             ):
+                 crate.enhanced_features = structured_data["features"]
+             if "dependencies" in structured_data and isinstance(
+                 structured_data["dependencies"], list
+             ):
+                 crate.enhanced_dependencies = structured_data["dependencies"]
+             if "examples" in structured_data and isinstance(
+                 structured_data["examples"], list
+             ):
+                 crate.code_snippets.extend(structured_data["examples"])
+
+     def analyze_dependencies(self, crates: "List[EnrichedCrate]") -> "Dict[str, Any]":
+         """Analyze dependencies between crates."""
          return DependencyAnalyzer.analyze_dependencies(crates)

-     def save_checkpoint(self, data: List[EnrichedCrate], prefix: str):
-         """Save processing checkpoint with status metadata"""
+     def save_checkpoint(self, data: "List[EnrichedCrate]", prefix: str) -> str:
+         """Saves a processing checkpoint to a file."""
          timestamp = time.strftime("%Y%m%d-%H%M%S")
          filename = os.path.join(self.output_dir, f"{prefix}_{timestamp}.jsonl")
-
+
          with open(filename, "w") as f:
              for item in data:
-                 # Convert to dict for serialization
-                 item_dict = item.__dict__.copy()
-                 f.write(json.dumps(item_dict) + "\n")
-
-         # Save status metadata
-         status = {
-             "timestamp": timestamp,
-             "total_crates": len(data),
-             "processed_crates": sum(1 for c in data if c.use_case is not None),
-             "advanced_analysis": sum(1 for c in data if c.source_analysis is not None),
-             "checkpoint_file": filename
-         }
-
-         status_file = os.path.join(self.output_dir, f"{prefix}_status_{timestamp}.json")
-         with open(status_file, "w") as f:
-             json.dump(status, f, indent=2)
-
+                 f.write(json.dumps(item.to_dict()) + "\n")
+
          logging.info(f"Saved checkpoint to {filename}")
          return filename

-     def save_final_output(self, data: List[EnrichedCrate], dependency_data: Dict):
-         """Save final enriched data and analysis"""
+     def save_final_output(
+         self, data: "List[EnrichedCrate]", dependency_data: "Dict[str, Any]"
+     ) -> None:
+         """Saves the final enriched data and analysis reports."""
          timestamp = time.strftime("%Y%m%d-%H%M%S")
-
+
          # Save main enriched data
-         final_output = os.path.join(self.output_dir, f"enriched_crate_metadata_{timestamp}.jsonl")
-         with open(final_output, "w") as f:
+         final_output_path = os.path.join(
+             self.output_dir, f"enriched_crate_metadata_{timestamp}.jsonl"
+         )
+         with open(final_output_path, "w") as f:
              for item in data:
-                 item_dict = item.__dict__.copy()
-                 f.write(json.dumps(item_dict) + "\n")
-
+                 f.write(json.dumps(item.to_dict()) + "\n")
+
          # Save dependency analysis
-         dep_file = os.path.join(self.output_dir, f"dependency_analysis_{timestamp}.json")
-         with open(dep_file, "w") as f:
+         dep_file_path = os.path.join(
+             self.output_dir, f"dependency_analysis_{timestamp}.json"
+         )
+         with open(dep_file_path, "w") as f:
              json.dump(dependency_data, f, indent=2)

-         # Generate summary report
+         # Generate and save summary report
+         self._generate_summary_report(data, dependency_data, timestamp)
+
+         logging.info(f"Results saved to {self.output_dir}/")
+
+     def _generate_summary_report(
+         self,
+         data: "List[EnrichedCrate]",
+         dependency_data: "Dict[str, Any]",
+         timestamp: str,
+     ) -> None:
+         """Generates a summary report of the pipeline run."""
          summary = {
              "total_crates": len(data),
              "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
-             "most_popular": sorted([{
-                 "name": c.name,
-                 "score": c.score or 0,
-                 "downloads": c.downloads,
-                 "github_stars": c.github_stars
-             } for c in data], key=lambda x: x["score"], reverse=True)[:5],
-             "most_depended_upon": dependency_data.get("most_depended", [])[:5]
+             "most_popular": sorted(
+                 [
+                     {
+                         "name": c.name,
+                         "score": c.score or 0,
+                         "downloads": c.downloads,
+                         "github_stars": c.github_stars,
+                     }
+                     for c in data
+                 ],
+                 key=lambda x: x.get("score", 0),
+                 reverse=True,
+             )[:10],
+             "most_depended_upon": dependency_data.get("most_depended", [])[:10],
          }
-
-         summary_file = os.path.join(self.output_dir, f"summary_report_{timestamp}.json")
-         with open(summary_file, "w") as f:
-             json.dump(summary, f, indent=2)

-         logging.info(f"Results saved to {self.output_dir}/")
+         summary_path = os.path.join(self.output_dir, f"summary_report_{timestamp}.json")
+         with open(summary_path, "w") as f:
+             json.dump(summary, f, indent=2)

-     def run(self):
-         """Main pipeline execution flow"""
+     async def run(self) -> Union["tuple[List[EnrichedCrate], Dict[str, Any]]", None]:
+         """Main pipeline execution flow."""
          start_time = time.time()
+         if not self.crates:
+             logging.error("No crates to process. Exiting.")
+             return None
+
          logging.info(f"Processing {len(self.crates)} crates...")
-
-         # Process in batches
-         all_enriched = []
-         crate_batches = [self.crates[i:i+self.config.batch_size]
-                          for i in range(0, len(self.crates), self.config.batch_size)]

-         for batch_num, batch in enumerate(crate_batches):
-             logging.info(f"Processing batch {batch_num+1}/{len(crate_batches)} ({len(batch)} crates)")
-
+         all_enriched: "List[EnrichedCrate]" = []
+         batch_size = self.config.batch_size
+         crate_batches = [
+             self.crates[i : i + batch_size]
+             for i in range(0, len(self.crates), batch_size)
+         ]
+
+         for i, batch_names in enumerate(crate_batches):
+             logging.info(
+                 f"Processing batch {i + 1}/{len(crate_batches)} "
+                 f"({len(batch_names)} crates)"
+             )
+
              # Fetch metadata
-             batch_data = self.fetch_metadata_batch(batch)
-
+             metadata_batch = await self.fetch_metadata_batch(batch_names)
+             if not metadata_batch:
+                 logging.warning(f"Batch {i+1} yielded no metadata. Skipping.")
+                 continue
+
              # Enrich the batch
-             enriched_batch = self.enrich_batch(batch_data)
+             enriched_batch = await self.enrich_batch(metadata_batch)
              all_enriched.extend(enriched_batch)
-
-             # Save checkpoint after each batch
-             self.save_checkpoint(all_enriched, "batch_checkpoint")
-             logging.info(f"Completed batch {batch_num+1}, processed {len(all_enriched)}/{len(self.crates)} crates so far")
-
-             # Optional: Add source analysis for some crates
-             if batch_num < 2:  # Only do detailed analysis for first 2 batches
-                 for crate in enriched_batch:
-                     try:
-                         crate.source_analysis = SourceAnalyzer.analyze_crate_source(crate)
-                         crate.security = SecurityAnalyzer.check_security_metrics(crate)
-                         crate.user_behavior = UserBehaviorAnalyzer.fetch_user_behavior_data(crate)
-                         logging.info(f"Advanced analysis completed for {crate.name}")
-                     except Exception as e:
-                         logging.warning(f"Advanced analysis failed for {crate.name}: {str(e)}")
-
-         # Step 3: Perform dependency analysis
+
+             # Save checkpoint
+             self.save_checkpoint(all_enriched, f"checkpoint_batch_{i + 1}")
+             logging.info(
+                 f"Completed batch {i + 1}, "
+                 f"processed {len(all_enriched)}/{len(self.crates)} crates"
+             )
+
+         # Final analysis and saving
          logging.info("Analyzing crate dependencies...")
          dependency_analysis = self.analyze_dependencies(all_enriched)
-
-         # Save final results
          self.save_final_output(all_enriched, dependency_analysis)

-         # Final summary
          duration = time.time() - start_time
          logging.info(f"✅ Done. Enriched {len(all_enriched)} crates in {duration:.2f}s")
-
          return all_enriched, dependency_analysis
  return all_enriched, dependency_analysis