rust-crate-pipeline 1.4.0-py3-none-any.whl → 1.4.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. rust_crate_pipeline/__init__.py +18 -27
  2. rust_crate_pipeline/__main__.py +1 -0
  3. rust_crate_pipeline/ai_processing.py +718 -596
  4. rust_crate_pipeline/analysis.py +330 -363
  5. rust_crate_pipeline/azure_ai_processing.py +462 -0
  6. rust_crate_pipeline/config.py +46 -28
  7. rust_crate_pipeline/core/__init__.py +19 -0
  8. rust_crate_pipeline/core/canon_registry.py +133 -0
  9. rust_crate_pipeline/core/irl_engine.py +256 -0
  10. rust_crate_pipeline/core/sacred_chain.py +117 -0
  11. rust_crate_pipeline/crate_analysis.py +54 -0
  12. rust_crate_pipeline/crate_list.txt +424 -0
  13. rust_crate_pipeline/github_token_checker.py +108 -112
  14. rust_crate_pipeline/main.py +329 -109
  15. rust_crate_pipeline/network.py +317 -308
  16. rust_crate_pipeline/pipeline.py +300 -375
  17. rust_crate_pipeline/production_config.py +24 -27
  18. rust_crate_pipeline/progress_monitor.py +334 -0
  19. rust_crate_pipeline/scraping/__init__.py +13 -0
  20. rust_crate_pipeline/scraping/unified_scraper.py +259 -0
  21. rust_crate_pipeline/unified_llm_processor.py +637 -0
  22. rust_crate_pipeline/unified_pipeline.py +548 -0
  23. rust_crate_pipeline/utils/file_utils.py +32 -5
  24. rust_crate_pipeline/utils/logging_utils.py +21 -16
  25. rust_crate_pipeline/version.py +76 -47
  26. rust_crate_pipeline-1.4.1.dist-info/METADATA +515 -0
  27. rust_crate_pipeline-1.4.1.dist-info/RECORD +31 -0
  28. rust_crate_pipeline-1.4.0.dist-info/METADATA +0 -585
  29. rust_crate_pipeline-1.4.0.dist-info/RECORD +0 -19
  30. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/WHEEL +0 -0
  31. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/entry_points.txt +0 -0
  32. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/licenses/LICENSE +0 -0
  33. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/top_level.txt +0 -0
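
The new rust_crate_pipeline/crate_list.txt replaces the hardcoded crate list that pipeline.py previously carried (see the removed get_crate_list body in the diff below); the new _get_crate_list loader simply reads one crate name per non-empty line. A minimal sketch of that parsing and of the expected file layout, assuming only what the diff shows (the example names are illustrative, not the shipped 424-entry list):

    # Sketch of the loader logic added in 1.4.1; mirrors _get_crate_list in the diff below.
    import os

    crate_list_path = os.path.join(os.path.dirname(__file__), "crate_list.txt")
    with open(crate_list_path) as f:
        crates = [line.strip() for line in f if line.strip()]

    # crate_list.txt is expected to hold one crate name per line, e.g.:
    #   tokio
    #   serde
    #   clap
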
rust_crate_pipeline/pipeline.py
@@ -4,462 +4,387 @@ import time
4
4
  import logging
5
5
  import json
6
6
  import asyncio
7
- from typing import List, Dict, Optional
7
+ from typing import Any, Union, TYPE_CHECKING
8
+
9
+ if TYPE_CHECKING:
10
+ from typing import Dict, List, Optional
11
+
8
12
  from .config import PipelineConfig, CrateMetadata, EnrichedCrate
9
13
  from .network import CrateAPIClient, GitHubBatchClient
10
14
  from .ai_processing import LLMEnricher
11
- from .analysis import SourceAnalyzer, SecurityAnalyzer, UserBehaviorAnalyzer, DependencyAnalyzer
15
+ from .analysis import DependencyAnalyzer
16
+ from .crate_analysis import CrateAnalyzer
17
+
18
+ # Import Azure OpenAI enricher
19
+ try:
20
+ from .azure_ai_processing import AzureOpenAIEnricher
21
+ AZURE_OPENAI_AVAILABLE = True
22
+ except ImportError:
23
+ AZURE_OPENAI_AVAILABLE = False
24
+ AzureOpenAIEnricher = None
12
25
 
13
26
  # Import enhanced scraping capabilities
14
27
  try:
15
- import sys
16
- sys.path.append(os.path.dirname(os.path.dirname(__file__)))
17
- from enhanced_scraping import CrateDocumentationScraper, EnhancedScrapingResult
18
- enhanced_scraping_available = True
28
+ from .scraping.unified_scraper import UnifiedScraper, ScrapingResult
29
+ ENHANCED_SCRAPING_AVAILABLE = True
19
30
  except ImportError:
20
- enhanced_scraping_available = False
21
- CrateDocumentationScraper = None
22
- EnhancedScrapingResult = None
31
+ ENHANCED_SCRAPING_AVAILABLE = False
32
+ UnifiedScraper = None # type: ignore[assignment,misc]
33
+ ScrapingResult = None # type: ignore[assignment,misc]
23
34
  logging.warning("Enhanced scraping not available - using basic methods")
24
35
 
25
36
 
26
37
  class CrateDataPipeline:
27
- def __init__(self, config: PipelineConfig):
38
+ """Orchestrates the entire data collection, enrichment, and analysis pipeline."""
39
+
40
+ def __init__(self, config: PipelineConfig) -> None:
28
41
  self.config = config
29
42
  self.api_client = CrateAPIClient(config)
30
43
  self.github_client = GitHubBatchClient(config)
31
- self.enricher = LLMEnricher(config)
32
- self.crates = self.get_crate_list()
33
- self.output_dir = self._create_output_dir() # Initialize enhanced scraping if available
34
- self.enhanced_scraper = None
35
- if enhanced_scraping_available and CrateDocumentationScraper is not None and hasattr(config, 'enable_crawl4ai'):
36
- try:
37
- self.enhanced_scraper = CrateDocumentationScraper(
38
- enable_crawl4ai=config.enable_crawl4ai)
39
- logging.info("✅ Enhanced scraping with Crawl4AI enabled")
40
- except Exception as e:
41
- logging.warning(
42
- f"❌ Failed to initialize enhanced scraping: {e}")
43
- elif enhanced_scraping_available and CrateDocumentationScraper is not None:
44
+
45
+ # Initialize the appropriate AI enricher based on configuration
46
+ if config.use_azure_openai and AZURE_OPENAI_AVAILABLE and AzureOpenAIEnricher is not None:
44
47
  try:
45
- self.enhanced_scraper = CrateDocumentationScraper(
46
- enable_crawl4ai=True)
47
- logging.info(
48
- "✅ Enhanced scraping with Crawl4AI enabled (default)")
48
+ self.enricher = AzureOpenAIEnricher(config)
49
+ logging.info("[OK] Using Azure OpenAI enricher")
49
50
  except Exception as e:
50
- logging.warning(
51
- f" Failed to initialize enhanced scraping: {e}")
51
+ logging.warning(f"[WARN] Failed to initialize Azure OpenAI enricher: {e}")
52
+ logging.info("[INFO] Falling back to local LLM enricher")
53
+ self.enricher = LLMEnricher(config)
54
+ else:
55
+ if config.use_azure_openai and not AZURE_OPENAI_AVAILABLE:
56
+ logging.warning("[WARN] Azure OpenAI requested but not available")
57
+ self.enricher = LLMEnricher(config)
58
+ logging.info("[OK] Using local LLM enricher")
59
+
60
+ # Initialize cargo analyzer
61
+ self.cargo_analyzer = CrateAnalyzer(".")
62
+
63
+ self.crates = self._get_crate_list()
64
+ self.output_dir = self._create_output_dir()
65
+ self.enhanced_scraper: Any = (
66
+ self._initialize_enhanced_scraper()
67
+ )
68
+
69
+ def _initialize_enhanced_scraper(self) -> Any:
70
+ """Initializes the CrateDocumentationScraper if available and enabled."""
71
+ if (
72
+ not ENHANCED_SCRAPING_AVAILABLE
73
+ or not self.config.enable_crawl4ai
74
+ or UnifiedScraper is None
75
+ ):
76
+ return None
77
+ try:
78
+ scraper = UnifiedScraper()
79
+ logging.info("[OK] Enhanced scraping with Crawl4AI enabled")
80
+ return scraper
81
+ except Exception as e:
82
+ logging.warning(f"[ERROR] Failed to initialize enhanced scraping: {e}")
83
+ return None
52
84
 
53
85
  def _create_output_dir(self) -> str:
86
+ """Creates a timestamped output directory for pipeline results."""
54
87
  timestamp = time.strftime("%Y%m%d-%H%M%S")
55
- output_dir = f"crate_data_{timestamp}"
88
+ output_dir = os.path.join(self.config.output_path, f"crate_data_{timestamp}")
56
89
  os.makedirs(output_dir, exist_ok=True)
57
90
  return output_dir
58
91
 
59
- def get_crate_list(self, limit: Optional[int] = None) -> List[str]:
60
- """Return a comprehensive list of all high-value crates to process"""
61
- crates = [
62
- # Web frameworks and servers
63
- "actix-web", "rocket", "axum", "warp", "tower", "tide", "gotham", "iron",
64
- "nickel", "rouille", "thruster", "poem", "salvo", "viz", "ntex", "may-minihttp",
65
- "tiny_http", "httptest", "mockito", "wiremock",
66
-
67
- # Async runtimes and utilities
68
- "tokio", "tokio-stream", "async-trait", "futures", "async-std", "smol",
69
- "embassy", "embassy-executor", "embassy-time", "embassy-sync", "async-channel",
70
- "async-broadcast", "async-lock", "async-once", "async-recursion", "futures-util",
71
- "futures-channel", "futures-timer", "futures-test", "pin-project", "pin-project-lite",
72
-
73
- # Serialization/deserialization
74
- "serde", "serde_json", "serde_yaml", "bincode", "toml", "ron", "postcard",
75
- "ciborium", "rmp-serde", "quick-xml", "roxmltree", "serde_cbor", "serde_derive",
76
- "serde_repr", "serde_with", "serde_bytes", "flexbuffers", "bson", "avro-rs",
77
-
78
- # Error handling and debugging
79
- "anyhow", "thiserror", "eyre", "color-eyre", "miette", "fehler", "snafu",
80
- "failure", "quick-error", "derive_more", "displaydoc", "backtrace", "better-panic",
81
- # Command line and terminal
82
- "clap", "structopt", "argh", "gumdrop", "docopt", "getopts", "pico-args",
83
- "crossterm", "termion", "console", "indicati", "dialoguer", "termcolor",
84
- "colored", "yansi", "owo-colors", "nu-ansi-term", "terminal_size",
85
- # Utilities and general purpose
86
- "rand", "uuid", "itertools", "num", "cfg-i", "bytes", "mime",
87
- "form_urlencoded", "csv", "once_cell", "base64", "flate2", "tar", "dirs",
88
- "walkdir", "glob", "bitflags", "indexmap", "smallvec", "arrayvec", "tinyvec",
89
- "ahash", "fxhash", "rustc-hash", "seahash", "siphasher", "wyhash", "xxhash-rust",
90
- "getrandom", "fastrand", "nanorand", "url", "percent-encoding", "unicode-segmentation",
91
- "unicode-normalization", "unicode-width", "memchr", "aho-corasick", "bstr",
92
- # HTTP clients and servers
93
- "reqwest", "hyper", "sur", "ureq", "attohttpc", "isahc", "curl", "libcurl-sys",
94
- "http", "http-body", "httparse", "hyper-tls", "hyper-rustls", "native-tls",
95
- "webpki", "webpki-roots",
96
-
97
- # Database and storage
98
- "sqlx", "diesel", "postgres", "rusqlite", "mysql", "mongodb", "redis",
99
- "tokio-postgres", "deadpool-postgres", "bb8", "r2d2", "sea-orm", "rbatis",
100
- "sled", "rocksdb", "lmdb", "redb", "pickledb", "persy", "heed", "fjall",
101
- # Concurrency and parallelism
102
- "rayon", "crossbeam", "crossbeam-channel", "crossbeam-utils", "crossbeam-epoch",
103
- "crossbeam-deque", "parking_lot", "spin", "atomic", "arc-swap", "dashmap",
104
- "flume", "kanal", "tokio-util", "futures-concurrency",
105
- # Protocol buffers, gRPC, and messaging
106
- "prost", "tonic", "protobu", "grpcio", "tarpc", "capnp", "rmp",
107
- "zmq", "nanomsg", "nats", "rdkafka", "pulsar", "lapin", "amqp", "rumqttc",
108
- # Procedural macros and metaprogramming
109
- "syn", "quote", "proc-macro2", "proc-macro-crate", "proc-macro-error",
110
- "darling", "derive_builder", "strum", "strum_macros",
111
- "enum-iterator", "num-derive", "num-traits", "paste", "lazy_static",
112
-
113
- # Cryptography and security
114
- "ring", "rustls", "openssl", "sha2", "sha3", "blake2", "blake3", "md5",
115
- "hmac", "pbkdf2", "scrypt", "argon2", "bcrypt", "chacha20poly1305",
116
- "aes-gcm", "rsa", "ed25519-dalek", "x25519-dalek", "curve25519-dalek",
117
- "secp256k1", "k256", "p256", "ecdsa", "signature", "rand_core",
118
-
119
- # Game development and graphics
120
- "bevy", "macroquad", "ggez", "piston", "winit", "wgpu", "vulkano", "glium",
121
- "three-d", "kiss3d", "nalgebra", "cgmath", "glam", "ultraviolet", "mint",
122
- "image", "imageproc", "resvg", "tiny-skia", "lyon", "femtovg", "skulpin",
123
- # Networking and protocols
124
- "socket2", "mio", "polling", "async-io", "calloop", "quinn",
125
- "rustls-pemfile", "trust-dns", "hickory-dns", "async-h1", "h2", "h3",
126
- "websocket", "tokio-tungstenite", "tungstenite", "ws", "warp-ws",
127
-
128
- # Text processing and parsing
129
- "regex", "regex-syntax", "pest", "pest_derive", "nom", "combine", "winnow",
130
- "lalrpop", "chumsky", "logos", "lex", "yacc", "tree-sitter", "syntect",
131
- "pulldown-cmark", "comrak", "markdown", "ammonia", "scraper", "kuchiki",
132
-
133
- # System programming and OS interfaces
134
- "libc", "winapi", "windows", "nix", "users", "sysinfo", "procfs", "psutil",
135
- "notify", "inotify", "hotwatch", "signal-hook", "ctrlc", "daemonize",
136
- "fork", "shared_memory", "memmap2", "mlock", "caps", "uzers",
137
- # Testing and development tools
138
- "criterion", "proptest", "quickcheck", "rstest", "serial_test", "mockall",
139
- "httpmock", "assert_cmd", "assert_fs", "predicates", "tempfile",
140
- "insta", "goldenfile", "similar", "difference", "pretty_assertions",
141
-
142
- # Configuration and environment
143
- "config", "figment", "envy", "dotenv", "confy", "directories", "app_dirs",
144
- "etcetera", "platform-dirs", "home", "which", "dunce", "normpath",
145
-
146
- # Logging and observability
147
- "log", "env_logger", "tracing", "tracing-subscriber", "tracing-futures",
148
- "tracing-actix-web", "tracing-log", "slog", "fern", "flexi_logger",
149
- "log4rs", "simplelog", "stderrlog", "pretty_env_logger", "fast_log",
150
-
151
- # Time and date
152
- "chrono", "time", "humantime", "chrono-tz", "chrono-english", "ical",
153
- "cron", "tokio-cron-scheduler", "job_scheduler", "delay_timer",
154
-
155
- # Machine Learning & AI
156
- "tokenizers", "safetensors", "linfa", "ndarray", "smartcore", "burn",
157
- "tract-core", "tract-onnx", "tract-hir", "tract-linalg", "tract-data",
158
- "tract-nne", "tract-onnx-opl", "tract-pulse", "tract-pulse-opl",
159
- "tract-nnef-resources", "tch", "torch-sys", "ort", "ort-sys", "candle-core",
160
- "candle-nn", "candle-transformers", "candle-kernels", "candle-onnx",
161
- "candle-metal-kernels", "tiktoken-rs", "tensorflow", "tensorflow-sys",
162
- "onnxruntime", "onnxruntime-sys", "onnx-protobu", "llama-cpp-2",
163
- "llama-cpp-sys-2", "llm", "llm-samplers", "llm-chain", "llm-chain-openai", "llama-core", "llamaedge", "openai", "openai-api-rs", "openai_dive",
164
- "genai", "aleph-alpha-client", "llm_api_access", "ollama-rs",
165
- "rust-bert", "fastembed", "hf-hub", "whisper-rs-sys", "toktrie",
166
- "toktrie_hf_tokenizers", "toktrie_hf_downloader", "rust_tokenizers",
167
- ]
168
-
169
- if limit is not None:
170
- return crates[:limit]
171
- return crates
172
-
173
- async def fetch_metadata_batch(
174
- self,
175
- crate_names: List[str]) -> List[CrateMetadata]:
176
- """Fetch metadata for a batch of crates using asyncio-based parallel processing
92
+ def _get_crate_list(self) -> "List[str]":
93
+ """
94
+ Loads the list of crates to process from an external file.
95
+ This approach is more modular and easier to maintain than a hardcoded list.
96
+ """
97
+ crate_list_path = os.path.join(os.path.dirname(__file__), "crate_list.txt")
98
+ try:
99
+ with open(crate_list_path) as f:
100
+ crates = [line.strip() for line in f if line.strip()]
101
+ logging.info(f"Loaded {len(crates)} crates from {crate_list_path}")
102
+ if not crates:
103
+ logging.warning(f"Crate list at {crate_list_path} is empty.")
104
+ return crates
105
+ except FileNotFoundError:
106
+ logging.error(f"Crate list file not found at: {crate_list_path}")
107
+ return []
108
+
109
+ def get_crate_list(self) -> "List[str]":
110
+ """
111
+ Public method to get the list of crates.
112
+ Returns the already loaded crate list or loads it if not available.
113
+ """
114
+ if hasattr(self, "crates") and self.crates:
115
+ return self.crates
116
+ else:
117
+ return self._get_crate_list()
177
118
 
178
- Each coroutine processes completely independent crate data, ensuring safety.
179
- No shared state is modified - each coroutine only reads from self.api_client and
180
- returns independent results.
119
+ async def fetch_metadata_batch(self, crate_names: "List[str]") -> "List[CrateMetadata]":
120
+ """
121
+ Fetches metadata for a batch of crates using asyncio-based parallel processing.
181
122
  """
182
- results = []
183
123
 
184
- async def fetch_single_crate_safe(crate_name: str) -> Optional[CrateMetadata]:
124
+ async def fetch_single_crate_safe(
125
+ crate_name: str,
126
+ ) -> Union[CrateMetadata, None]:
185
127
  try:
186
- # If api_client has an async method, use it; otherwise, run in executor
187
- if hasattr(self.api_client, 'fetch_crate_metadata_async'):
188
- data = await self.api_client.fetch_crate_metadata_async(crate_name)
189
- else:
190
- loop = asyncio.get_running_loop()
191
- data = await loop.run_in_executor(None, self.api_client.fetch_crate_metadata, crate_name)
192
- if data:
193
- return CrateMetadata(
194
- name=data.get("name", ""),
195
- version=data.get("version", ""),
196
- description=data.get("description", ""),
197
- repository=data.get("repository", ""),
198
- keywords=data.get("keywords", []),
199
- categories=data.get("categories", []),
200
- readme=data.get("readme", ""),
201
- downloads=data.get("downloads", 0),
202
- github_stars=data.get("github_stars", 0),
203
- dependencies=data.get("dependencies", []),
204
- features=data.get("features", []),
205
- code_snippets=data.get("code_snippets", []),
206
- readme_sections=data.get("readme_sections", {}),
207
- librs_downloads=data.get("librs_downloads"),
208
- source=data.get("source", "crates.io")
209
- )
210
- return None
128
+ loop = asyncio.get_running_loop()
129
+ data = await loop.run_in_executor(
130
+ None, self.api_client.fetch_crate_metadata, crate_name
131
+ )
132
+ if not data:
133
+ return None
134
+
135
+ return CrateMetadata(
136
+ name=data.get("name", ""),
137
+ version=data.get("version", ""),
138
+ description=data.get("description", ""),
139
+ repository=data.get("repository", ""),
140
+ keywords=data.get("keywords", []),
141
+ categories=data.get("categories", []),
142
+ readme=data.get("readme", ""),
143
+ downloads=data.get("downloads", 0),
144
+ github_stars=data.get("github_stars", 0),
145
+ dependencies=data.get("dependencies", []),
146
+ features=data.get("features", {}),
147
+ code_snippets=data.get("code_snippets", []),
148
+ readme_sections=data.get("readme_sections", {}),
149
+ librs_downloads=data.get("librs_downloads"),
150
+ source=data.get("source", "crates.io"),
151
+ )
152
+
211
153
  except Exception as e:
212
- logging.error(f"Error fetching {crate_name}: {e}")
154
+ logging.error(f"Error fetching metadata for {crate_name}: {e}")
213
155
  return None
214
156
 
215
- # Use asyncio.gather for parallel async processing
216
157
  tasks = [fetch_single_crate_safe(name) for name in crate_names]
217
158
  results_raw = await asyncio.gather(*tasks)
218
- results = [r for r in results_raw if r is not None]
219
- for crate in results:
220
- logging.info(f"Fetched metadata for {crate.name}")
159
+ results = [r for r in results_raw if r]
160
+ logging.info(
161
+ f"Fetched metadata for {len(results)} out of "
162
+ f"{len(crate_names)} requested crates."
163
+ )
221
164
  return results
222
165
 
223
- # Remove the async methods that are no longer needed
224
- # async def _fetch_single_crate_async(self, crate_name: str) ->
225
- # Optional[Dict]:
226
-
227
- async def enrich_batch(
228
- self,
229
- batch: List[CrateMetadata]) -> List[EnrichedCrate]:
230
- """Enrich a batch of crates with GitHub stats, enhanced scraping, and AI"""
231
- # Add GitHub stats first
166
+ async def enrich_batch(self, batch: "List[CrateMetadata]") -> "List[EnrichedCrate]":
167
+ """Enriches a batch of crates with GitHub stats, enhanced scraping, and AI."""
168
+ # Update GitHub stats
232
169
  github_repos = [
233
- c.repository for c in batch if "github.com" in c.repository]
234
- repo_stats = self.github_client.batch_get_repo_stats(github_repos)
235
-
236
- # Update crates with GitHub info
237
- for crate in batch:
238
- repo_url = crate.repository
239
- if repo_url in repo_stats:
240
- stats = repo_stats[repo_url]
241
- crate.github_stars = stats.get("stargazers_count", 0)
242
-
243
- # Enhanced scraping if available
244
- if self.enhanced_scraper:
245
- batch = asyncio.run(self._enhance_with_scraping(batch))
246
-
247
- # Now enrich with AI
248
- enriched_batch = []
249
- for crate in batch:
250
- try:
251
- enriched = self.enricher.enrich_crate(crate)
252
- enriched_batch.append(enriched)
253
- logging.info(f"Enriched {crate.name}")
254
- except Exception as e:
255
- logging.error(f"Failed to enrich {crate.name}: {str(e)}")
256
- # Add the crate with just the fields we have
257
- enriched_dict = crate.__dict__.copy()
258
- enriched_batch.append(EnrichedCrate(**enriched_dict))
259
-
260
- return enriched_batch
261
-
262
- async def _enhance_with_scraping(
263
- self, batch: List[CrateMetadata]) -> List[CrateMetadata]:
264
- """Enhance crates with advanced web scraping data"""
265
- enhanced_batch = []
266
-
267
- for crate in batch:
268
- try: # Scrape comprehensive documentation
269
- scraping_results = await self.enhanced_scraper.scrape_crate_info(crate.name)
270
-
271
- # Integrate scraping results into crate metadata
272
- enhanced_crate = self._integrate_scraping_results(
273
- crate, scraping_results)
274
- enhanced_batch.append(enhanced_crate)
170
+ c.repository for c in batch if c.repository and "github.com" in c.repository
171
+ ]
172
+ if github_repos:
173
+ repo_stats = self.github_client.batch_get_repo_stats(github_repos)
174
+ for crate in batch:
175
+ if crate.repository in repo_stats:
176
+ stats = repo_stats[crate.repository]
177
+ crate.github_stars = stats.get("stargazers_count", 0)
178
+
179
+ # Asynchronously enhance with scraping and AI
180
+ enrichment_tasks = [self._enrich_single_crate(crate) for crate in batch]
181
+ enriched_results = await asyncio.gather(*enrichment_tasks)
182
+ return [result for result in enriched_results if result]
183
+
184
+ async def _enrich_single_crate(self, crate: CrateMetadata) -> Union[EnrichedCrate, None]:
185
+ """Helper to enrich a single crate with scraping, AI analysis, and cargo analysis."""
186
+ try:
187
+ # Enhanced scraping if available
188
+ if self.enhanced_scraper:
189
+ await self._enhance_with_scraping(crate)
190
+
191
+ # Now enrich with AI
192
+ enriched = self.enricher.enrich_crate(crate)
193
+
194
+ # Add cargo analysis if we have a local crate directory
195
+ # Note: This would require downloading/cloning the crate first
196
+ # For now, we'll add a placeholder for cargo analysis
197
+ enriched.source_analysis = {
198
+ "cargo_analysis_available": False,
199
+ "note": "Cargo analysis requires local crate source code"
200
+ }
201
+
202
+ logging.info(f"Enriched {crate.name}")
203
+ return enriched
204
+ except Exception as e:
205
+ logging.error(f"Failed to enrich {crate.name}: {e}")
206
+ # Return a partially enriched crate to avoid data loss
207
+ enriched_dict = crate.to_dict()
208
+ return EnrichedCrate(**enriched_dict)
209
+
210
+ async def _enhance_with_scraping(self, crate: CrateMetadata) -> None:
211
+ """
212
+ Enhances a single crate with advanced web scraping data.
213
+ Modifies the crate object in place.
214
+ """
215
+ if not self.enhanced_scraper:
216
+ return
275
217
 
218
+ try:
219
+ scraping_results = await self.enhanced_scraper.scrape_crate_documentation(crate.name)
220
+ if scraping_results:
221
+ self._integrate_scraping_results(crate, scraping_results)
276
222
  logging.info(
277
- f"Enhanced scraping for {crate.name}: {len(scraping_results)} sources")
278
-
279
- except Exception as e:
280
- logging.warning(
281
- f"Enhanced scraping failed for {crate.name}: {e}")
282
- enhanced_batch.append(crate)
283
-
284
- return enhanced_batch
285
-
286
- def _integrate_scraping_results(self,
287
- crate: CrateMetadata,
288
- scraping_results: Dict[str,
289
- EnhancedScrapingResult]) -> CrateMetadata:
290
- """Integrate enhanced scraping results into crate metadata"""
291
- # Create a copy of the crate to avoid modifying the original
292
- enhanced_crate = CrateMetadata(**crate.__dict__)
293
-
294
- # Add enhanced scraping data
295
- enhanced_crate.enhanced_scraping = {}
223
+ f"Enhanced scraping for {crate.name}: "
224
+ f"{len(scraping_results)} sources"
225
+ )
226
+ except Exception as e:
227
+ logging.warning(f"Enhanced scraping failed for {crate.name}: {e}")
228
+
229
+ def _integrate_scraping_results(
230
+ self,
231
+ crate: CrateMetadata,
232
+ scraping_results: "Dict[str, Any]",
233
+ ) -> None:
234
+ """
235
+ Integrates enhanced scraping results into the crate metadata.
236
+ Modifies the crate object in place.
237
+ """
238
+ crate.enhanced_scraping = {}
296
239
 
297
240
  for source, result in scraping_results.items():
298
- if result.error:
241
+ if not result or result.error:
299
242
  continue
300
243
 
301
- enhanced_crate.enhanced_scraping[source] = {
302
- 'title': result.title,
303
- 'quality_score': result.quality_score,
304
- 'extraction_method': result.extraction_method,
305
- 'structured_data': result.structured_data,
306
- 'content_length': len(result.content)
307
- } # Update README if we got better content
308
- if source == 'docs_rs' and result.quality_score > 0.7:
309
- if not enhanced_crate.readme or len(
310
- result.content) > len(
311
- enhanced_crate.readme):
312
- enhanced_crate.readme = result.content
313
- logging.info(
314
- f"Updated README for {crate.name} from {source}")
244
+ crate.enhanced_scraping[source] = {
245
+ "title": result.title,
246
+ "quality_score": result.quality_score,
247
+ "extraction_method": result.extraction_method,
248
+ "structured_data": result.structured_data,
249
+ "content_length": len(result.content),
250
+ }
251
+ # Update README if we got better content
252
+ if source == "docs_rs" and result.quality_score > 0.7:
253
+ if not crate.readme or len(result.content) > len(crate.readme):
254
+ crate.readme = result.content
255
+ logging.info(f"Updated README for {crate.name} from {source}")
315
256
 
316
257
  # Extract additional metadata from structured data
317
- if result.structured_data:
318
- if 'features' in result.structured_data and isinstance(
319
- result.structured_data['features'], list):
320
- enhanced_crate.enhanced_features = result.structured_data['features']
321
-
322
- if 'dependencies' in result.structured_data and isinstance(
323
- result.structured_data['dependencies'], list):
324
- enhanced_crate.enhanced_dependencies = result.structured_data['dependencies']
325
-
326
- if 'examples' in result.structured_data and isinstance(
327
- result.structured_data['examples'], list):
328
- enhanced_crate.code_snippets.extend(
329
- result.structured_data['examples'])
330
-
331
- return enhanced_crate
332
-
333
- def analyze_dependencies(self, crates: List[EnrichedCrate]) -> Dict:
334
- """Analyze dependencies between crates"""
258
+ structured_data = result.structured_data or {}
259
+ if "features" in structured_data and isinstance(
260
+ structured_data["features"], list
261
+ ):
262
+ crate.enhanced_features = structured_data["features"]
263
+ if "dependencies" in structured_data and isinstance(
264
+ structured_data["dependencies"], list
265
+ ):
266
+ crate.enhanced_dependencies = structured_data["dependencies"]
267
+ if "examples" in structured_data and isinstance(
268
+ structured_data["examples"], list
269
+ ):
270
+ crate.code_snippets.extend(structured_data["examples"])
271
+
272
+ def analyze_dependencies(self, crates: "List[EnrichedCrate]") -> "Dict[str, Any]":
273
+ """Analyze dependencies between crates."""
335
274
  return DependencyAnalyzer.analyze_dependencies(crates)
336
275
 
337
- def save_checkpoint(self, data: List[EnrichedCrate], prefix: str):
338
- """Save processing checkpoint with status metadata"""
276
+ def save_checkpoint(self, data: "List[EnrichedCrate]", prefix: str) -> str:
277
+ """Saves a processing checkpoint to a file."""
339
278
  timestamp = time.strftime("%Y%m%d-%H%M%S")
340
279
  filename = os.path.join(self.output_dir, f"{prefix}_{timestamp}.jsonl")
341
280
 
342
281
  with open(filename, "w") as f:
343
282
  for item in data:
344
- # Convert to dict for serialization
345
- item_dict = item.__dict__.copy()
346
- f.write(json.dumps(item_dict) + "\n")
347
-
348
- # Save status metadata
349
- status = {
350
- "timestamp": timestamp,
351
- "total_crates": len(data),
352
- "processed_crates": sum(
353
- 1 for c in data if c.use_case is not None),
354
- "advanced_analysis": sum(
355
- 1 for c in data if c.source_analysis is not None),
356
- "checkpoint_file": filename}
357
-
358
- status_file = os.path.join(
359
- self.output_dir,
360
- f"{prefix}_status_{timestamp}.json")
361
- with open(status_file, "w") as f:
362
- json.dump(status, f, indent=2)
283
+ f.write(json.dumps(item.to_dict()) + "\n")
363
284
 
364
285
  logging.info(f"Saved checkpoint to {filename}")
365
286
  return filename
366
287
 
367
288
  def save_final_output(
368
- self,
369
- data: List[EnrichedCrate],
370
- dependency_data: Dict):
371
- """Save final enriched data and analysis"""
289
+ self, data: "List[EnrichedCrate]", dependency_data: "Dict[str, Any]"
290
+ ) -> None:
291
+ """Saves the final enriched data and analysis reports."""
372
292
  timestamp = time.strftime("%Y%m%d-%H%M%S")
373
293
 
374
294
  # Save main enriched data
375
- final_output = os.path.join(
376
- self.output_dir,
377
- f"enriched_crate_metadata_{timestamp}.jsonl")
378
- with open(final_output, "w") as f:
295
+ final_output_path = os.path.join(
296
+ self.output_dir, f"enriched_crate_metadata_{timestamp}.jsonl"
297
+ )
298
+ with open(final_output_path, "w") as f:
379
299
  for item in data:
380
- item_dict = item.__dict__.copy()
381
- f.write(json.dumps(item_dict) + "\n")
300
+ f.write(json.dumps(item.to_dict()) + "\n")
382
301
 
383
302
  # Save dependency analysis
384
- dep_file = os.path.join(
385
- self.output_dir,
386
- f"dependency_analysis_{timestamp}.json")
387
- with open(dep_file, "w") as f:
303
+ dep_file_path = os.path.join(
304
+ self.output_dir, f"dependency_analysis_{timestamp}.json"
305
+ )
306
+ with open(dep_file_path, "w") as f:
388
307
  json.dump(dependency_data, f, indent=2)
389
308
 
390
- # Generate summary report
309
+ # Generate and save summary report
310
+ self._generate_summary_report(data, dependency_data, timestamp)
311
+
312
+ logging.info(f"Results saved to {self.output_dir}/")
313
+
314
+ def _generate_summary_report(
315
+ self,
316
+ data: "List[EnrichedCrate]",
317
+ dependency_data: "Dict[str, Any]",
318
+ timestamp: str,
319
+ ) -> None:
320
+ """Generates a summary report of the pipeline run."""
391
321
  summary = {
392
322
  "total_crates": len(data),
393
323
  "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
394
- "most_popular": sorted([{
395
- "name": c.name,
396
- "score": c.score or 0,
397
- "downloads": c.downloads,
398
- "github_stars": c.github_stars
399
- } for c in data], key=lambda x: x["score"], reverse=True)[:5],
400
- "most_depended_upon": dependency_data.get("most_depended", [])[:5]
324
+ "most_popular": sorted(
325
+ [
326
+ {
327
+ "name": c.name,
328
+ "score": c.score or 0,
329
+ "downloads": c.downloads,
330
+ "github_stars": c.github_stars,
331
+ }
332
+ for c in data
333
+ ],
334
+ key=lambda x: x.get("score", 0),
335
+ reverse=True,
336
+ )[:10],
337
+ "most_depended_upon": dependency_data.get("most_depended", [])[:10],
401
338
  }
402
339
 
403
- summary_file = os.path.join(
404
- self.output_dir,
405
- f"summary_report_{timestamp}.json")
406
- with open(summary_file, "w") as f:
340
+ summary_path = os.path.join(self.output_dir, f"summary_report_{timestamp}.json")
341
+ with open(summary_path, "w") as f:
407
342
  json.dump(summary, f, indent=2)
408
343
 
409
- logging.info(f"Results saved to {self.output_dir}/")
410
-
411
- async def run(self):
412
- """Main pipeline execution flow (async)"""
344
+ async def run(self) -> Union["tuple[List[EnrichedCrate], Dict[str, Any]]", None]:
345
+ """Main pipeline execution flow."""
413
346
  start_time = time.time()
347
+ if not self.crates:
348
+ logging.error("No crates to process. Exiting.")
349
+ return None
350
+
414
351
  logging.info(f"Processing {len(self.crates)} crates...")
415
352
 
416
- # Process in batches
417
- all_enriched = []
418
- crate_batches = [self.crates[i:i + self.config.batch_size]
419
- for i in range(0, len(self.crates), self.config.batch_size)]
353
+ all_enriched: "List[EnrichedCrate]" = []
354
+ batch_size = self.config.batch_size
355
+ crate_batches = [
356
+ self.crates[i : i + batch_size]
357
+ for i in range(0, len(self.crates), batch_size)
358
+ ]
420
359
 
421
- for batch_num, batch in enumerate(crate_batches):
360
+ for i, batch_names in enumerate(crate_batches):
422
361
  logging.info(
423
- f"Processing batch {batch_num + 1}/{len(crate_batches)} ({len(batch)} crates)")
424
-
425
- # Fetch metadata (async)
426
- batch_data = await self.fetch_metadata_batch(batch)
362
+ f"Processing batch {i + 1}/{len(crate_batches)} "
363
+ f"({len(batch_names)} crates)"
364
+ )
365
+
366
+ # Fetch metadata
367
+ metadata_batch = await self.fetch_metadata_batch(batch_names)
368
+ if not metadata_batch:
369
+ logging.warning(f"Batch {i+1} yielded no metadata. Skipping.")
370
+ continue
427
371
 
428
- # Enrich the batch (async)
429
- enriched_batch = await self.enrich_batch(batch_data)
372
+ # Enrich the batch
373
+ enriched_batch = await self.enrich_batch(metadata_batch)
430
374
  all_enriched.extend(enriched_batch)
431
375
 
432
- # Save checkpoint after each batch
433
- self.save_checkpoint(all_enriched, "batch_checkpoint")
376
+ # Save checkpoint
377
+ self.save_checkpoint(all_enriched, f"checkpoint_batch_{i + 1}")
434
378
  logging.info(
435
- f"Completed batch {batch_num + 1}, processed {len(all_enriched)}/{len(self.crates)} crates so far")
436
-
437
- # Optional: Add source analysis for some crates
438
- if batch_num < 2: # Only do detailed analysis for first 2 batches
439
- for crate in enriched_batch:
440
- try:
441
- crate.source_analysis = SourceAnalyzer.analyze_crate_source(
442
- crate)
443
- crate.security = SecurityAnalyzer.check_security_metrics(
444
- crate)
445
- crate.user_behavior = UserBehaviorAnalyzer.fetch_user_behavior_data(
446
- crate)
447
- logging.info(
448
- f"Advanced analysis completed for {crate.name}")
449
- except Exception as e:
450
- logging.warning(
451
- f"Advanced analysis failed for {crate.name}: {str(e)}")
452
-
453
- # Step 3: Perform dependency analysis
379
+ f"Completed batch {i + 1}, "
380
+ f"processed {len(all_enriched)}/{len(self.crates)} crates"
381
+ )
382
+
383
+ # Final analysis and saving
454
384
  logging.info("Analyzing crate dependencies...")
455
385
  dependency_analysis = self.analyze_dependencies(all_enriched)
456
-
457
- # Save final results
458
386
  self.save_final_output(all_enriched, dependency_analysis)
459
387
 
460
- # Final summary
461
388
  duration = time.time() - start_time
462
- logging.info(
463
- f"✅ Done. Enriched {len(all_enriched)} crates in {duration:.2f}s")
464
-
389
+ logging.info(f"[OK] Done. Enriched {len(all_enriched)} crates in {duration:.2f}s")
465
390
  return all_enriched, dependency_analysis
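
Taken together, the 1.4.1 pipeline is driven through the async run() method, which now returns the enriched crates and the dependency analysis (or None when the crate list is empty). A hedged driver sketch, assuming PipelineConfig accepts the fields referenced in this diff (batch_size, output_path, enable_crawl4ai, use_azure_openai) as keyword arguments; the values shown are placeholders, not documented defaults:

    # Hypothetical end-to-end driver for the refactored pipeline shown above.
    import asyncio

    from rust_crate_pipeline.config import PipelineConfig
    from rust_crate_pipeline.pipeline import CrateDataPipeline


    async def main() -> None:
        # Keyword arguments are assumptions based on attributes used in pipeline.py.
        config = PipelineConfig(
            batch_size=10,
            output_path="./output",
            enable_crawl4ai=True,
            use_azure_openai=False,
        )
        pipeline = CrateDataPipeline(config)
        result = await pipeline.run()  # (enriched_crates, dependency_analysis) or None
        if result:
            enriched, dependency_analysis = result
            print(f"Enriched {len(enriched)} crates")


    if __name__ == "__main__":
        asyncio.run(main())
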