rust_crate_pipeline-1.4.0-py3-none-any.whl → rust_crate_pipeline-1.4.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. rust_crate_pipeline/__init__.py +18 -27
  2. rust_crate_pipeline/__main__.py +1 -0
  3. rust_crate_pipeline/ai_processing.py +718 -596
  4. rust_crate_pipeline/analysis.py +330 -363
  5. rust_crate_pipeline/azure_ai_processing.py +462 -0
  6. rust_crate_pipeline/config.py +46 -28
  7. rust_crate_pipeline/core/__init__.py +19 -0
  8. rust_crate_pipeline/core/canon_registry.py +133 -0
  9. rust_crate_pipeline/core/irl_engine.py +256 -0
  10. rust_crate_pipeline/core/sacred_chain.py +117 -0
  11. rust_crate_pipeline/crate_analysis.py +54 -0
  12. rust_crate_pipeline/crate_list.txt +424 -0
  13. rust_crate_pipeline/github_token_checker.py +108 -112
  14. rust_crate_pipeline/main.py +329 -109
  15. rust_crate_pipeline/network.py +317 -308
  16. rust_crate_pipeline/pipeline.py +317 -375
  17. rust_crate_pipeline/production_config.py +24 -27
  18. rust_crate_pipeline/progress_monitor.py +334 -0
  19. rust_crate_pipeline/scraping/__init__.py +13 -0
  20. rust_crate_pipeline/scraping/unified_scraper.py +259 -0
  21. rust_crate_pipeline/unified_llm_processor.py +637 -0
  22. rust_crate_pipeline/unified_pipeline.py +548 -0
  23. rust_crate_pipeline/utils/file_utils.py +32 -5
  24. rust_crate_pipeline/utils/logging_utils.py +21 -16
  25. rust_crate_pipeline/version.py +79 -47
  26. rust_crate_pipeline-1.4.2.dist-info/METADATA +515 -0
  27. rust_crate_pipeline-1.4.2.dist-info/RECORD +31 -0
  28. rust_crate_pipeline-1.4.0.dist-info/METADATA +0 -585
  29. rust_crate_pipeline-1.4.0.dist-info/RECORD +0 -19
  30. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.2.dist-info}/WHEEL +0 -0
  31. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.2.dist-info}/entry_points.txt +0 -0
  32. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.2.dist-info}/licenses/LICENSE +0 -0
  33. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.2.dist-info}/top_level.txt +0 -0
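
The bulk of the change is in rust_crate_pipeline/pipeline.py (diffed below): the pipeline now chooses between an Azure OpenAI enricher and the local LLM enricher, loads its default crate list from the bundled crate_list.txt, initializes the new UnifiedScraper, and returns its results from an async run() method. A minimal usage sketch against the new constructor, assuming PipelineConfig accepts the use_azure_openai, enable_crawl4ai, output_path, and batch_size fields referenced in the diff (not verified against the released wheel):

    import asyncio

    from rust_crate_pipeline.config import PipelineConfig
    from rust_crate_pipeline.pipeline import CrateDataPipeline

    # Field names mirror config attributes read by pipeline.py; treat them as assumptions.
    config = PipelineConfig(
        use_azure_openai=False,  # use the local LLM enricher
        enable_crawl4ai=True,    # opt in to UnifiedScraper-based scraping
        output_path="./output",
        batch_size=10,
    )

    # Pass an explicit crate list, or omit it to load the packaged crate_list.txt.
    pipeline = CrateDataPipeline(config, crate_list=["serde", "tokio"])
    result = asyncio.run(pipeline.run())
    if result:
        enriched_crates, dependency_analysis = result

Note that run() returns None when the crate list is empty, so the tuple unpack is guarded.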
rust_crate_pipeline/pipeline.py
@@ -4,462 +4,404 @@ import time
 import logging
 import json
 import asyncio
-from typing import List, Dict, Optional
+from typing import Any, Union, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from typing import Dict, List, Optional
+
 from .config import PipelineConfig, CrateMetadata, EnrichedCrate
 from .network import CrateAPIClient, GitHubBatchClient
 from .ai_processing import LLMEnricher
-from .analysis import SourceAnalyzer, SecurityAnalyzer, UserBehaviorAnalyzer, DependencyAnalyzer
+from .analysis import DependencyAnalyzer
+from .crate_analysis import CrateAnalyzer
+
+# Import Azure OpenAI enricher
+try:
+    from .azure_ai_processing import AzureOpenAIEnricher
+    AZURE_OPENAI_AVAILABLE = True
+except ImportError:
+    AZURE_OPENAI_AVAILABLE = False
+    AzureOpenAIEnricher = None
 
 # Import enhanced scraping capabilities
 try:
-    import sys
-    sys.path.append(os.path.dirname(os.path.dirname(__file__)))
-    from enhanced_scraping import CrateDocumentationScraper, EnhancedScrapingResult
-    enhanced_scraping_available = True
+    from .scraping.unified_scraper import UnifiedScraper, ScrapingResult
+    ENHANCED_SCRAPING_AVAILABLE = True
 except ImportError:
-    enhanced_scraping_available = False
-    CrateDocumentationScraper = None
-    EnhancedScrapingResult = None
+    ENHANCED_SCRAPING_AVAILABLE = False
+    UnifiedScraper = None  # type: ignore[assignment,misc]
+    ScrapingResult = None  # type: ignore[assignment,misc]
     logging.warning("Enhanced scraping not available - using basic methods")
 
 
+class CustomJSONEncoder(json.JSONEncoder):
+    """Custom JSON encoder to handle non-serializable objects"""
+    def default(self, obj):
+        if hasattr(obj, 'to_dict'):
+            return obj.to_dict()
+        elif hasattr(obj, '__dict__'):
+            return obj.__dict__
+        else:
+            return str(obj)
+
+
 class CrateDataPipeline:
-    def __init__(self, config: PipelineConfig):
+    """Orchestrates the entire data collection, enrichment, and analysis pipeline."""
+
+    def __init__(self, config: PipelineConfig, crate_list: "List[str] | None" = None, **kwargs) -> None:
         self.config = config
         self.api_client = CrateAPIClient(config)
         self.github_client = GitHubBatchClient(config)
-        self.enricher = LLMEnricher(config)
-        self.crates = self.get_crate_list()
-        self.output_dir = self._create_output_dir()  # Initialize enhanced scraping if available
-        self.enhanced_scraper = None
-        if enhanced_scraping_available and CrateDocumentationScraper is not None and hasattr(config, 'enable_crawl4ai'):
+
+        # Initialize the appropriate AI enricher based on configuration
+        if config.use_azure_openai and AZURE_OPENAI_AVAILABLE and AzureOpenAIEnricher is not None:
             try:
-                self.enhanced_scraper = CrateDocumentationScraper(
-                    enable_crawl4ai=config.enable_crawl4ai)
-                logging.info("✅ Enhanced scraping with Crawl4AI enabled")
+                self.enricher = AzureOpenAIEnricher(config)
+                logging.info("[OK] Using Azure OpenAI enricher")
             except Exception as e:
-                logging.warning(
-                    f" Failed to initialize enhanced scraping: {e}")
-        elif enhanced_scraping_available and CrateDocumentationScraper is not None:
-            try:
-                self.enhanced_scraper = CrateDocumentationScraper(
-                    enable_crawl4ai=True)
-                logging.info(
-                    " Enhanced scraping with Crawl4AI enabled (default)")
-            except Exception as e:
-                logging.warning(
-                    f"❌ Failed to initialize enhanced scraping: {e}")
+                logging.warning(f"[WARN] Failed to initialize Azure OpenAI enricher: {e}")
+                logging.info("[INFO] Falling back to local LLM enricher")
+                self.enricher = LLMEnricher(config)
+        else:
+            if config.use_azure_openai and not AZURE_OPENAI_AVAILABLE:
+                logging.warning("[WARN] Azure OpenAI requested but not available")
+            self.enricher = LLMEnricher(config)
+            logging.info("[OK] Using local LLM enricher")
+
+        # Initialize cargo analyzer
+        self.cargo_analyzer = CrateAnalyzer(".")
+
+        # Use provided crate_list or load from file
+        if crate_list:
+            self.crates = crate_list
+            logging.info(f"Using provided crate list: {len(crate_list)} crates")
+        else:
+            self.crates = self._get_crate_list()
+
+        self.output_dir = self._create_output_dir()
+        self.enhanced_scraper: Any = (
+            self._initialize_enhanced_scraper()
+        )
+
+    def _initialize_enhanced_scraper(self) -> Any:
+        """Initializes the CrateDocumentationScraper if available and enabled."""
+        if (
+            not ENHANCED_SCRAPING_AVAILABLE
+            or not self.config.enable_crawl4ai
+            or UnifiedScraper is None
+        ):
+            return None
+        try:
+            scraper = UnifiedScraper()
+            logging.info("[OK] Enhanced scraping with Crawl4AI enabled")
+            return scraper
+        except Exception as e:
+            logging.warning(f"[ERROR] Failed to initialize enhanced scraping: {e}")
+            return None
 
     def _create_output_dir(self) -> str:
+        """Creates a timestamped output directory for pipeline results."""
         timestamp = time.strftime("%Y%m%d-%H%M%S")
-        output_dir = f"crate_data_{timestamp}"
+        output_dir = os.path.join(self.config.output_path, f"crate_data_{timestamp}")
         os.makedirs(output_dir, exist_ok=True)
         return output_dir
 
-    def get_crate_list(self, limit: Optional[int] = None) -> List[str]:
-        """Return a comprehensive list of all high-value crates to process"""
-        crates = [
-            # Web frameworks and servers
-            "actix-web", "rocket", "axum", "warp", "tower", "tide", "gotham", "iron",
-            "nickel", "rouille", "thruster", "poem", "salvo", "viz", "ntex", "may-minihttp",
-            "tiny_http", "httptest", "mockito", "wiremock",
-
-            # Async runtimes and utilities
-            "tokio", "tokio-stream", "async-trait", "futures", "async-std", "smol",
-            "embassy", "embassy-executor", "embassy-time", "embassy-sync", "async-channel",
-            "async-broadcast", "async-lock", "async-once", "async-recursion", "futures-util",
-            "futures-channel", "futures-timer", "futures-test", "pin-project", "pin-project-lite",
-
-            # Serialization/deserialization
-            "serde", "serde_json", "serde_yaml", "bincode", "toml", "ron", "postcard",
-            "ciborium", "rmp-serde", "quick-xml", "roxmltree", "serde_cbor", "serde_derive",
-            "serde_repr", "serde_with", "serde_bytes", "flexbuffers", "bson", "avro-rs",
-
-            # Error handling and debugging
-            "anyhow", "thiserror", "eyre", "color-eyre", "miette", "fehler", "snafu",
-            "failure", "quick-error", "derive_more", "displaydoc", "backtrace", "better-panic",
-            # Command line and terminal
-            "clap", "structopt", "argh", "gumdrop", "docopt", "getopts", "pico-args",
-            "crossterm", "termion", "console", "indicati", "dialoguer", "termcolor",
-            "colored", "yansi", "owo-colors", "nu-ansi-term", "terminal_size",
-            # Utilities and general purpose
-            "rand", "uuid", "itertools", "num", "cfg-i", "bytes", "mime",
-            "form_urlencoded", "csv", "once_cell", "base64", "flate2", "tar", "dirs",
-            "walkdir", "glob", "bitflags", "indexmap", "smallvec", "arrayvec", "tinyvec",
-            "ahash", "fxhash", "rustc-hash", "seahash", "siphasher", "wyhash", "xxhash-rust",
-            "getrandom", "fastrand", "nanorand", "url", "percent-encoding", "unicode-segmentation",
-            "unicode-normalization", "unicode-width", "memchr", "aho-corasick", "bstr",
-            # HTTP clients and servers
-            "reqwest", "hyper", "sur", "ureq", "attohttpc", "isahc", "curl", "libcurl-sys",
-            "http", "http-body", "httparse", "hyper-tls", "hyper-rustls", "native-tls",
-            "webpki", "webpki-roots",
-
-            # Database and storage
-            "sqlx", "diesel", "postgres", "rusqlite", "mysql", "mongodb", "redis",
-            "tokio-postgres", "deadpool-postgres", "bb8", "r2d2", "sea-orm", "rbatis",
-            "sled", "rocksdb", "lmdb", "redb", "pickledb", "persy", "heed", "fjall",
-            # Concurrency and parallelism
-            "rayon", "crossbeam", "crossbeam-channel", "crossbeam-utils", "crossbeam-epoch",
-            "crossbeam-deque", "parking_lot", "spin", "atomic", "arc-swap", "dashmap",
-            "flume", "kanal", "tokio-util", "futures-concurrency",
-            # Protocol buffers, gRPC, and messaging
-            "prost", "tonic", "protobu", "grpcio", "tarpc", "capnp", "rmp",
-            "zmq", "nanomsg", "nats", "rdkafka", "pulsar", "lapin", "amqp", "rumqttc",
-            # Procedural macros and metaprogramming
-            "syn", "quote", "proc-macro2", "proc-macro-crate", "proc-macro-error",
-            "darling", "derive_builder", "strum", "strum_macros",
-            "enum-iterator", "num-derive", "num-traits", "paste", "lazy_static",
-
-            # Cryptography and security
-            "ring", "rustls", "openssl", "sha2", "sha3", "blake2", "blake3", "md5",
-            "hmac", "pbkdf2", "scrypt", "argon2", "bcrypt", "chacha20poly1305",
-            "aes-gcm", "rsa", "ed25519-dalek", "x25519-dalek", "curve25519-dalek",
-            "secp256k1", "k256", "p256", "ecdsa", "signature", "rand_core",
-
-            # Game development and graphics
-            "bevy", "macroquad", "ggez", "piston", "winit", "wgpu", "vulkano", "glium",
-            "three-d", "kiss3d", "nalgebra", "cgmath", "glam", "ultraviolet", "mint",
-            "image", "imageproc", "resvg", "tiny-skia", "lyon", "femtovg", "skulpin",
-            # Networking and protocols
-            "socket2", "mio", "polling", "async-io", "calloop", "quinn",
-            "rustls-pemfile", "trust-dns", "hickory-dns", "async-h1", "h2", "h3",
-            "websocket", "tokio-tungstenite", "tungstenite", "ws", "warp-ws",
-
-            # Text processing and parsing
-            "regex", "regex-syntax", "pest", "pest_derive", "nom", "combine", "winnow",
-            "lalrpop", "chumsky", "logos", "lex", "yacc", "tree-sitter", "syntect",
-            "pulldown-cmark", "comrak", "markdown", "ammonia", "scraper", "kuchiki",
-
-            # System programming and OS interfaces
-            "libc", "winapi", "windows", "nix", "users", "sysinfo", "procfs", "psutil",
-            "notify", "inotify", "hotwatch", "signal-hook", "ctrlc", "daemonize",
-            "fork", "shared_memory", "memmap2", "mlock", "caps", "uzers",
-            # Testing and development tools
-            "criterion", "proptest", "quickcheck", "rstest", "serial_test", "mockall",
-            "httpmock", "assert_cmd", "assert_fs", "predicates", "tempfile",
-            "insta", "goldenfile", "similar", "difference", "pretty_assertions",
-
-            # Configuration and environment
-            "config", "figment", "envy", "dotenv", "confy", "directories", "app_dirs",
-            "etcetera", "platform-dirs", "home", "which", "dunce", "normpath",
-
-            # Logging and observability
-            "log", "env_logger", "tracing", "tracing-subscriber", "tracing-futures",
-            "tracing-actix-web", "tracing-log", "slog", "fern", "flexi_logger",
-            "log4rs", "simplelog", "stderrlog", "pretty_env_logger", "fast_log",
-
-            # Time and date
-            "chrono", "time", "humantime", "chrono-tz", "chrono-english", "ical",
-            "cron", "tokio-cron-scheduler", "job_scheduler", "delay_timer",
-
-            # Machine Learning & AI
-            "tokenizers", "safetensors", "linfa", "ndarray", "smartcore", "burn",
-            "tract-core", "tract-onnx", "tract-hir", "tract-linalg", "tract-data",
-            "tract-nne", "tract-onnx-opl", "tract-pulse", "tract-pulse-opl",
-            "tract-nnef-resources", "tch", "torch-sys", "ort", "ort-sys", "candle-core",
-            "candle-nn", "candle-transformers", "candle-kernels", "candle-onnx",
-            "candle-metal-kernels", "tiktoken-rs", "tensorflow", "tensorflow-sys",
-            "onnxruntime", "onnxruntime-sys", "onnx-protobu", "llama-cpp-2",
-            "llama-cpp-sys-2", "llm", "llm-samplers", "llm-chain", "llm-chain-openai", "llama-core", "llamaedge", "openai", "openai-api-rs", "openai_dive",
-            "genai", "aleph-alpha-client", "llm_api_access", "ollama-rs",
-            "rust-bert", "fastembed", "hf-hub", "whisper-rs-sys", "toktrie",
-            "toktrie_hf_tokenizers", "toktrie_hf_downloader", "rust_tokenizers",
-        ]
-
-        if limit is not None:
-            return crates[:limit]
-        return crates
-
-    async def fetch_metadata_batch(
-            self,
-            crate_names: List[str]) -> List[CrateMetadata]:
-        """Fetch metadata for a batch of crates using asyncio-based parallel processing
+    def _get_crate_list(self) -> "List[str]":
+        """
+        Loads the list of crates to process from an external file.
+        This approach is more modular and easier to maintain than a hardcoded list.
+        """
+        crate_list_path = os.path.join(os.path.dirname(__file__), "crate_list.txt")
+        try:
+            with open(crate_list_path) as f:
+                crates = [line.strip() for line in f if line.strip()]
+            logging.info(f"Loaded {len(crates)} crates from {crate_list_path}")
+            if not crates:
+                logging.warning(f"Crate list at {crate_list_path} is empty.")
+            return crates
+        except FileNotFoundError:
+            logging.error(f"Crate list file not found at: {crate_list_path}")
+            return []
+
+    def get_crate_list(self) -> "List[str]":
+        """
+        Public method to get the list of crates.
+        Returns the already loaded crate list or loads it if not available.
+        """
+        if hasattr(self, "crates") and self.crates:
+            return self.crates
+        else:
+            return self._get_crate_list()
 
-        Each coroutine processes completely independent crate data, ensuring safety.
-        No shared state is modified - each coroutine only reads from self.api_client and
-        returns independent results.
+    async def fetch_metadata_batch(self, crate_names: "List[str]") -> "List[CrateMetadata]":
+        """
+        Fetches metadata for a batch of crates using asyncio-based parallel processing.
         """
-        results = []
 
-        async def fetch_single_crate_safe(crate_name: str) -> Optional[CrateMetadata]:
+        async def fetch_single_crate_safe(
+            crate_name: str,
+        ) -> Union[CrateMetadata, None]:
             try:
-                # If api_client has an async method, use it; otherwise, run in executor
-                if hasattr(self.api_client, 'fetch_crate_metadata_async'):
-                    data = await self.api_client.fetch_crate_metadata_async(crate_name)
-                else:
-                    loop = asyncio.get_running_loop()
-                    data = await loop.run_in_executor(None, self.api_client.fetch_crate_metadata, crate_name)
-                if data:
-                    return CrateMetadata(
-                        name=data.get("name", ""),
-                        version=data.get("version", ""),
-                        description=data.get("description", ""),
-                        repository=data.get("repository", ""),
-                        keywords=data.get("keywords", []),
-                        categories=data.get("categories", []),
-                        readme=data.get("readme", ""),
-                        downloads=data.get("downloads", 0),
-                        github_stars=data.get("github_stars", 0),
-                        dependencies=data.get("dependencies", []),
-                        features=data.get("features", []),
-                        code_snippets=data.get("code_snippets", []),
-                        readme_sections=data.get("readme_sections", {}),
-                        librs_downloads=data.get("librs_downloads"),
-                        source=data.get("source", "crates.io")
-                    )
-                return None
+                loop = asyncio.get_running_loop()
+                data = await loop.run_in_executor(
+                    None, self.api_client.fetch_crate_metadata, crate_name
+                )
+                if not data:
+                    return None
+
+                return CrateMetadata(
+                    name=data.get("name", ""),
+                    version=data.get("version", ""),
+                    description=data.get("description", ""),
+                    repository=data.get("repository", ""),
+                    keywords=data.get("keywords", []),
+                    categories=data.get("categories", []),
+                    readme=data.get("readme", ""),
+                    downloads=data.get("downloads", 0),
+                    github_stars=data.get("github_stars", 0),
+                    dependencies=data.get("dependencies", []),
+                    features=data.get("features", {}),
+                    code_snippets=data.get("code_snippets", []),
+                    readme_sections=data.get("readme_sections", {}),
+                    librs_downloads=data.get("librs_downloads"),
+                    source=data.get("source", "crates.io"),
+                )
+
             except Exception as e:
-                logging.error(f"Error fetching {crate_name}: {e}")
+                logging.error(f"Error fetching metadata for {crate_name}: {e}")
                 return None
 
-        # Use asyncio.gather for parallel async processing
         tasks = [fetch_single_crate_safe(name) for name in crate_names]
         results_raw = await asyncio.gather(*tasks)
-        results = [r for r in results_raw if r is not None]
-        for crate in results:
-            logging.info(f"Fetched metadata for {crate.name}")
+        results = [r for r in results_raw if r]
+        logging.info(
+            f"Fetched metadata for {len(results)} out of "
+            f"{len(crate_names)} requested crates."
+        )
         return results
 
-    # Remove the async methods that are no longer needed
-    # async def _fetch_single_crate_async(self, crate_name: str) ->
-    # Optional[Dict]:
-
-    async def enrich_batch(
-            self,
-            batch: List[CrateMetadata]) -> List[EnrichedCrate]:
-        """Enrich a batch of crates with GitHub stats, enhanced scraping, and AI"""
-        # Add GitHub stats first
+    async def enrich_batch(self, batch: "List[CrateMetadata]") -> "List[EnrichedCrate]":
+        """Enriches a batch of crates with GitHub stats, enhanced scraping, and AI."""
+        # Update GitHub stats
         github_repos = [
-            c.repository for c in batch if "github.com" in c.repository]
-        repo_stats = self.github_client.batch_get_repo_stats(github_repos)
-
-        # Update crates with GitHub info
-        for crate in batch:
-            repo_url = crate.repository
-            if repo_url in repo_stats:
-                stats = repo_stats[repo_url]
-                crate.github_stars = stats.get("stargazers_count", 0)
-
-        # Enhanced scraping if available
-        if self.enhanced_scraper:
-            batch = asyncio.run(self._enhance_with_scraping(batch))
-
-        # Now enrich with AI
-        enriched_batch = []
-        for crate in batch:
-            try:
-                enriched = self.enricher.enrich_crate(crate)
-                enriched_batch.append(enriched)
-                logging.info(f"Enriched {crate.name}")
-            except Exception as e:
-                logging.error(f"Failed to enrich {crate.name}: {str(e)}")
-                # Add the crate with just the fields we have
-                enriched_dict = crate.__dict__.copy()
-                enriched_batch.append(EnrichedCrate(**enriched_dict))
-
-        return enriched_batch
-
-    async def _enhance_with_scraping(
-            self, batch: List[CrateMetadata]) -> List[CrateMetadata]:
-        """Enhance crates with advanced web scraping data"""
-        enhanced_batch = []
-
-        for crate in batch:
-            try:  # Scrape comprehensive documentation
-                scraping_results = await self.enhanced_scraper.scrape_crate_info(crate.name)
-
-                # Integrate scraping results into crate metadata
-                enhanced_crate = self._integrate_scraping_results(
-                    crate, scraping_results)
-                enhanced_batch.append(enhanced_crate)
+            c.repository for c in batch if c.repository and "github.com" in c.repository
+        ]
+        if github_repos:
+            repo_stats = self.github_client.batch_get_repo_stats(github_repos)
+            for crate in batch:
+                if crate.repository in repo_stats:
+                    stats = repo_stats[crate.repository]
+                    crate.github_stars = stats.get("stargazers_count", 0)
+
+        # Asynchronously enhance with scraping and AI
+        enrichment_tasks = [self._enrich_single_crate(crate) for crate in batch]
+        enriched_results = await asyncio.gather(*enrichment_tasks)
+        return [result for result in enriched_results if result]
+
+    async def _enrich_single_crate(self, crate: CrateMetadata) -> Union[EnrichedCrate, None]:
+        """Helper to enrich a single crate with scraping, AI analysis, and cargo analysis."""
+        try:
+            # Enhanced scraping if available
+            if self.enhanced_scraper:
+                await self._enhance_with_scraping(crate)
+
+            # Now enrich with AI
+            enriched = self.enricher.enrich_crate(crate)
+
+            # Add cargo analysis if we have a local crate directory
+            # Note: This would require downloading/cloning the crate first
+            # For now, we'll add a placeholder for cargo analysis
+            enriched.source_analysis = {
+                "cargo_analysis_available": False,
+                "note": "Cargo analysis requires local crate source code"
+            }
+
+            logging.info(f"Enriched {crate.name}")
+            return enriched
+        except Exception as e:
+            logging.error(f"Failed to enrich {crate.name}: {e}")
+            # Return a partially enriched crate to avoid data loss
+            enriched_dict = crate.to_dict()
+            return EnrichedCrate(**enriched_dict)
+
+    async def _enhance_with_scraping(self, crate: CrateMetadata) -> None:
+        """
+        Enhances a single crate with advanced web scraping data.
+        Modifies the crate object in place.
+        """
+        if not self.enhanced_scraper:
+            return
 
+        try:
+            scraping_results = await self.enhanced_scraper.scrape_crate_documentation(crate.name)
+            if scraping_results:
+                self._integrate_scraping_results(crate, scraping_results)
                 logging.info(
-                    f"Enhanced scraping for {crate.name}: {len(scraping_results)} sources")
-
-            except Exception as e:
-                logging.warning(
-                    f"Enhanced scraping failed for {crate.name}: {e}")
-                enhanced_batch.append(crate)
-
-        return enhanced_batch
-
-    def _integrate_scraping_results(self,
-                                    crate: CrateMetadata,
-                                    scraping_results: Dict[str,
-                                                           EnhancedScrapingResult]) -> CrateMetadata:
-        """Integrate enhanced scraping results into crate metadata"""
-        # Create a copy of the crate to avoid modifying the original
-        enhanced_crate = CrateMetadata(**crate.__dict__)
-
-        # Add enhanced scraping data
-        enhanced_crate.enhanced_scraping = {}
+                    f"Enhanced scraping for {crate.name}: "
+                    f"{len(scraping_results)} sources"
+                )
+        except Exception as e:
+            logging.warning(f"Enhanced scraping failed for {crate.name}: {e}")
+
+    def _integrate_scraping_results(
+        self,
+        crate: CrateMetadata,
+        scraping_results: "Dict[str, Any]",
+    ) -> None:
+        """
+        Integrates enhanced scraping results into the crate metadata.
+        Modifies the crate object in place.
+        """
+        crate.enhanced_scraping = {}
 
         for source, result in scraping_results.items():
-            if result.error:
+            if not result or result.error:
                 continue
 
-            enhanced_crate.enhanced_scraping[source] = {
-                'title': result.title,
-                'quality_score': result.quality_score,
-                'extraction_method': result.extraction_method,
-                'structured_data': result.structured_data,
-                'content_length': len(result.content)
-            }  # Update README if we got better content
-            if source == 'docs_rs' and result.quality_score > 0.7:
-                if not enhanced_crate.readme or len(
-                        result.content) > len(
-                        enhanced_crate.readme):
-                    enhanced_crate.readme = result.content
-                    logging.info(
-                        f"Updated README for {crate.name} from {source}")
+            crate.enhanced_scraping[source] = {
+                "title": result.title,
+                "quality_score": result.quality_score,
+                "extraction_method": result.extraction_method,
+                "structured_data": result.structured_data,
+                "content_length": len(result.content),
+            }
+            # Update README if we got better content
+            if source == "docs_rs" and result.quality_score > 0.7:
+                if not crate.readme or len(result.content) > len(crate.readme):
+                    crate.readme = result.content
+                    logging.info(f"Updated README for {crate.name} from {source}")
 
             # Extract additional metadata from structured data
-            if result.structured_data:
-                if 'features' in result.structured_data and isinstance(
-                        result.structured_data['features'], list):
-                    enhanced_crate.enhanced_features = result.structured_data['features']
-
-                if 'dependencies' in result.structured_data and isinstance(
-                        result.structured_data['dependencies'], list):
-                    enhanced_crate.enhanced_dependencies = result.structured_data['dependencies']
-
-                if 'examples' in result.structured_data and isinstance(
-                        result.structured_data['examples'], list):
-                    enhanced_crate.code_snippets.extend(
-                        result.structured_data['examples'])
-
-        return enhanced_crate
-
-    def analyze_dependencies(self, crates: List[EnrichedCrate]) -> Dict:
-        """Analyze dependencies between crates"""
+            structured_data = result.structured_data or {}
+            if "features" in structured_data and isinstance(
+                structured_data["features"], list
+            ):
+                crate.enhanced_features = structured_data["features"]
+            if "dependencies" in structured_data and isinstance(
+                structured_data["dependencies"], list
+            ):
+                crate.enhanced_dependencies = structured_data["dependencies"]
+            if "examples" in structured_data and isinstance(
+                structured_data["examples"], list
+            ):
+                crate.code_snippets.extend(structured_data["examples"])
+
+    def analyze_dependencies(self, crates: "List[EnrichedCrate]") -> "Dict[str, Any]":
+        """Analyze dependencies between crates."""
         return DependencyAnalyzer.analyze_dependencies(crates)
 
-    def save_checkpoint(self, data: List[EnrichedCrate], prefix: str):
-        """Save processing checkpoint with status metadata"""
+    def save_checkpoint(self, data: "List[EnrichedCrate]", prefix: str) -> str:
+        """Saves a processing checkpoint to a file."""
         timestamp = time.strftime("%Y%m%d-%H%M%S")
         filename = os.path.join(self.output_dir, f"{prefix}_{timestamp}.jsonl")
 
         with open(filename, "w") as f:
             for item in data:
-                # Convert to dict for serialization
-                item_dict = item.__dict__.copy()
-                f.write(json.dumps(item_dict) + "\n")
-
-        # Save status metadata
-        status = {
-            "timestamp": timestamp,
-            "total_crates": len(data),
-            "processed_crates": sum(
-                1 for c in data if c.use_case is not None),
-            "advanced_analysis": sum(
-                1 for c in data if c.source_analysis is not None),
-            "checkpoint_file": filename}
-
-        status_file = os.path.join(
-            self.output_dir,
-            f"{prefix}_status_{timestamp}.json")
-        with open(status_file, "w") as f:
-            json.dump(status, f, indent=2)
+                f.write(json.dumps(item.to_dict(), cls=CustomJSONEncoder) + "\n")
 
         logging.info(f"Saved checkpoint to {filename}")
         return filename
 
     def save_final_output(
-            self,
-            data: List[EnrichedCrate],
-            dependency_data: Dict):
-        """Save final enriched data and analysis"""
+        self, data: "List[EnrichedCrate]", dependency_data: "Dict[str, Any]"
+    ) -> None:
+        """Saves the final enriched data and analysis reports."""
         timestamp = time.strftime("%Y%m%d-%H%M%S")
 
         # Save main enriched data
-        final_output = os.path.join(
-            self.output_dir,
-            f"enriched_crate_metadata_{timestamp}.jsonl")
-        with open(final_output, "w") as f:
+        final_output_path = os.path.join(
+            self.output_dir, f"enriched_crate_metadata_{timestamp}.jsonl"
+        )
+        with open(final_output_path, "w") as f:
             for item in data:
-                item_dict = item.__dict__.copy()
-                f.write(json.dumps(item_dict) + "\n")
+                f.write(json.dumps(item.to_dict(), cls=CustomJSONEncoder) + "\n")
 
         # Save dependency analysis
-        dep_file = os.path.join(
-            self.output_dir,
-            f"dependency_analysis_{timestamp}.json")
-        with open(dep_file, "w") as f:
+        dep_file_path = os.path.join(
+            self.output_dir, f"dependency_analysis_{timestamp}.json"
+        )
+        with open(dep_file_path, "w") as f:
             json.dump(dependency_data, f, indent=2)
 
-        # Generate summary report
+        # Generate and save summary report
+        self._generate_summary_report(data, dependency_data, timestamp)
+
+        logging.info(f"Results saved to {self.output_dir}/")
+
+    def _generate_summary_report(
+        self,
+        data: "List[EnrichedCrate]",
+        dependency_data: "Dict[str, Any]",
+        timestamp: str,
+    ) -> None:
+        """Generates a summary report of the pipeline run."""
         summary = {
             "total_crates": len(data),
             "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
-            "most_popular": sorted([{
-                "name": c.name,
-                "score": c.score or 0,
-                "downloads": c.downloads,
-                "github_stars": c.github_stars
-            } for c in data], key=lambda x: x["score"], reverse=True)[:5],
-            "most_depended_upon": dependency_data.get("most_depended", [])[:5]
+            "most_popular": sorted(
+                [
+                    {
+                        "name": c.name,
+                        "score": c.score or 0,
+                        "downloads": c.downloads,
+                        "github_stars": c.github_stars,
+                    }
+                    for c in data
+                ],
+                key=lambda x: x.get("score", 0),
+                reverse=True,
+            )[:10],
+            "most_depended_upon": dependency_data.get("most_depended", [])[:10],
         }
 
-        summary_file = os.path.join(
-            self.output_dir,
-            f"summary_report_{timestamp}.json")
-        with open(summary_file, "w") as f:
+        summary_path = os.path.join(self.output_dir, f"summary_report_{timestamp}.json")
+        with open(summary_path, "w") as f:
             json.dump(summary, f, indent=2)
 
-        logging.info(f"Results saved to {self.output_dir}/")
-
-    async def run(self):
-        """Main pipeline execution flow (async)"""
+    async def run(self) -> Union["tuple[List[EnrichedCrate], Dict[str, Any]]", None]:
+        """Main pipeline execution flow."""
         start_time = time.time()
+        if not self.crates:
+            logging.error("No crates to process. Exiting.")
+            return None
+
         logging.info(f"Processing {len(self.crates)} crates...")
 
-        # Process in batches
-        all_enriched = []
-        crate_batches = [self.crates[i:i + self.config.batch_size]
-                         for i in range(0, len(self.crates), self.config.batch_size)]
+        all_enriched: "List[EnrichedCrate]" = []
+        batch_size = self.config.batch_size
+        crate_batches = [
+            self.crates[i : i + batch_size]
+            for i in range(0, len(self.crates), batch_size)
+        ]
 
-        for batch_num, batch in enumerate(crate_batches):
+        for i, batch_names in enumerate(crate_batches):
             logging.info(
-                f"Processing batch {batch_num + 1}/{len(crate_batches)} ({len(batch)} crates)")
-
-            # Fetch metadata (async)
-            batch_data = await self.fetch_metadata_batch(batch)
+                f"Processing batch {i + 1}/{len(crate_batches)} "
+                f"({len(batch_names)} crates)"
+            )
+
+            # Fetch metadata
+            metadata_batch = await self.fetch_metadata_batch(batch_names)
+            if not metadata_batch:
+                logging.warning(f"Batch {i+1} yielded no metadata. Skipping.")
+                continue
 
-            # Enrich the batch (async)
-            enriched_batch = await self.enrich_batch(batch_data)
+            # Enrich the batch
+            enriched_batch = await self.enrich_batch(metadata_batch)
             all_enriched.extend(enriched_batch)
 
-            # Save checkpoint after each batch
-            self.save_checkpoint(all_enriched, "batch_checkpoint")
+            # Save checkpoint
+            self.save_checkpoint(all_enriched, f"checkpoint_batch_{i + 1}")
             logging.info(
-                f"Completed batch {batch_num + 1}, processed {len(all_enriched)}/{len(self.crates)} crates so far")
-
-            # Optional: Add source analysis for some crates
-            if batch_num < 2:  # Only do detailed analysis for first 2 batches
-                for crate in enriched_batch:
-                    try:
-                        crate.source_analysis = SourceAnalyzer.analyze_crate_source(
-                            crate)
-                        crate.security = SecurityAnalyzer.check_security_metrics(
-                            crate)
-                        crate.user_behavior = UserBehaviorAnalyzer.fetch_user_behavior_data(
-                            crate)
-                        logging.info(
-                            f"Advanced analysis completed for {crate.name}")
-                    except Exception as e:
-                        logging.warning(
-                            f"Advanced analysis failed for {crate.name}: {str(e)}")
-
-        # Step 3: Perform dependency analysis
+                f"Completed batch {i + 1}, "
+                f"processed {len(all_enriched)}/{len(self.crates)} crates"
+            )
+
+        # Final analysis and saving
         logging.info("Analyzing crate dependencies...")
         dependency_analysis = self.analyze_dependencies(all_enriched)
-
         self.save_final_output(all_enriched, dependency_analysis)
 
-        # Final summary
         duration = time.time() - start_time
-        logging.info(
-            f"✅ Done. Enriched {len(all_enriched)} crates in {duration:.2f}s")
-
+        logging.info(f"[OK] Done. Enriched {len(all_enriched)} crates in {duration:.2f}s")
         return all_enriched, dependency_analysis
407
  return all_enriched, dependency_analysis