rust-crate-pipeline 1.2.6-py3-none-any.whl → 1.5.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,14 +3,26 @@ import os
  import time
  import logging
  import json
- from concurrent.futures import ThreadPoolExecutor, as_completed
- from tqdm import tqdm
+ import asyncio
  from typing import List, Dict, Optional
  from .config import PipelineConfig, CrateMetadata, EnrichedCrate
  from .network import CrateAPIClient, GitHubBatchClient
  from .ai_processing import LLMEnricher
  from .analysis import SourceAnalyzer, SecurityAnalyzer, UserBehaviorAnalyzer, DependencyAnalyzer

+ # Import enhanced scraping capabilities
+ try:
+     import sys
+     sys.path.append(os.path.dirname(os.path.dirname(__file__)))
+     from enhanced_scraping import CrateDocumentationScraper, EnhancedScrapingResult
+     enhanced_scraping_available = True
+ except ImportError:
+     enhanced_scraping_available = False
+     CrateDocumentationScraper = None
+     EnhancedScrapingResult = None
+     logging.warning("Enhanced scraping not available - using basic methods")
+
+
  class CrateDataPipeline:
      def __init__(self, config: PipelineConfig):
          self.config = config
@@ -18,8 +30,26 @@ class CrateDataPipeline:
          self.github_client = GitHubBatchClient(config)
          self.enricher = LLMEnricher(config)
          self.crates = self.get_crate_list()
-         self.output_dir = self._create_output_dir()
-
+         self.output_dir = self._create_output_dir() # Initialize enhanced scraping if available
+         self.enhanced_scraper = None
+         if enhanced_scraping_available and CrateDocumentationScraper is not None and hasattr(config, 'enable_crawl4ai'):
+             try:
+                 self.enhanced_scraper = CrateDocumentationScraper(
+                     enable_crawl4ai=config.enable_crawl4ai)
+                 logging.info("✅ Enhanced scraping with Crawl4AI enabled")
+             except Exception as e:
+                 logging.warning(
+                     f"❌ Failed to initialize enhanced scraping: {e}")
+         elif enhanced_scraping_available and CrateDocumentationScraper is not None:
+             try:
+                 self.enhanced_scraper = CrateDocumentationScraper(
+                     enable_crawl4ai=True)
+                 logging.info(
+                     "✅ Enhanced scraping with Crawl4AI enabled (default)")
+             except Exception as e:
+                 logging.warning(
+                     f"❌ Failed to initialize enhanced scraping: {e}")
+
      def _create_output_dir(self) -> str:
          timestamp = time.strftime("%Y%m%d-%H%M%S")
          output_dir = f"crate_data_{timestamp}"
@@ -30,167 +60,190 @@ class CrateDataPipeline:
          """Return a comprehensive list of all high-value crates to process"""
          crates = [
              # Web frameworks and servers
-             "actix-web", "rocket", "axum", "warp", "tower", "tide", "gotham", "iron",
+             "actix-web", "rocket", "axum", "warp", "tower", "tide", "gotham", "iron",
              "nickel", "rouille", "thruster", "poem", "salvo", "viz", "ntex", "may-minihttp",
              "tiny_http", "httptest", "mockito", "wiremock",
-
+
              # Async runtimes and utilities
-             "tokio", "tokio-stream", "async-trait", "futures", "async-std", "smol",
+             "tokio", "tokio-stream", "async-trait", "futures", "async-std", "smol",
              "embassy", "embassy-executor", "embassy-time", "embassy-sync", "async-channel",
              "async-broadcast", "async-lock", "async-once", "async-recursion", "futures-util",
              "futures-channel", "futures-timer", "futures-test", "pin-project", "pin-project-lite",
-
+
              # Serialization/deserialization
              "serde", "serde_json", "serde_yaml", "bincode", "toml", "ron", "postcard",
              "ciborium", "rmp-serde", "quick-xml", "roxmltree", "serde_cbor", "serde_derive",
              "serde_repr", "serde_with", "serde_bytes", "flexbuffers", "bson", "avro-rs",
-
+
              # Error handling and debugging
              "anyhow", "thiserror", "eyre", "color-eyre", "miette", "fehler", "snafu",
              "failure", "quick-error", "derive_more", "displaydoc", "backtrace", "better-panic",
-             # Command line and terminal
+             # Command line and terminal
              "clap", "structopt", "argh", "gumdrop", "docopt", "getopts", "pico-args",
-             "crossterm", "termion", "console", "indicatif", "dialoguer", "termcolor",
+             "crossterm", "termion", "console", "indicatif", "dialoguer", "termcolor",
              "colored", "yansi", "owo-colors", "nu-ansi-term", "terminal_size",
-             # Utilities and general purpose
-             "rand", "uuid", "itertools", "num", "cfg-if", "bytes", "mime",
+             # Utilities and general purpose
+             "rand", "uuid", "itertools", "num", "cfg-if", "bytes", "mime",
              "form_urlencoded", "csv", "once_cell", "base64", "flate2", "tar", "dirs",
-             "walkdir", "glob", "bitflags", "indexmap", "smallvec", "arrayvec", "tinyvec",
-             "ahash", "fxhash", "rustc-hash", "seahash", "siphasher", "wyhash", "xxhash-rust",
+             "walkdir", "glob", "bitflags", "indexmap", "smallvec", "arrayvec", "tinyvec",
+             "ahash", "fxhash", "rustc-hash", "seahash", "siphasher", "wyhash", "xxhash-rust",
              "getrandom", "fastrand", "nanorand", "url", "percent-encoding", "unicode-segmentation",
              "unicode-normalization", "unicode-width", "memchr", "aho-corasick", "bstr",
-             # HTTP clients and servers
-             "reqwest", "hyper", "surf", "ureq", "attohttpc", "isahc", "curl", "libcurl-sys",
+             # HTTP clients and servers
+             "reqwest", "hyper", "surf", "ureq", "attohttpc", "isahc", "curl", "libcurl-sys",
              "http", "http-body", "httparse", "hyper-tls", "hyper-rustls", "native-tls",
              "webpki", "webpki-roots",
-
+
              # Database and storage
              "sqlx", "diesel", "postgres", "rusqlite", "mysql", "mongodb", "redis",
              "tokio-postgres", "deadpool-postgres", "bb8", "r2d2", "sea-orm", "rbatis",
              "sled", "rocksdb", "lmdb", "redb", "pickledb", "persy", "heed", "fjall",
-             # Concurrency and parallelism
+             # Concurrency and parallelism
              "rayon", "crossbeam", "crossbeam-channel", "crossbeam-utils", "crossbeam-epoch",
              "crossbeam-deque", "parking_lot", "spin", "atomic", "arc-swap", "dashmap",
              "flume", "kanal", "tokio-util", "futures-concurrency",
-             # Protocol buffers, gRPC, and messaging
-             "prost", "tonic", "protobuf", "grpcio", "tarpc", "capnp", "rmp",
+             # Protocol buffers, gRPC, and messaging
+             "prost", "tonic", "protobuf", "grpcio", "tarpc", "capnp", "rmp",
              "zmq", "nanomsg", "nats", "rdkafka", "pulsar", "lapin", "amqp", "rumqttc",
-             # Procedural macros and metaprogramming
+             # Procedural macros and metaprogramming
              "syn", "quote", "proc-macro2", "proc-macro-crate", "proc-macro-error",
              "darling", "derive_builder", "strum", "strum_macros",
              "enum-iterator", "num-derive", "num-traits", "paste", "lazy_static",
-
+
              # Cryptography and security
              "ring", "rustls", "openssl", "sha2", "sha3", "blake2", "blake3", "md5",
              "hmac", "pbkdf2", "scrypt", "argon2", "bcrypt", "chacha20poly1305",
              "aes-gcm", "rsa", "ed25519-dalek", "x25519-dalek", "curve25519-dalek",
              "secp256k1", "k256", "p256", "ecdsa", "signature", "rand_core",
-
+
              # Game development and graphics
              "bevy", "macroquad", "ggez", "piston", "winit", "wgpu", "vulkano", "glium",
              "three-d", "kiss3d", "nalgebra", "cgmath", "glam", "ultraviolet", "mint",
              "image", "imageproc", "resvg", "tiny-skia", "lyon", "femtovg", "skulpin",
-             # Networking and protocols
+             # Networking and protocols
              "socket2", "mio", "polling", "async-io", "calloop", "quinn",
              "rustls-pemfile", "trust-dns", "hickory-dns", "async-h1", "h2", "h3",
              "websocket", "tokio-tungstenite", "tungstenite", "ws", "warp-ws",
-
+
              # Text processing and parsing
              "regex", "regex-syntax", "pest", "pest_derive", "nom", "combine", "winnow",
              "lalrpop", "chumsky", "logos", "lex", "yacc", "tree-sitter", "syntect",
              "pulldown-cmark", "comrak", "markdown", "ammonia", "scraper", "kuchiki",
-
+
              # System programming and OS interfaces
              "libc", "winapi", "windows", "nix", "users", "sysinfo", "procfs", "psutil",
              "notify", "inotify", "hotwatch", "signal-hook", "ctrlc", "daemonize",
              "fork", "shared_memory", "memmap2", "mlock", "caps", "uzers",
-             # Testing and development tools
+             # Testing and development tools
              "criterion", "proptest", "quickcheck", "rstest", "serial_test", "mockall",
              "httpmock", "assert_cmd", "assert_fs", "predicates", "tempfile",
              "insta", "goldenfile", "similar", "difference", "pretty_assertions",
-
+
              # Configuration and environment
              "config", "figment", "envy", "dotenv", "confy", "directories", "app_dirs",
              "etcetera", "platform-dirs", "home", "which", "dunce", "normpath",
-
+
              # Logging and observability
              "log", "env_logger", "tracing", "tracing-subscriber", "tracing-futures",
              "tracing-actix-web", "tracing-log", "slog", "fern", "flexi_logger",
              "log4rs", "simplelog", "stderrlog", "pretty_env_logger", "fast_log",
-
+
              # Time and date
              "chrono", "time", "humantime", "chrono-tz", "chrono-english", "ical",
              "cron", "tokio-cron-scheduler", "job_scheduler", "delay_timer",
-
+
              # Machine Learning & AI
-             "tokenizers", "safetensors", "linfa", "ndarray", "smartcore", "burn",
-             "tract-core", "tract-onnx", "tract-hir", "tract-linalg", "tract-data",
-             "tract-nnef", "tract-onnx-opl", "tract-pulse", "tract-pulse-opl",
-             "tract-nnef-resources", "tch", "torch-sys", "ort", "ort-sys", "candle-core",
-             "candle-nn", "candle-transformers", "candle-kernels", "candle-onnx",
+             "tokenizers", "safetensors", "linfa", "ndarray", "smartcore", "burn",
+             "tract-core", "tract-onnx", "tract-hir", "tract-linalg", "tract-data",
+             "tract-nnef", "tract-onnx-opl", "tract-pulse", "tract-pulse-opl",
+             "tract-nnef-resources", "tch", "torch-sys", "ort", "ort-sys", "candle-core",
+             "candle-nn", "candle-transformers", "candle-kernels", "candle-onnx",
              "candle-metal-kernels", "tiktoken-rs", "tensorflow", "tensorflow-sys",
-             "onnxruntime", "onnxruntime-sys", "onnx-protobuf", "llama-cpp-2",
-             "llama-cpp-sys-2", "llm", "llm-samplers", "llm-chain", "llm-chain-openai",
-             "llama-core", "llamaedge", "openai", "openai-api-rs", "openai_dive",
-             "genai", "aleph-alpha-client", "llm_api_access", "ollama-rs",
-             "rust-bert", "fastembed", "hf-hub", "whisper-rs-sys", "toktrie",
+             "onnxruntime", "onnxruntime-sys", "onnx-protobuf", "llama-cpp-2",
+             "llama-cpp-sys-2", "llm", "llm-samplers", "llm-chain", "llm-chain-openai", "llama-core", "llamaedge", "openai", "openai-api-rs", "openai_dive",
+             "genai", "aleph-alpha-client", "llm_api_access", "ollama-rs",
+             "rust-bert", "fastembed", "hf-hub", "whisper-rs-sys", "toktrie",
              "toktrie_hf_tokenizers", "toktrie_hf_downloader", "rust_tokenizers",
          ]
-
+
          if limit is not None:
              return crates[:limit]
          return crates
 
-     def fetch_metadata_batch(self, crate_names: List[str]) -> List[CrateMetadata]:
-         """Fetch metadata for a batch of crates in parallel"""
-         with ThreadPoolExecutor(max_workers=self.config.n_workers) as executor:
-             futures = {executor.submit(self.api_client.fetch_crate_metadata, name): name
-                        for name in crate_names}
-
-             results = []
-             for future in as_completed(futures):
-                 crate_name = futures[future]
-                 try:
-                     data = future.result()
-                     if data:
-                         # Convert dict to CrateMetadata
-                         crate_metadata = CrateMetadata(
-                             name=data.get("name", ""),
-                             version=data.get("version", ""),
-                             description=data.get("description", ""),
-                             repository=data.get("repository", ""),
-                             keywords=data.get("keywords", []),
-                             categories=data.get("categories", []),
-                             readme=data.get("readme", ""),
-                             downloads=data.get("downloads", 0),
-                             github_stars=data.get("github_stars", 0),
-                             dependencies=data.get("dependencies", []),
-                             features=data.get("features", []),
-                             code_snippets=data.get("code_snippets", []),
-                             readme_sections=data.get("readme_sections", {}),
-                             librs_downloads=data.get("librs_downloads"),
-                             source=data.get("source", "crates.io")
-                         )
-                         results.append(crate_metadata)
-                         logging.info(f"Fetched metadata for {crate_name}")
-                 except Exception as e:
-                     logging.error(f"Failed to fetch metadata for {crate_name}: {str(e)}")
-
-             return results
-
-     def enrich_batch(self, batch: List[CrateMetadata]) -> List[EnrichedCrate]:
-         """Enrich a batch of crates with GitHub stats and AI"""
+     async def fetch_metadata_batch(
+             self,
+             crate_names: List[str]) -> List[CrateMetadata]:
+         """Fetch metadata for a batch of crates using asyncio-based parallel processing
+
+         Each coroutine processes completely independent crate data, ensuring safety.
+         No shared state is modified - each coroutine only reads from self.api_client and
+         returns independent results.
+         """
+         results = []
+
+         async def fetch_single_crate_safe(crate_name: str) -> Optional[CrateMetadata]:
+             try:
+                 # If api_client has an async method, use it; otherwise, run in executor
+                 if hasattr(self.api_client, 'fetch_crate_metadata_async'):
+                     data = await self.api_client.fetch_crate_metadata_async(crate_name)
+                 else:
+                     loop = asyncio.get_running_loop()
+                     data = await loop.run_in_executor(None, self.api_client.fetch_crate_metadata, crate_name)
+                 if data:
+                     return CrateMetadata(
+                         name=data.get("name", ""),
+                         version=data.get("version", ""),
+                         description=data.get("description", ""),
+                         repository=data.get("repository", ""),
+                         keywords=data.get("keywords", []),
+                         categories=data.get("categories", []),
+                         readme=data.get("readme", ""),
+                         downloads=data.get("downloads", 0),
+                         github_stars=data.get("github_stars", 0),
+                         dependencies=data.get("dependencies", []),
+                         features=data.get("features", []),
+                         code_snippets=data.get("code_snippets", []),
+                         readme_sections=data.get("readme_sections", {}),
+                         librs_downloads=data.get("librs_downloads"),
+                         source=data.get("source", "crates.io")
+                     )
+                 return None
+             except Exception as e:
+                 logging.error(f"Error fetching {crate_name}: {e}")
+                 return None
+
+         # Use asyncio.gather for parallel async processing
+         tasks = [fetch_single_crate_safe(name) for name in crate_names]
+         results_raw = await asyncio.gather(*tasks)
+         results = [r for r in results_raw if r is not None]
+         for crate in results:
+             logging.info(f"Fetched metadata for {crate.name}")
+         return results
+
+     # Remove the async methods that are no longer needed
+     # async def _fetch_single_crate_async(self, crate_name: str) ->
+     # Optional[Dict]:
+
+     async def enrich_batch(
+             self,
+             batch: List[CrateMetadata]) -> List[EnrichedCrate]:
+         """Enrich a batch of crates with GitHub stats, enhanced scraping, and AI"""
          # Add GitHub stats first
-         github_repos = [c.repository for c in batch if "github.com" in c.repository]
+         github_repos = [
+             c.repository for c in batch if "github.com" in c.repository]
          repo_stats = self.github_client.batch_get_repo_stats(github_repos)
-
+
          # Update crates with GitHub info
          for crate in batch:
              repo_url = crate.repository
              if repo_url in repo_stats:
                  stats = repo_stats[repo_url]
                  crate.github_stars = stats.get("stargazers_count", 0)
-
+
+         # Enhanced scraping if available
+         if self.enhanced_scraper:
+             batch = asyncio.run(self._enhance_with_scraping(batch))
+
          # Now enrich with AI
          enriched_batch = []
          for crate in batch:
@@ -203,9 +256,80 @@ class CrateDataPipeline:
                  # Add the crate with just the fields we have
                  enriched_dict = crate.__dict__.copy()
                  enriched_batch.append(EnrichedCrate(**enriched_dict))
-
+
          return enriched_batch

+     async def _enhance_with_scraping(
+             self, batch: List[CrateMetadata]) -> List[CrateMetadata]:
+         """Enhance crates with advanced web scraping data"""
+         enhanced_batch = []
+
+         for crate in batch:
+             try: # Scrape comprehensive documentation
+                 scraping_results = await self.enhanced_scraper.scrape_crate_info(crate.name)
+
+                 # Integrate scraping results into crate metadata
+                 enhanced_crate = self._integrate_scraping_results(
+                     crate, scraping_results)
+                 enhanced_batch.append(enhanced_crate)
+
+                 logging.info(
+                     f"Enhanced scraping for {crate.name}: {len(scraping_results)} sources")
+
+             except Exception as e:
+                 logging.warning(
+                     f"Enhanced scraping failed for {crate.name}: {e}")
+                 enhanced_batch.append(crate)
+
+         return enhanced_batch
+
+     def _integrate_scraping_results(self,
+                                     crate: CrateMetadata,
+                                     scraping_results: Dict[str,
+                                                            EnhancedScrapingResult]) -> CrateMetadata:
+         """Integrate enhanced scraping results into crate metadata"""
+         # Create a copy of the crate to avoid modifying the original
+         enhanced_crate = CrateMetadata(**crate.__dict__)
+
+         # Add enhanced scraping data
+         enhanced_crate.enhanced_scraping = {}
+
+         for source, result in scraping_results.items():
+             if result.error:
+                 continue
+
+             enhanced_crate.enhanced_scraping[source] = {
+                 'title': result.title,
+                 'quality_score': result.quality_score,
+                 'extraction_method': result.extraction_method,
+                 'structured_data': result.structured_data,
+                 'content_length': len(result.content)
+             } # Update README if we got better content
+             if source == 'docs_rs' and result.quality_score > 0.7:
+                 if not enhanced_crate.readme or len(
+                         result.content) > len(
+                         enhanced_crate.readme):
+                     enhanced_crate.readme = result.content
+                     logging.info(
+                         f"Updated README for {crate.name} from {source}")
+
+             # Extract additional metadata from structured data
+             if result.structured_data:
+                 if 'features' in result.structured_data and isinstance(
+                         result.structured_data['features'], list):
+                     enhanced_crate.enhanced_features = result.structured_data['features']
+
+                 if 'dependencies' in result.structured_data and isinstance(
+                         result.structured_data['dependencies'], list):
+                     enhanced_crate.enhanced_dependencies = result.structured_data['dependencies']
+
+                 if 'examples' in result.structured_data and isinstance(
+                         result.structured_data['examples'], list):
+                     enhanced_crate.code_snippets.extend(
+                         result.structured_data['examples'])
+
+         return enhanced_crate
+
      def analyze_dependencies(self, crates: List[EnrichedCrate]) -> Dict:
          """Analyze dependencies between crates"""
          return DependencyAnalyzer.analyze_dependencies(crates)
@@ -214,42 +338,52 @@ class CrateDataPipeline:
          """Save processing checkpoint with status metadata"""
          timestamp = time.strftime("%Y%m%d-%H%M%S")
          filename = os.path.join(self.output_dir, f"{prefix}_{timestamp}.jsonl")
-
+
          with open(filename, "w") as f:
              for item in data:
                  # Convert to dict for serialization
                  item_dict = item.__dict__.copy()
                  f.write(json.dumps(item_dict) + "\n")
-
+
          # Save status metadata
          status = {
              "timestamp": timestamp,
              "total_crates": len(data),
-             "processed_crates": sum(1 for c in data if c.use_case is not None),
-             "advanced_analysis": sum(1 for c in data if c.source_analysis is not None),
-             "checkpoint_file": filename
-         }
-
-         status_file = os.path.join(self.output_dir, f"{prefix}_status_{timestamp}.json")
+             "processed_crates": sum(
+                 1 for c in data if c.use_case is not None),
+             "advanced_analysis": sum(
+                 1 for c in data if c.source_analysis is not None),
+             "checkpoint_file": filename}
+
+         status_file = os.path.join(
+             self.output_dir,
+             f"{prefix}_status_{timestamp}.json")
          with open(status_file, "w") as f:
              json.dump(status, f, indent=2)
-
+
          logging.info(f"Saved checkpoint to {filename}")
          return filename

-     def save_final_output(self, data: List[EnrichedCrate], dependency_data: Dict):
+     def save_final_output(
+             self,
+             data: List[EnrichedCrate],
+             dependency_data: Dict):
          """Save final enriched data and analysis"""
          timestamp = time.strftime("%Y%m%d-%H%M%S")
-
+
          # Save main enriched data
-         final_output = os.path.join(self.output_dir, f"enriched_crate_metadata_{timestamp}.jsonl")
+         final_output = os.path.join(
+             self.output_dir,
+             f"enriched_crate_metadata_{timestamp}.jsonl")
          with open(final_output, "w") as f:
              for item in data:
                  item_dict = item.__dict__.copy()
                  f.write(json.dumps(item_dict) + "\n")
-
+
          # Save dependency analysis
-         dep_file = os.path.join(self.output_dir, f"dependency_analysis_{timestamp}.json")
+         dep_file = os.path.join(
+             self.output_dir,
+             f"dependency_analysis_{timestamp}.json")
          with open(dep_file, "w") as f:
              json.dump(dependency_data, f, indent=2)
 
@@ -265,57 +399,67 @@ class CrateDataPipeline:
              } for c in data], key=lambda x: x["score"], reverse=True)[:5],
              "most_depended_upon": dependency_data.get("most_depended", [])[:5]
          }
-
-         summary_file = os.path.join(self.output_dir, f"summary_report_{timestamp}.json")
+
+         summary_file = os.path.join(
+             self.output_dir,
+             f"summary_report_{timestamp}.json")
          with open(summary_file, "w") as f:
              json.dump(summary, f, indent=2)

          logging.info(f"Results saved to {self.output_dir}/")

-     def run(self):
-         """Main pipeline execution flow"""
+     async def run(self):
+         """Main pipeline execution flow (async)"""
          start_time = time.time()
          logging.info(f"Processing {len(self.crates)} crates...")
-
+
          # Process in batches
          all_enriched = []
-         crate_batches = [self.crates[i:i+self.config.batch_size]
+         crate_batches = [self.crates[i:i + self.config.batch_size]
                           for i in range(0, len(self.crates), self.config.batch_size)]

          for batch_num, batch in enumerate(crate_batches):
-             logging.info(f"Processing batch {batch_num+1}/{len(crate_batches)} ({len(batch)} crates)")
-
-             # Fetch metadata
-             batch_data = self.fetch_metadata_batch(batch)
-
-             # Enrich the batch
-             enriched_batch = self.enrich_batch(batch_data)
+             logging.info(
+                 f"Processing batch {batch_num + 1}/{len(crate_batches)} ({len(batch)} crates)")
+
+             # Fetch metadata (async)
+             batch_data = await self.fetch_metadata_batch(batch)
+
+             # Enrich the batch (async)
+             enriched_batch = await self.enrich_batch(batch_data)
              all_enriched.extend(enriched_batch)
-
+
              # Save checkpoint after each batch
              self.save_checkpoint(all_enriched, "batch_checkpoint")
-             logging.info(f"Completed batch {batch_num+1}, processed {len(all_enriched)}/{len(self.crates)} crates so far")
-
+             logging.info(
+                 f"Completed batch {batch_num + 1}, processed {len(all_enriched)}/{len(self.crates)} crates so far")
+
              # Optional: Add source analysis for some crates
              if batch_num < 2: # Only do detailed analysis for first 2 batches
                  for crate in enriched_batch:
                      try:
-                         crate.source_analysis = SourceAnalyzer.analyze_crate_source(crate)
-                         crate.security = SecurityAnalyzer.check_security_metrics(crate)
-                         crate.user_behavior = UserBehaviorAnalyzer.fetch_user_behavior_data(crate)
-                         logging.info(f"Advanced analysis completed for {crate.name}")
+                         crate.source_analysis = SourceAnalyzer.analyze_crate_source(
+                             crate)
+                         crate.security = SecurityAnalyzer.check_security_metrics(
+                             crate)
+                         crate.user_behavior = UserBehaviorAnalyzer.fetch_user_behavior_data(
+                             crate)
+                         logging.info(
+                             f"Advanced analysis completed for {crate.name}")
                      except Exception as e:
-                         logging.warning(f"Advanced analysis failed for {crate.name}: {str(e)}")
-
+                         logging.warning(
+                             f"Advanced analysis failed for {crate.name}: {str(e)}")
+
          # Step 3: Perform dependency analysis
          logging.info("Analyzing crate dependencies...")
          dependency_analysis = self.analyze_dependencies(all_enriched)
-
+
          # Save final results
          self.save_final_output(all_enriched, dependency_analysis)

          # Final summary
          duration = time.time() - start_time
-         logging.info(f"✅ Done. Enriched {len(all_enriched)} crates in {duration:.2f}s")
-
+         logging.info(
+             f"✅ Done. Enriched {len(all_enriched)} crates in {duration:.2f}s")
+
          return all_enriched, dependency_analysis
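
Because run(), fetch_metadata_batch(), and enrich_batch() are coroutines in 1.5.1, callers that previously invoked pipeline.run() directly now need an event loop. A minimal usage sketch, assuming the class lives in rust_crate_pipeline.pipeline and that PipelineConfig can be constructed with defaults (neither detail is shown in this diff):

    import asyncio

    from rust_crate_pipeline.config import PipelineConfig        # shown in the diff as ".config"
    from rust_crate_pipeline.pipeline import CrateDataPipeline   # module name assumed

    config = PipelineConfig()            # constructor arguments not shown here; defaults assumed
    pipeline = CrateDataPipeline(config)

    # run() is now async, so it must be awaited or driven by asyncio.run()
    enriched, dependency_analysis = asyncio.run(pipeline.run())
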
@@ -9,59 +9,65 @@ import logging
9
9
  import os
10
10
 
11
11
  # Production logging configuration
12
+
13
+
12
14
  def configure_production_logging():
13
15
  """Configure logging for production to reduce verbose warnings"""
14
-
16
+
15
17
  # Don't use basicConfig here - let main.py handle it
16
18
  # Just set specific loggers to less verbose levels
17
19
  logging.getLogger('requests').setLevel(logging.WARNING)
18
20
  logging.getLogger('urllib3').setLevel(logging.WARNING)
19
21
  logging.getLogger('requests_cache').setLevel(logging.WARNING)
20
-
22
+
21
23
  # If PRODUCTION environment variable is set, be even quieter
22
24
  if os.getenv('PRODUCTION', 'false').lower() == 'true':
23
25
  logging.getLogger().setLevel(logging.WARNING)
24
26
  logging.getLogger('rust_crate_pipeline').setLevel(logging.INFO)
25
27
 
28
+
26
29
  # Production-optimized settings
27
30
  PRODUCTION_SETTINGS = {
28
31
  # Reduced retries to minimize warnings
29
32
  'max_retries': 2,
30
33
  'validation_retries': 2,
31
-
34
+
32
35
  # GitHub API management
33
36
  'github_rate_limit_threshold': 100,
34
37
  'github_critical_threshold': 50,
35
-
36
- # LLM settings
38
+
39
+ # LLM settings
37
40
  'llm_timeout': 30,
38
41
  'llm_max_attempts': 2,
39
-
42
+
40
43
  # Logging preferences
41
44
  'quiet_mode': True,
42
45
  'log_level': 'INFO',
43
-
46
+
44
47
  # Performance settings
45
48
  'batch_size': 10,
46
49
  'checkpoint_interval': 10,
47
50
  'cache_ttl': 3600,
48
51
  }
49
52
 
53
+
50
54
  def get_production_config():
51
55
  """Get production configuration dictionary"""
52
56
  return PRODUCTION_SETTINGS.copy()
53
57
 
58
+
54
59
  def is_production():
55
60
  """Check if running in production mode"""
56
61
  return os.getenv('PRODUCTION', 'false').lower() == 'true'
57
62
 
63
+
58
64
  def setup_production_environment():
59
65
  """Set up the complete production environment"""
60
66
  configure_production_logging()
61
-
67
+
62
68
  # Set environment variables for quieter operation
63
69
  os.environ.setdefault('PYTHONWARNINGS', 'ignore::UserWarning')
64
-
70
+
65
71
  if is_production():
66
72
  print("🚀 Production mode enabled - optimized for minimal warnings")
67
73
  return get_production_config()