rust-crate-pipeline 1.2.6-py3-none-any.whl → 1.5.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rust_crate_pipeline/__init__.py +15 -6
- rust_crate_pipeline/ai_processing.py +260 -153
- rust_crate_pipeline/analysis.py +171 -160
- rust_crate_pipeline/config.py +23 -3
- rust_crate_pipeline/github_token_checker.py +30 -20
- rust_crate_pipeline/main.py +107 -45
- rust_crate_pipeline/network.py +109 -108
- rust_crate_pipeline/pipeline.py +269 -125
- rust_crate_pipeline/production_config.py +15 -9
- rust_crate_pipeline/utils/file_utils.py +14 -10
- rust_crate_pipeline/utils/logging_utils.py +25 -13
- rust_crate_pipeline/version.py +47 -2
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/METADATA +94 -9
- rust_crate_pipeline-1.5.1.dist-info/RECORD +19 -0
- rust_crate_pipeline-1.2.6.dist-info/RECORD +0 -19
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/WHEEL +0 -0
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/entry_points.txt +0 -0
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/licenses/LICENSE +0 -0
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/top_level.txt +0 -0
rust_crate_pipeline/pipeline.py
CHANGED
@@ -3,14 +3,26 @@ import os
 import time
 import logging
 import json
-
-from tqdm import tqdm
+import asyncio
 from typing import List, Dict, Optional
 from .config import PipelineConfig, CrateMetadata, EnrichedCrate
 from .network import CrateAPIClient, GitHubBatchClient
 from .ai_processing import LLMEnricher
 from .analysis import SourceAnalyzer, SecurityAnalyzer, UserBehaviorAnalyzer, DependencyAnalyzer
 
+# Import enhanced scraping capabilities
+try:
+    import sys
+    sys.path.append(os.path.dirname(os.path.dirname(__file__)))
+    from enhanced_scraping import CrateDocumentationScraper, EnhancedScrapingResult
+    enhanced_scraping_available = True
+except ImportError:
+    enhanced_scraping_available = False
+    CrateDocumentationScraper = None
+    EnhancedScrapingResult = None
+    logging.warning("Enhanced scraping not available - using basic methods")
+
+
 class CrateDataPipeline:
     def __init__(self, config: PipelineConfig):
         self.config = config
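The hunk above introduces an optional-dependency guard: the enhanced_scraping import is attempted once at module load, a module-level flag records whether it succeeded, and the class names are replaced with None placeholders on failure. A minimal, self-contained sketch of the same pattern, using a deliberately hypothetical module name (only enhanced_scraping and CrateDocumentationScraper are names taken from this diff):

import logging

try:
    from some_optional_backend import FancyScraper  # hypothetical optional dependency
    scraper_available = True
except ImportError:
    FancyScraper = None
    scraper_available = False
    logging.warning("Optional scraper not available - using basic methods")

def make_scraper():
    # Callers branch on the flag (or the None placeholder) instead of re-importing.
    return FancyScraper() if scraper_available else None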
@@ -18,8 +30,26 @@ class CrateDataPipeline:
         self.github_client = GitHubBatchClient(config)
         self.enricher = LLMEnricher(config)
         self.crates = self.get_crate_list()
-        self.output_dir = self._create_output_dir()
-
+        self.output_dir = self._create_output_dir()  # Initialize enhanced scraping if available
+        self.enhanced_scraper = None
+        if enhanced_scraping_available and CrateDocumentationScraper is not None and hasattr(config, 'enable_crawl4ai'):
+            try:
+                self.enhanced_scraper = CrateDocumentationScraper(
+                    enable_crawl4ai=config.enable_crawl4ai)
+                logging.info("✅ Enhanced scraping with Crawl4AI enabled")
+            except Exception as e:
+                logging.warning(
+                    f"❌ Failed to initialize enhanced scraping: {e}")
+        elif enhanced_scraping_available and CrateDocumentationScraper is not None:
+            try:
+                self.enhanced_scraper = CrateDocumentationScraper(
+                    enable_crawl4ai=True)
+                logging.info(
+                    "✅ Enhanced scraping with Crawl4AI enabled (default)")
+            except Exception as e:
+                logging.warning(
+                    f"❌ Failed to initialize enhanced scraping: {e}")
+
     def _create_output_dir(self) -> str:
         timestamp = time.strftime("%Y%m%d-%H%M%S")
         output_dir = f"crate_data_{timestamp}"
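The constructor now wires up the optional scraper in tiers: respect an explicit config.enable_crawl4ai flag when the config defines it, otherwise enable the feature by default, and swallow any initialization failure so the pipeline still runs. A condensed sketch of that decision logic, assuming only what the diff shows (the enable_crawl4ai attribute and a scraper class that may be None):

import logging

def init_optional_scraper(config, scraper_cls):
    # Returns a scraper instance or None; never raises.
    if scraper_cls is None:
        return None
    enabled = getattr(config, "enable_crawl4ai", True)  # default to enabled when the flag is absent
    try:
        return scraper_cls(enable_crawl4ai=enabled)
    except Exception as exc:
        logging.warning("Failed to initialize enhanced scraping: %s", exc)
        return None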
@@ -30,167 +60,190 @@ class CrateDataPipeline:
         """Return a comprehensive list of all high-value crates to process"""
         crates = [
             # Web frameworks and servers
-            "actix-web", "rocket", "axum", "warp", "tower", "tide", "gotham", "iron",
+            "actix-web", "rocket", "axum", "warp", "tower", "tide", "gotham", "iron",
             "nickel", "rouille", "thruster", "poem", "salvo", "viz", "ntex", "may-minihttp",
             "tiny_http", "httptest", "mockito", "wiremock",
-
+
             # Async runtimes and utilities
-            "tokio", "tokio-stream", "async-trait", "futures", "async-std", "smol",
+            "tokio", "tokio-stream", "async-trait", "futures", "async-std", "smol",
             "embassy", "embassy-executor", "embassy-time", "embassy-sync", "async-channel",
             "async-broadcast", "async-lock", "async-once", "async-recursion", "futures-util",
             "futures-channel", "futures-timer", "futures-test", "pin-project", "pin-project-lite",
-
+
             # Serialization/deserialization
             "serde", "serde_json", "serde_yaml", "bincode", "toml", "ron", "postcard",
             "ciborium", "rmp-serde", "quick-xml", "roxmltree", "serde_cbor", "serde_derive",
             "serde_repr", "serde_with", "serde_bytes", "flexbuffers", "bson", "avro-rs",
-
+
             # Error handling and debugging
             "anyhow", "thiserror", "eyre", "color-eyre", "miette", "fehler", "snafu",
             "failure", "quick-error", "derive_more", "displaydoc", "backtrace", "better-panic",
-
+            # Command line and terminal
             "clap", "structopt", "argh", "gumdrop", "docopt", "getopts", "pico-args",
-            "crossterm", "termion", "console", "
+            "crossterm", "termion", "console", "indicatif", "dialoguer", "termcolor",
             "colored", "yansi", "owo-colors", "nu-ansi-term", "terminal_size",
-
-            "rand", "uuid", "itertools", "num", "cfg-
+            # Utilities and general purpose
+            "rand", "uuid", "itertools", "num", "cfg-if", "bytes", "mime",
             "form_urlencoded", "csv", "once_cell", "base64", "flate2", "tar", "dirs",
-            "walkdir", "glob", "bitflags", "indexmap", "smallvec", "arrayvec", "tinyvec",
-            "ahash", "fxhash", "rustc-hash", "seahash", "siphasher", "wyhash", "xxhash-rust",
+            "walkdir", "glob", "bitflags", "indexmap", "smallvec", "arrayvec", "tinyvec",
+            "ahash", "fxhash", "rustc-hash", "seahash", "siphasher", "wyhash", "xxhash-rust",
             "getrandom", "fastrand", "nanorand", "url", "percent-encoding", "unicode-segmentation",
             "unicode-normalization", "unicode-width", "memchr", "aho-corasick", "bstr",
-
-            "reqwest", "hyper", "
+            # HTTP clients and servers
+            "reqwest", "hyper", "surf", "ureq", "attohttpc", "isahc", "curl", "libcurl-sys",
             "http", "http-body", "httparse", "hyper-tls", "hyper-rustls", "native-tls",
             "webpki", "webpki-roots",
-
+
             # Database and storage
             "sqlx", "diesel", "postgres", "rusqlite", "mysql", "mongodb", "redis",
             "tokio-postgres", "deadpool-postgres", "bb8", "r2d2", "sea-orm", "rbatis",
             "sled", "rocksdb", "lmdb", "redb", "pickledb", "persy", "heed", "fjall",
-
+            # Concurrency and parallelism
             "rayon", "crossbeam", "crossbeam-channel", "crossbeam-utils", "crossbeam-epoch",
             "crossbeam-deque", "parking_lot", "spin", "atomic", "arc-swap", "dashmap",
             "flume", "kanal", "tokio-util", "futures-concurrency",
-
-            "prost", "tonic", "
+            # Protocol buffers, gRPC, and messaging
+            "prost", "tonic", "protobuf", "grpcio", "tarpc", "capnp", "rmp",
             "zmq", "nanomsg", "nats", "rdkafka", "pulsar", "lapin", "amqp", "rumqttc",
-
+            # Procedural macros and metaprogramming
             "syn", "quote", "proc-macro2", "proc-macro-crate", "proc-macro-error",
             "darling", "derive_builder", "strum", "strum_macros",
             "enum-iterator", "num-derive", "num-traits", "paste", "lazy_static",
-
+
             # Cryptography and security
             "ring", "rustls", "openssl", "sha2", "sha3", "blake2", "blake3", "md5",
             "hmac", "pbkdf2", "scrypt", "argon2", "bcrypt", "chacha20poly1305",
             "aes-gcm", "rsa", "ed25519-dalek", "x25519-dalek", "curve25519-dalek",
             "secp256k1", "k256", "p256", "ecdsa", "signature", "rand_core",
-
+
             # Game development and graphics
             "bevy", "macroquad", "ggez", "piston", "winit", "wgpu", "vulkano", "glium",
             "three-d", "kiss3d", "nalgebra", "cgmath", "glam", "ultraviolet", "mint",
             "image", "imageproc", "resvg", "tiny-skia", "lyon", "femtovg", "skulpin",
-
+            # Networking and protocols
             "socket2", "mio", "polling", "async-io", "calloop", "quinn",
             "rustls-pemfile", "trust-dns", "hickory-dns", "async-h1", "h2", "h3",
             "websocket", "tokio-tungstenite", "tungstenite", "ws", "warp-ws",
-
+
             # Text processing and parsing
             "regex", "regex-syntax", "pest", "pest_derive", "nom", "combine", "winnow",
             "lalrpop", "chumsky", "logos", "lex", "yacc", "tree-sitter", "syntect",
             "pulldown-cmark", "comrak", "markdown", "ammonia", "scraper", "kuchiki",
-
+
             # System programming and OS interfaces
             "libc", "winapi", "windows", "nix", "users", "sysinfo", "procfs", "psutil",
             "notify", "inotify", "hotwatch", "signal-hook", "ctrlc", "daemonize",
             "fork", "shared_memory", "memmap2", "mlock", "caps", "uzers",
-
+            # Testing and development tools
             "criterion", "proptest", "quickcheck", "rstest", "serial_test", "mockall",
             "httpmock", "assert_cmd", "assert_fs", "predicates", "tempfile",
             "insta", "goldenfile", "similar", "difference", "pretty_assertions",
-
+
             # Configuration and environment
             "config", "figment", "envy", "dotenv", "confy", "directories", "app_dirs",
             "etcetera", "platform-dirs", "home", "which", "dunce", "normpath",
-
+
             # Logging and observability
             "log", "env_logger", "tracing", "tracing-subscriber", "tracing-futures",
             "tracing-actix-web", "tracing-log", "slog", "fern", "flexi_logger",
             "log4rs", "simplelog", "stderrlog", "pretty_env_logger", "fast_log",
-
+
             # Time and date
             "chrono", "time", "humantime", "chrono-tz", "chrono-english", "ical",
             "cron", "tokio-cron-scheduler", "job_scheduler", "delay_timer",
-
+
             # Machine Learning & AI
-            "tokenizers", "safetensors", "linfa", "ndarray", "smartcore", "burn",
-            "tract-core", "tract-onnx", "tract-hir", "tract-linalg", "tract-data",
-            "tract-
-            "tract-nnef-resources", "tch", "torch-sys", "ort", "ort-sys", "candle-core",
-            "candle-nn", "candle-transformers", "candle-kernels", "candle-onnx",
+            "tokenizers", "safetensors", "linfa", "ndarray", "smartcore", "burn",
+            "tract-core", "tract-onnx", "tract-hir", "tract-linalg", "tract-data",
+            "tract-nnef", "tract-onnx-opl", "tract-pulse", "tract-pulse-opl",
+            "tract-nnef-resources", "tch", "torch-sys", "ort", "ort-sys", "candle-core",
+            "candle-nn", "candle-transformers", "candle-kernels", "candle-onnx",
             "candle-metal-kernels", "tiktoken-rs", "tensorflow", "tensorflow-sys",
-            "onnxruntime", "onnxruntime-sys", "onnx-
-            "llama-cpp-sys-2", "llm", "llm-samplers", "llm-chain", "llm-chain-openai",
-            "rust-bert", "fastembed", "hf-hub", "whisper-rs-sys", "toktrie",
+            "onnxruntime", "onnxruntime-sys", "onnx-protobuf", "llama-cpp-2",
+            "llama-cpp-sys-2", "llm", "llm-samplers", "llm-chain", "llm-chain-openai", "llama-core", "llamaedge", "openai", "openai-api-rs", "openai_dive",
+            "genai", "aleph-alpha-client", "llm_api_access", "ollama-rs",
+            "rust-bert", "fastembed", "hf-hub", "whisper-rs-sys", "toktrie",
             "toktrie_hf_tokenizers", "toktrie_hf_downloader", "rust_tokenizers",
         ]
-
+
         if limit is not None:
             return crates[:limit]
         return crates
 
-    def fetch_metadata_batch(
-    )
+    async def fetch_metadata_batch(
+            self,
+            crate_names: List[str]) -> List[CrateMetadata]:
+        """Fetch metadata for a batch of crates using asyncio-based parallel processing
+
+        Each coroutine processes completely independent crate data, ensuring safety.
+        No shared state is modified - each coroutine only reads from self.api_client and
+        returns independent results.
+        """
+        results = []
+
+        async def fetch_single_crate_safe(crate_name: str) -> Optional[CrateMetadata]:
+            try:
+                # If api_client has an async method, use it; otherwise, run in executor
+                if hasattr(self.api_client, 'fetch_crate_metadata_async'):
+                    data = await self.api_client.fetch_crate_metadata_async(crate_name)
+                else:
+                    loop = asyncio.get_running_loop()
+                    data = await loop.run_in_executor(None, self.api_client.fetch_crate_metadata, crate_name)
+                if data:
+                    return CrateMetadata(
+                        name=data.get("name", ""),
+                        version=data.get("version", ""),
+                        description=data.get("description", ""),
+                        repository=data.get("repository", ""),
+                        keywords=data.get("keywords", []),
+                        categories=data.get("categories", []),
+                        readme=data.get("readme", ""),
+                        downloads=data.get("downloads", 0),
+                        github_stars=data.get("github_stars", 0),
+                        dependencies=data.get("dependencies", []),
+                        features=data.get("features", []),
+                        code_snippets=data.get("code_snippets", []),
+                        readme_sections=data.get("readme_sections", {}),
+                        librs_downloads=data.get("librs_downloads"),
+                        source=data.get("source", "crates.io")
+                    )
+                return None
+            except Exception as e:
+                logging.error(f"Error fetching {crate_name}: {e}")
+                return None
+
+        # Use asyncio.gather for parallel async processing
+        tasks = [fetch_single_crate_safe(name) for name in crate_names]
+        results_raw = await asyncio.gather(*tasks)
+        results = [r for r in results_raw if r is not None]
+        for crate in results:
+            logging.info(f"Fetched metadata for {crate.name}")
+        return results
+
+    # Remove the async methods that are no longer needed
+    # async def _fetch_single_crate_async(self, crate_name: str) ->
+    # Optional[Dict]:
+
+    async def enrich_batch(
+            self,
+            batch: List[CrateMetadata]) -> List[EnrichedCrate]:
+        """Enrich a batch of crates with GitHub stats, enhanced scraping, and AI"""
         # Add GitHub stats first
-        github_repos = [
+        github_repos = [
+            c.repository for c in batch if "github.com" in c.repository]
         repo_stats = self.github_client.batch_get_repo_stats(github_repos)
-
+
         # Update crates with GitHub info
         for crate in batch:
             repo_url = crate.repository
             if repo_url in repo_stats:
                 stats = repo_stats[repo_url]
                 crate.github_stars = stats.get("stargazers_count", 0)
-
+
+        # Enhanced scraping if available
+        if self.enhanced_scraper:
+            batch = asyncio.run(self._enhance_with_scraping(batch))
+
         # Now enrich with AI
         enriched_batch = []
         for crate in batch:
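The new fetch_metadata_batch fans out one coroutine per crate and, when the API client is synchronous, pushes the blocking call onto the default thread pool with run_in_executor. A self-contained sketch of that fan-out pattern (the fetch function and record fields here are stand-ins, not the package's real CrateAPIClient API):

import asyncio
import logging
from typing import List, Optional

def fetch_blocking(name: str) -> Optional[dict]:
    # Stand-in for a synchronous HTTP call such as a crates.io metadata fetch.
    return {"name": name, "downloads": 0}

async def fetch_batch(names: List[str]) -> List[dict]:
    loop = asyncio.get_running_loop()

    async def fetch_one(name: str) -> Optional[dict]:
        try:
            # Off-load the blocking call to the default thread pool so the
            # event loop can run many fetches concurrently.
            return await loop.run_in_executor(None, fetch_blocking, name)
        except Exception as exc:
            logging.error("Error fetching %s: %s", name, exc)
            return None

    results = await asyncio.gather(*(fetch_one(n) for n in names))
    return [r for r in results if r is not None]

if __name__ == "__main__":
    print(asyncio.run(fetch_batch(["serde", "tokio"])))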
@@ -203,9 +256,80 @@ class CrateDataPipeline:
                 # Add the crate with just the fields we have
                 enriched_dict = crate.__dict__.copy()
                 enriched_batch.append(EnrichedCrate(**enriched_dict))
-
+
         return enriched_batch
 
+    async def _enhance_with_scraping(
+            self, batch: List[CrateMetadata]) -> List[CrateMetadata]:
+        """Enhance crates with advanced web scraping data"""
+        enhanced_batch = []
+
+        for crate in batch:
+            try:  # Scrape comprehensive documentation
+                scraping_results = await self.enhanced_scraper.scrape_crate_info(crate.name)
+
+                # Integrate scraping results into crate metadata
+                enhanced_crate = self._integrate_scraping_results(
+                    crate, scraping_results)
+                enhanced_batch.append(enhanced_crate)
+
+                logging.info(
+                    f"Enhanced scraping for {crate.name}: {len(scraping_results)} sources")
+
+            except Exception as e:
+                logging.warning(
+                    f"Enhanced scraping failed for {crate.name}: {e}")
+                enhanced_batch.append(crate)
+
+        return enhanced_batch
+
+    def _integrate_scraping_results(self,
+                                    crate: CrateMetadata,
+                                    scraping_results: Dict[str,
+                                                           EnhancedScrapingResult]) -> CrateMetadata:
+        """Integrate enhanced scraping results into crate metadata"""
+        # Create a copy of the crate to avoid modifying the original
+        enhanced_crate = CrateMetadata(**crate.__dict__)
+
+        # Add enhanced scraping data
+        enhanced_crate.enhanced_scraping = {}
+
+        for source, result in scraping_results.items():
+            if result.error:
+                continue
+
+            enhanced_crate.enhanced_scraping[source] = {
+                'title': result.title,
+                'quality_score': result.quality_score,
+                'extraction_method': result.extraction_method,
+                'structured_data': result.structured_data,
+                'content_length': len(result.content)
+            }  # Update README if we got better content
+            if source == 'docs_rs' and result.quality_score > 0.7:
+                if not enhanced_crate.readme or len(
+                        result.content) > len(
+                        enhanced_crate.readme):
+                    enhanced_crate.readme = result.content
+                    logging.info(
+                        f"Updated README for {crate.name} from {source}")
+
+            # Extract additional metadata from structured data
+            if result.structured_data:
+                if 'features' in result.structured_data and isinstance(
+                        result.structured_data['features'], list):
+                    enhanced_crate.enhanced_features = result.structured_data['features']
+
+                if 'dependencies' in result.structured_data and isinstance(
+                        result.structured_data['dependencies'], list):
+                    enhanced_crate.enhanced_dependencies = result.structured_data['dependencies']
+
+                if 'examples' in result.structured_data and isinstance(
+                        result.structured_data['examples'], list):
+                    enhanced_crate.code_snippets.extend(
+                        result.structured_data['examples'])
+
+        return enhanced_crate
+
     def analyze_dependencies(self, crates: List[EnrichedCrate]) -> Dict:
         """Analyze dependencies between crates"""
         return DependencyAnalyzer.analyze_dependencies(crates)
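_integrate_scraping_results copies the metadata object before attaching scraped data, so callers never see partial mutation, and it only promotes scraped content to the README when it beats the existing one on quality and length. A reduced sketch of that merge policy using stand-in dataclasses (the real CrateMetadata and EnhancedScrapingResult carry more fields than shown here):

from dataclasses import dataclass, field
from typing import Dict, List

@dataclass
class Doc:  # stand-in for CrateMetadata
    name: str
    readme: str = ""
    snippets: List[str] = field(default_factory=list)

@dataclass
class ScrapeResult:  # stand-in for EnhancedScrapingResult
    content: str
    quality_score: float
    error: str = ""

def integrate(doc: Doc, results: Dict[str, ScrapeResult]) -> Doc:
    merged = Doc(**doc.__dict__)  # copy first; the caller's object is never mutated
    for source, result in results.items():
        if result.error:
            continue  # skip failed sources entirely
        # Promote scraped content only when it is high quality and longer than what we have.
        if result.quality_score > 0.7 and len(result.content) > len(merged.readme):
            merged.readme = result.content
    return merged

if __name__ == "__main__":
    doc = Doc(name="serde", readme="short")
    print(integrate(doc, {"docs_rs": ScrapeResult("much longer scraped readme", 0.9)}).readme)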
@@ -214,42 +338,52 @@
         """Save processing checkpoint with status metadata"""
         timestamp = time.strftime("%Y%m%d-%H%M%S")
         filename = os.path.join(self.output_dir, f"{prefix}_{timestamp}.jsonl")
-
+
         with open(filename, "w") as f:
             for item in data:
                 # Convert to dict for serialization
                 item_dict = item.__dict__.copy()
                 f.write(json.dumps(item_dict) + "\n")
-
+
         # Save status metadata
         status = {
             "timestamp": timestamp,
             "total_crates": len(data),
-            "processed_crates": sum(
+            "processed_crates": sum(
+                1 for c in data if c.use_case is not None),
+            "advanced_analysis": sum(
+                1 for c in data if c.source_analysis is not None),
+            "checkpoint_file": filename}
+
+        status_file = os.path.join(
+            self.output_dir,
+            f"{prefix}_status_{timestamp}.json")
         with open(status_file, "w") as f:
             json.dump(status, f, indent=2)
-
+
         logging.info(f"Saved checkpoint to {filename}")
         return filename
 
-    def save_final_output(
+    def save_final_output(
+            self,
+            data: List[EnrichedCrate],
+            dependency_data: Dict):
         """Save final enriched data and analysis"""
         timestamp = time.strftime("%Y%m%d-%H%M%S")
-
+
         # Save main enriched data
-        final_output = os.path.join(
+        final_output = os.path.join(
+            self.output_dir,
+            f"enriched_crate_metadata_{timestamp}.jsonl")
         with open(final_output, "w") as f:
             for item in data:
                 item_dict = item.__dict__.copy()
                 f.write(json.dumps(item_dict) + "\n")
-
+
         # Save dependency analysis
-        dep_file = os.path.join(
+        dep_file = os.path.join(
+            self.output_dir,
+            f"dependency_analysis_{timestamp}.json")
         with open(dep_file, "w") as f:
             json.dump(dependency_data, f, indent=2)
 
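The checkpoint format is deliberately simple: one JSON object per line in a .jsonl file, plus a small sibling status file summarizing the batch. A minimal stand-alone version of that layout (record contents, directory, and prefix here are illustrative):

import json
import os
import time

def save_jsonl_checkpoint(records, output_dir=".", prefix="batch_checkpoint"):
    # Write one JSON object per line, then a status file that points back at the checkpoint.
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    path = os.path.join(output_dir, f"{prefix}_{timestamp}.jsonl")
    with open(path, "w") as f:
        for record in records:
            f.write(json.dumps(record) + "\n")
    status = {"timestamp": timestamp, "total_crates": len(records), "checkpoint_file": path}
    with open(os.path.join(output_dir, f"{prefix}_status_{timestamp}.json"), "w") as f:
        json.dump(status, f, indent=2)
    return path

if __name__ == "__main__":
    print(save_jsonl_checkpoint([{"name": "serde"}, {"name": "tokio"}]))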
@@ -265,57 +399,67 @@
             } for c in data], key=lambda x: x["score"], reverse=True)[:5],
             "most_depended_upon": dependency_data.get("most_depended", [])[:5]
         }
-
-        summary_file = os.path.join(
+
+        summary_file = os.path.join(
+            self.output_dir,
+            f"summary_report_{timestamp}.json")
         with open(summary_file, "w") as f:
             json.dump(summary, f, indent=2)
 
         logging.info(f"Results saved to {self.output_dir}/")
 
-    def run(self):
-        """Main pipeline execution flow"""
+    async def run(self):
+        """Main pipeline execution flow (async)"""
         start_time = time.time()
         logging.info(f"Processing {len(self.crates)} crates...")
-
+
         # Process in batches
         all_enriched = []
-        crate_batches = [self.crates[i:i+self.config.batch_size]
+        crate_batches = [self.crates[i:i + self.config.batch_size]
                          for i in range(0, len(self.crates), self.config.batch_size)]
 
         for batch_num, batch in enumerate(crate_batches):
-            logging.info(
+            logging.info(
+                f"Processing batch {batch_num + 1}/{len(crate_batches)} ({len(batch)} crates)")
+
+            # Fetch metadata (async)
+            batch_data = await self.fetch_metadata_batch(batch)
+
+            # Enrich the batch (async)
+            enriched_batch = await self.enrich_batch(batch_data)
             all_enriched.extend(enriched_batch)
-
+
             # Save checkpoint after each batch
             self.save_checkpoint(all_enriched, "batch_checkpoint")
-            logging.info(
+            logging.info(
+                f"Completed batch {batch_num + 1}, processed {len(all_enriched)}/{len(self.crates)} crates so far")
+
             # Optional: Add source analysis for some crates
             if batch_num < 2:  # Only do detailed analysis for first 2 batches
                 for crate in enriched_batch:
                     try:
-                        crate.source_analysis = SourceAnalyzer.analyze_crate_source(
-                        crate.
+                        crate.source_analysis = SourceAnalyzer.analyze_crate_source(
+                            crate)
+                        crate.security = SecurityAnalyzer.check_security_metrics(
+                            crate)
+                        crate.user_behavior = UserBehaviorAnalyzer.fetch_user_behavior_data(
+                            crate)
+                        logging.info(
+                            f"Advanced analysis completed for {crate.name}")
                     except Exception as e:
-                        logging.warning(
+                        logging.warning(
+                            f"Advanced analysis failed for {crate.name}: {str(e)}")
+
         # Step 3: Perform dependency analysis
         logging.info("Analyzing crate dependencies...")
         dependency_analysis = self.analyze_dependencies(all_enriched)
-
+
         # Save final results
         self.save_final_output(all_enriched, dependency_analysis)
 
         # Final summary
         duration = time.time() - start_time
-        logging.info(
+        logging.info(
+            f"✅ Done. Enriched {len(all_enriched)} crates in {duration:.2f}s")
+
         return all_enriched, dependency_analysis
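Because run() is now a coroutine, callers can no longer invoke pipeline.run() directly; they need an event loop at the top level. A hypothetical driver showing the intended call shape (PipelineConfig's constructor arguments are not shown in this diff, so they are elided here):

import asyncio

# from rust_crate_pipeline.config import PipelineConfig
# from rust_crate_pipeline.pipeline import CrateDataPipeline

async def main():
    # config = PipelineConfig(...)              # arguments depend on config.py, not shown here
    # pipeline = CrateDataPipeline(config)
    # enriched, dependency_analysis = await pipeline.run()
    pass

if __name__ == "__main__":
    asyncio.run(main())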
rust_crate_pipeline/production_config.py
CHANGED
@@ -9,59 +9,65 @@ import logging
 import os
 
 # Production logging configuration
+
+
 def configure_production_logging():
     """Configure logging for production to reduce verbose warnings"""
-
+
     # Don't use basicConfig here - let main.py handle it
     # Just set specific loggers to less verbose levels
     logging.getLogger('requests').setLevel(logging.WARNING)
     logging.getLogger('urllib3').setLevel(logging.WARNING)
     logging.getLogger('requests_cache').setLevel(logging.WARNING)
-
+
     # If PRODUCTION environment variable is set, be even quieter
    if os.getenv('PRODUCTION', 'false').lower() == 'true':
        logging.getLogger().setLevel(logging.WARNING)
        logging.getLogger('rust_crate_pipeline').setLevel(logging.INFO)
 
+
 # Production-optimized settings
 PRODUCTION_SETTINGS = {
     # Reduced retries to minimize warnings
     'max_retries': 2,
     'validation_retries': 2,
-
+
     # GitHub API management
     'github_rate_limit_threshold': 100,
     'github_critical_threshold': 50,
-
-    # LLM settings
+
+    # LLM settings
     'llm_timeout': 30,
     'llm_max_attempts': 2,
-
+
     # Logging preferences
     'quiet_mode': True,
     'log_level': 'INFO',
-
+
     # Performance settings
     'batch_size': 10,
     'checkpoint_interval': 10,
     'cache_ttl': 3600,
 }
 
+
 def get_production_config():
     """Get production configuration dictionary"""
     return PRODUCTION_SETTINGS.copy()
 
+
 def is_production():
     """Check if running in production mode"""
     return os.getenv('PRODUCTION', 'false').lower() == 'true'
 
+
 def setup_production_environment():
     """Set up the complete production environment"""
     configure_production_logging()
-
+
     # Set environment variables for quieter operation
     os.environ.setdefault('PYTHONWARNINGS', 'ignore::UserWarning')
-
+
     if is_production():
         print("🚀 Production mode enabled - optimized for minimal warnings")
         return get_production_config()