rust-crate-pipeline 1.4.0-py3-none-any.whl → 1.4.1-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- rust_crate_pipeline/__init__.py +18 -27
- rust_crate_pipeline/__main__.py +1 -0
- rust_crate_pipeline/ai_processing.py +718 -596
- rust_crate_pipeline/analysis.py +330 -363
- rust_crate_pipeline/azure_ai_processing.py +462 -0
- rust_crate_pipeline/config.py +46 -28
- rust_crate_pipeline/core/__init__.py +19 -0
- rust_crate_pipeline/core/canon_registry.py +133 -0
- rust_crate_pipeline/core/irl_engine.py +256 -0
- rust_crate_pipeline/core/sacred_chain.py +117 -0
- rust_crate_pipeline/crate_analysis.py +54 -0
- rust_crate_pipeline/crate_list.txt +424 -0
- rust_crate_pipeline/github_token_checker.py +108 -112
- rust_crate_pipeline/main.py +329 -109
- rust_crate_pipeline/network.py +317 -308
- rust_crate_pipeline/pipeline.py +300 -375
- rust_crate_pipeline/production_config.py +24 -27
- rust_crate_pipeline/progress_monitor.py +334 -0
- rust_crate_pipeline/scraping/__init__.py +13 -0
- rust_crate_pipeline/scraping/unified_scraper.py +259 -0
- rust_crate_pipeline/unified_llm_processor.py +637 -0
- rust_crate_pipeline/unified_pipeline.py +548 -0
- rust_crate_pipeline/utils/file_utils.py +32 -5
- rust_crate_pipeline/utils/logging_utils.py +21 -16
- rust_crate_pipeline/version.py +76 -47
- rust_crate_pipeline-1.4.1.dist-info/METADATA +515 -0
- rust_crate_pipeline-1.4.1.dist-info/RECORD +31 -0
- rust_crate_pipeline-1.4.0.dist-info/METADATA +0 -585
- rust_crate_pipeline-1.4.0.dist-info/RECORD +0 -19
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/WHEEL +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/entry_points.txt +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/licenses/LICENSE +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/top_level.txt +0 -0
rust_crate_pipeline/pipeline.py
CHANGED
@@ -4,462 +4,387 @@ import time
 import logging
 import json
 import asyncio
-from typing import …
+from typing import Any, Union, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from typing import Dict, List, Optional
+
 from .config import PipelineConfig, CrateMetadata, EnrichedCrate
 from .network import CrateAPIClient, GitHubBatchClient
 from .ai_processing import LLMEnricher
-from .analysis import …
+from .analysis import DependencyAnalyzer
+from .crate_analysis import CrateAnalyzer
+
+# Import Azure OpenAI enricher
+try:
+    from .azure_ai_processing import AzureOpenAIEnricher
+    AZURE_OPENAI_AVAILABLE = True
+except ImportError:
+    AZURE_OPENAI_AVAILABLE = False
+    AzureOpenAIEnricher = None
 
 # Import enhanced scraping capabilities
 try:
-    import …
-    from enhanced_scraping import CrateDocumentationScraper, EnhancedScrapingResult
-    enhanced_scraping_available = True
+    from .scraping.unified_scraper import UnifiedScraper, ScrapingResult
+    ENHANCED_SCRAPING_AVAILABLE = True
 except ImportError:
-    …
+    ENHANCED_SCRAPING_AVAILABLE = False
+    UnifiedScraper = None  # type: ignore[assignment,misc]
+    ScrapingResult = None  # type: ignore[assignment,misc]
     logging.warning("Enhanced scraping not available - using basic methods")
 
 
 class CrateDataPipeline:
-    …
+    """Orchestrates the entire data collection, enrichment, and analysis pipeline."""
+
+    def __init__(self, config: PipelineConfig) -> None:
         self.config = config
         self.api_client = CrateAPIClient(config)
         self.github_client = GitHubBatchClient(config)
-        …
-        self.enhanced_scraper = None
-        if enhanced_scraping_available and CrateDocumentationScraper is not None and hasattr(config, 'enable_crawl4ai'):
-            try:
-                self.enhanced_scraper = CrateDocumentationScraper(
-                    enable_crawl4ai=config.enable_crawl4ai)
-                logging.info("✅ Enhanced scraping with Crawl4AI enabled")
-            except Exception as e:
-                logging.warning(
-                    f"❌ Failed to initialize enhanced scraping: {e}")
-        elif enhanced_scraping_available and CrateDocumentationScraper is not None:
+
+        # Initialize the appropriate AI enricher based on configuration
+        if config.use_azure_openai and AZURE_OPENAI_AVAILABLE and AzureOpenAIEnricher is not None:
             try:
-                self.…
-                logging.info(
-                    "✅ Enhanced scraping with Crawl4AI enabled (default)")
+                self.enricher = AzureOpenAIEnricher(config)
+                logging.info("[OK] Using Azure OpenAI enricher")
             except Exception as e:
-                logging.warning(
-                    …
+                logging.warning(f"[WARN] Failed to initialize Azure OpenAI enricher: {e}")
+                logging.info("[INFO] Falling back to local LLM enricher")
+                self.enricher = LLMEnricher(config)
+        else:
+            if config.use_azure_openai and not AZURE_OPENAI_AVAILABLE:
+                logging.warning("[WARN] Azure OpenAI requested but not available")
+            self.enricher = LLMEnricher(config)
+            logging.info("[OK] Using local LLM enricher")
+
+        # Initialize cargo analyzer
+        self.cargo_analyzer = CrateAnalyzer(".")
+
+        self.crates = self._get_crate_list()
+        self.output_dir = self._create_output_dir()
+        self.enhanced_scraper: Any = (
+            self._initialize_enhanced_scraper()
+        )
+
+    def _initialize_enhanced_scraper(self) -> Any:
+        """Initializes the CrateDocumentationScraper if available and enabled."""
+        if (
+            not ENHANCED_SCRAPING_AVAILABLE
+            or not self.config.enable_crawl4ai
+            or UnifiedScraper is None
+        ):
+            return None
+        try:
+            scraper = UnifiedScraper()
+            logging.info("[OK] Enhanced scraping with Crawl4AI enabled")
+            return scraper
+        except Exception as e:
+            logging.warning(f"[ERROR] Failed to initialize enhanced scraping: {e}")
+            return None
 
     def _create_output_dir(self) -> str:
+        """Creates a timestamped output directory for pipeline results."""
         timestamp = time.strftime("%Y%m%d-%H%M%S")
-        output_dir = f"crate_data_{timestamp}"
+        output_dir = os.path.join(self.config.output_path, f"crate_data_{timestamp}")
         os.makedirs(output_dir, exist_ok=True)
         return output_dir
 
-    def …
-        """
-        crates …
-        …
-            # Utilities and general purpose
-            "rand", "uuid", "itertools", "num", "cfg-i", "bytes", "mime",
-            "form_urlencoded", "csv", "once_cell", "base64", "flate2", "tar", "dirs",
-            "walkdir", "glob", "bitflags", "indexmap", "smallvec", "arrayvec", "tinyvec",
-            "ahash", "fxhash", "rustc-hash", "seahash", "siphasher", "wyhash", "xxhash-rust",
-            "getrandom", "fastrand", "nanorand", "url", "percent-encoding", "unicode-segmentation",
-            "unicode-normalization", "unicode-width", "memchr", "aho-corasick", "bstr",
-            # HTTP clients and servers
-            "reqwest", "hyper", "sur", "ureq", "attohttpc", "isahc", "curl", "libcurl-sys",
-            "http", "http-body", "httparse", "hyper-tls", "hyper-rustls", "native-tls",
-            "webpki", "webpki-roots",
-
-            # Database and storage
-            "sqlx", "diesel", "postgres", "rusqlite", "mysql", "mongodb", "redis",
-            "tokio-postgres", "deadpool-postgres", "bb8", "r2d2", "sea-orm", "rbatis",
-            "sled", "rocksdb", "lmdb", "redb", "pickledb", "persy", "heed", "fjall",
-            # Concurrency and parallelism
-            "rayon", "crossbeam", "crossbeam-channel", "crossbeam-utils", "crossbeam-epoch",
-            "crossbeam-deque", "parking_lot", "spin", "atomic", "arc-swap", "dashmap",
-            "flume", "kanal", "tokio-util", "futures-concurrency",
-            # Protocol buffers, gRPC, and messaging
-            "prost", "tonic", "protobu", "grpcio", "tarpc", "capnp", "rmp",
-            "zmq", "nanomsg", "nats", "rdkafka", "pulsar", "lapin", "amqp", "rumqttc",
-            # Procedural macros and metaprogramming
-            "syn", "quote", "proc-macro2", "proc-macro-crate", "proc-macro-error",
-            "darling", "derive_builder", "strum", "strum_macros",
-            "enum-iterator", "num-derive", "num-traits", "paste", "lazy_static",
-
-            # Cryptography and security
-            "ring", "rustls", "openssl", "sha2", "sha3", "blake2", "blake3", "md5",
-            "hmac", "pbkdf2", "scrypt", "argon2", "bcrypt", "chacha20poly1305",
-            "aes-gcm", "rsa", "ed25519-dalek", "x25519-dalek", "curve25519-dalek",
-            "secp256k1", "k256", "p256", "ecdsa", "signature", "rand_core",
-
-            # Game development and graphics
-            "bevy", "macroquad", "ggez", "piston", "winit", "wgpu", "vulkano", "glium",
-            "three-d", "kiss3d", "nalgebra", "cgmath", "glam", "ultraviolet", "mint",
-            "image", "imageproc", "resvg", "tiny-skia", "lyon", "femtovg", "skulpin",
-            # Networking and protocols
-            "socket2", "mio", "polling", "async-io", "calloop", "quinn",
-            "rustls-pemfile", "trust-dns", "hickory-dns", "async-h1", "h2", "h3",
-            "websocket", "tokio-tungstenite", "tungstenite", "ws", "warp-ws",
-
-            # Text processing and parsing
-            "regex", "regex-syntax", "pest", "pest_derive", "nom", "combine", "winnow",
-            "lalrpop", "chumsky", "logos", "lex", "yacc", "tree-sitter", "syntect",
-            "pulldown-cmark", "comrak", "markdown", "ammonia", "scraper", "kuchiki",
-
-            # System programming and OS interfaces
-            "libc", "winapi", "windows", "nix", "users", "sysinfo", "procfs", "psutil",
-            "notify", "inotify", "hotwatch", "signal-hook", "ctrlc", "daemonize",
-            "fork", "shared_memory", "memmap2", "mlock", "caps", "uzers",
-            # Testing and development tools
-            "criterion", "proptest", "quickcheck", "rstest", "serial_test", "mockall",
-            "httpmock", "assert_cmd", "assert_fs", "predicates", "tempfile",
-            "insta", "goldenfile", "similar", "difference", "pretty_assertions",
-
-            # Configuration and environment
-            "config", "figment", "envy", "dotenv", "confy", "directories", "app_dirs",
-            "etcetera", "platform-dirs", "home", "which", "dunce", "normpath",
-
-            # Logging and observability
-            "log", "env_logger", "tracing", "tracing-subscriber", "tracing-futures",
-            "tracing-actix-web", "tracing-log", "slog", "fern", "flexi_logger",
-            "log4rs", "simplelog", "stderrlog", "pretty_env_logger", "fast_log",
-
-            # Time and date
-            "chrono", "time", "humantime", "chrono-tz", "chrono-english", "ical",
-            "cron", "tokio-cron-scheduler", "job_scheduler", "delay_timer",
-
-            # Machine Learning & AI
-            "tokenizers", "safetensors", "linfa", "ndarray", "smartcore", "burn",
-            "tract-core", "tract-onnx", "tract-hir", "tract-linalg", "tract-data",
-            "tract-nne", "tract-onnx-opl", "tract-pulse", "tract-pulse-opl",
-            "tract-nnef-resources", "tch", "torch-sys", "ort", "ort-sys", "candle-core",
-            "candle-nn", "candle-transformers", "candle-kernels", "candle-onnx",
-            "candle-metal-kernels", "tiktoken-rs", "tensorflow", "tensorflow-sys",
-            "onnxruntime", "onnxruntime-sys", "onnx-protobu", "llama-cpp-2",
-            "llama-cpp-sys-2", "llm", "llm-samplers", "llm-chain", "llm-chain-openai", "llama-core", "llamaedge", "openai", "openai-api-rs", "openai_dive",
-            "genai", "aleph-alpha-client", "llm_api_access", "ollama-rs",
-            "rust-bert", "fastembed", "hf-hub", "whisper-rs-sys", "toktrie",
-            "toktrie_hf_tokenizers", "toktrie_hf_downloader", "rust_tokenizers",
-        ]
-
-        if limit is not None:
-            return crates[:limit]
-        return crates
-
-    async def fetch_metadata_batch(
-            self,
-            crate_names: List[str]) -> List[CrateMetadata]:
-        """Fetch metadata for a batch of crates using asyncio-based parallel processing
+    def _get_crate_list(self) -> "List[str]":
+        """
+        Loads the list of crates to process from an external file.
+        This approach is more modular and easier to maintain than a hardcoded list.
+        """
+        crate_list_path = os.path.join(os.path.dirname(__file__), "crate_list.txt")
+        try:
+            with open(crate_list_path) as f:
+                crates = [line.strip() for line in f if line.strip()]
+            logging.info(f"Loaded {len(crates)} crates from {crate_list_path}")
+            if not crates:
+                logging.warning(f"Crate list at {crate_list_path} is empty.")
+            return crates
+        except FileNotFoundError:
+            logging.error(f"Crate list file not found at: {crate_list_path}")
+            return []
+
+    def get_crate_list(self) -> "List[str]":
+        """
+        Public method to get the list of crates.
+        Returns the already loaded crate list or loads it if not available.
+        """
+        if hasattr(self, "crates") and self.crates:
+            return self.crates
+        else:
+            return self._get_crate_list()
 
+    async def fetch_metadata_batch(self, crate_names: "List[str]") -> "List[CrateMetadata]":
+        """
+        Fetches metadata for a batch of crates using asyncio-based parallel processing.
         """
-        results = []
 
-        async def fetch_single_crate_safe(…
+        async def fetch_single_crate_safe(
+            crate_name: str,
+        ) -> Union[CrateMetadata, None]:
             try:
-                …
+                loop = asyncio.get_running_loop()
+                data = await loop.run_in_executor(
+                    None, self.api_client.fetch_crate_metadata, crate_name
+                )
+                if not data:
+                    return None
+
+                return CrateMetadata(
+                    name=data.get("name", ""),
+                    version=data.get("version", ""),
+                    description=data.get("description", ""),
+                    repository=data.get("repository", ""),
+                    keywords=data.get("keywords", []),
+                    categories=data.get("categories", []),
+                    readme=data.get("readme", ""),
+                    downloads=data.get("downloads", 0),
+                    github_stars=data.get("github_stars", 0),
+                    dependencies=data.get("dependencies", []),
+                    features=data.get("features", {}),
+                    code_snippets=data.get("code_snippets", []),
+                    readme_sections=data.get("readme_sections", {}),
+                    librs_downloads=data.get("librs_downloads"),
+                    source=data.get("source", "crates.io"),
+                )
+
             except Exception as e:
-                logging.error(f"Error fetching {crate_name}: {e}")
+                logging.error(f"Error fetching metadata for {crate_name}: {e}")
                 return None
 
-        # Use asyncio.gather for parallel async processing
        tasks = [fetch_single_crate_safe(name) for name in crate_names]
        results_raw = await asyncio.gather(*tasks)
-        results = [r for r in results_raw if r…
+        results = [r for r in results_raw if r]
+        logging.info(
+            f"Fetched metadata for {len(results)} out of "
+            f"{len(crate_names)} requested crates."
+        )
         return results
 
-    async def enrich_batch(
-            self,
-            batch: List[CrateMetadata]) -> List[EnrichedCrate]:
-        """Enrich a batch of crates with GitHub stats, enhanced scraping, and AI"""
-        # Add GitHub stats first
+    async def enrich_batch(self, batch: "List[CrateMetadata]") -> "List[EnrichedCrate]":
+        """Enriches a batch of crates with GitHub stats, enhanced scraping, and AI."""
+        # Update GitHub stats
         github_repos = [
-            c.repository for c in batch if "github.com" in c.repository
-        …
+            c.repository for c in batch if c.repository and "github.com" in c.repository
+        ]
+        if github_repos:
+            repo_stats = self.github_client.batch_get_repo_stats(github_repos)
+            for crate in batch:
+                if crate.repository in repo_stats:
+                    stats = repo_stats[crate.repository]
+                    crate.github_stars = stats.get("stargazers_count", 0)
+
+        # Asynchronously enhance with scraping and AI
+        enrichment_tasks = [self._enrich_single_crate(crate) for crate in batch]
+        enriched_results = await asyncio.gather(*enrichment_tasks)
+        return [result for result in enriched_results if result]
+
+    async def _enrich_single_crate(self, crate: CrateMetadata) -> Union[EnrichedCrate, None]:
+        """Helper to enrich a single crate with scraping, AI analysis, and cargo analysis."""
+        try:
+            # Enhanced scraping if available
+            if self.enhanced_scraper:
+                await self._enhance_with_scraping(crate)
+
+            # Now enrich with AI
+            enriched = self.enricher.enrich_crate(crate)
+
+            # Add cargo analysis if we have a local crate directory
+            # Note: This would require downloading/cloning the crate first
+            # For now, we'll add a placeholder for cargo analysis
+            enriched.source_analysis = {
+                "cargo_analysis_available": False,
+                "note": "Cargo analysis requires local crate source code"
+            }
+
+            logging.info(f"Enriched {crate.name}")
+            return enriched
+        except Exception as e:
+            logging.error(f"Failed to enrich {crate.name}: {e}")
+            # Return a partially enriched crate to avoid data loss
+            enriched_dict = crate.to_dict()
+            return EnrichedCrate(**enriched_dict)
+
+    async def _enhance_with_scraping(self, crate: CrateMetadata) -> None:
+        """
+        Enhances a single crate with advanced web scraping data.
+        Modifies the crate object in place.
+        """
+        if not self.enhanced_scraper:
+            return
 
+        try:
+            scraping_results = await self.enhanced_scraper.scrape_crate_documentation(crate.name)
+            if scraping_results:
+                self._integrate_scraping_results(crate, scraping_results)
                 logging.info(
-                    f"Enhanced scraping for {crate.name}: …
-        …
-        # Add enhanced scraping data
-        enhanced_crate.enhanced_scraping = {}
+                    f"Enhanced scraping for {crate.name}: "
+                    f"{len(scraping_results)} sources"
+                )
+        except Exception as e:
+            logging.warning(f"Enhanced scraping failed for {crate.name}: {e}")
+
+    def _integrate_scraping_results(
+        self,
+        crate: CrateMetadata,
+        scraping_results: "Dict[str, Any]",
+    ) -> None:
+        """
+        Integrates enhanced scraping results into the crate metadata.
+        Modifies the crate object in place.
+        """
+        crate.enhanced_scraping = {}
 
         for source, result in scraping_results.items():
-            if result.error:
+            if not result or result.error:
                 continue
 
-            …
-            }
-            …
-                    logging.info(
-                        f"Updated README for {crate.name} from {source}")
+            crate.enhanced_scraping[source] = {
+                "title": result.title,
+                "quality_score": result.quality_score,
+                "extraction_method": result.extraction_method,
+                "structured_data": result.structured_data,
+                "content_length": len(result.content),
+            }
+            # Update README if we got better content
+            if source == "docs_rs" and result.quality_score > 0.7:
+                if not crate.readme or len(result.content) > len(crate.readme):
+                    crate.readme = result.content
+                    logging.info(f"Updated README for {crate.name} from {source}")
 
             # Extract additional metadata from structured data
-            …
-    def analyze_dependencies(self, crates: List[EnrichedCrate]) -> Dict:
-        """Analyze dependencies between crates"""
+            structured_data = result.structured_data or {}
+            if "features" in structured_data and isinstance(
+                structured_data["features"], list
+            ):
+                crate.enhanced_features = structured_data["features"]
+            if "dependencies" in structured_data and isinstance(
+                structured_data["dependencies"], list
+            ):
+                crate.enhanced_dependencies = structured_data["dependencies"]
+            if "examples" in structured_data and isinstance(
+                structured_data["examples"], list
+            ):
+                crate.code_snippets.extend(structured_data["examples"])
+
+    def analyze_dependencies(self, crates: "List[EnrichedCrate]") -> "Dict[str, Any]":
+        """Analyze dependencies between crates."""
         return DependencyAnalyzer.analyze_dependencies(crates)
 
-    def save_checkpoint(self, data: List[EnrichedCrate], prefix: str):
-        """…
+    def save_checkpoint(self, data: "List[EnrichedCrate]", prefix: str) -> str:
+        """Saves a processing checkpoint to a file."""
         timestamp = time.strftime("%Y%m%d-%H%M%S")
         filename = os.path.join(self.output_dir, f"{prefix}_{timestamp}.jsonl")
 
         with open(filename, "w") as f:
             for item in data:
-                item_dict = item.__dict__.copy()
-                f.write(json.dumps(item_dict) + "\n")
-
-        # Save status metadata
-        status = {
-            "timestamp": timestamp,
-            "total_crates": len(data),
-            "processed_crates": sum(
-                1 for c in data if c.use_case is not None),
-            "advanced_analysis": sum(
-                1 for c in data if c.source_analysis is not None),
-            "checkpoint_file": filename}
-
-        status_file = os.path.join(
-            self.output_dir,
-            f"{prefix}_status_{timestamp}.json")
-        with open(status_file, "w") as f:
-            json.dump(status, f, indent=2)
+                f.write(json.dumps(item.to_dict()) + "\n")
 
         logging.info(f"Saved checkpoint to {filename}")
         return filename
 
     def save_final_output(
-        …
-        """Save final enriched data and analysis"""
+        self, data: "List[EnrichedCrate]", dependency_data: "Dict[str, Any]"
+    ) -> None:
+        """Saves the final enriched data and analysis reports."""
         timestamp = time.strftime("%Y%m%d-%H%M%S")
 
         # Save main enriched data
-        … os.path.join(
-            self.output_dir,
-            …
-        with open(…
+        final_output_path = os.path.join(
+            self.output_dir, f"enriched_crate_metadata_{timestamp}.jsonl"
+        )
+        with open(final_output_path, "w") as f:
             for item in data:
-                …
-                f.write(json.dumps(item_dict) + "\n")
+                f.write(json.dumps(item.to_dict()) + "\n")
 
         # Save dependency analysis
-        … os.path.join(
-            self.output_dir,
-            …
-        with open(…
+        dep_file_path = os.path.join(
+            self.output_dir, f"dependency_analysis_{timestamp}.json"
+        )
+        with open(dep_file_path, "w") as f:
             json.dump(dependency_data, f, indent=2)
 
-        # Generate summary report
+        # Generate and save summary report
+        self._generate_summary_report(data, dependency_data, timestamp)
+
+        logging.info(f"Results saved to {self.output_dir}/")
+
+    def _generate_summary_report(
+        self,
+        data: "List[EnrichedCrate]",
+        dependency_data: "Dict[str, Any]",
+        timestamp: str,
+    ) -> None:
+        """Generates a summary report of the pipeline run."""
         summary = {
             "total_crates": len(data),
             "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
-            "most_popular": sorted(
-                …
+            "most_popular": sorted(
+                [
+                    {
+                        "name": c.name,
+                        "score": c.score or 0,
+                        "downloads": c.downloads,
+                        "github_stars": c.github_stars,
+                    }
+                    for c in data
+                ],
+                key=lambda x: x.get("score", 0),
+                reverse=True,
+            )[:10],
+            "most_depended_upon": dependency_data.get("most_depended", [])[:10],
         }
 
-        … os.path.join(
-            …
-            f"summary_report_{timestamp}.json")
-        with open(summary_file, "w") as f:
+        summary_path = os.path.join(self.output_dir, f"summary_report_{timestamp}.json")
+        with open(summary_path, "w") as f:
             json.dump(summary, f, indent=2)
 
-    async def run(self):
-        """Main pipeline execution flow (async)"""
+    async def run(self) -> Union["tuple[List[EnrichedCrate], Dict[str, Any]]", None]:
+        """Main pipeline execution flow."""
         start_time = time.time()
+        if not self.crates:
+            logging.error("No crates to process. Exiting.")
+            return None
+
         logging.info(f"Processing {len(self.crates)} crates...")
 
-        …
-        crate_batches = [
-            …
+        all_enriched: "List[EnrichedCrate]" = []
+        batch_size = self.config.batch_size
+        crate_batches = [
+            self.crates[i : i + batch_size]
+            for i in range(0, len(self.crates), batch_size)
+        ]
 
-        for …
+        for i, batch_names in enumerate(crate_batches):
             logging.info(
-                f"Processing batch {…
+                f"Processing batch {i + 1}/{len(crate_batches)} "
+                f"({len(batch_names)} crates)"
+            )
+
+            # Fetch metadata
+            metadata_batch = await self.fetch_metadata_batch(batch_names)
+            if not metadata_batch:
+                logging.warning(f"Batch {i+1} yielded no metadata. Skipping.")
+                continue
 
-            # Enrich the batch
-            enriched_batch = await self.enrich_batch(…
+            # Enrich the batch
+            enriched_batch = await self.enrich_batch(metadata_batch)
             all_enriched.extend(enriched_batch)
 
-            # Save checkpoint
-            self.save_checkpoint(all_enriched, "…
+            # Save checkpoint
+            self.save_checkpoint(all_enriched, f"checkpoint_batch_{i + 1}")
             logging.info(
-                f"Completed batch {…
-            …
-            try:
-                crate.source_analysis = SourceAnalyzer.analyze_crate_source(
-                    crate)
-                crate.security = SecurityAnalyzer.check_security_metrics(
-                    crate)
-                crate.user_behavior = UserBehaviorAnalyzer.fetch_user_behavior_data(
-                    crate)
-                logging.info(
-                    f"Advanced analysis completed for {crate.name}")
-            except Exception as e:
-                logging.warning(
-                    f"Advanced analysis failed for {crate.name}: {str(e)}")
-
-        # Step 3: Perform dependency analysis
+                f"Completed batch {i + 1}, "
+                f"processed {len(all_enriched)}/{len(self.crates)} crates"
+            )
+
+        # Final analysis and saving
         logging.info("Analyzing crate dependencies...")
         dependency_analysis = self.analyze_dependencies(all_enriched)
-
-        # Save final results
         self.save_final_output(all_enriched, dependency_analysis)
 
-        # Final summary
         duration = time.time() - start_time
-        logging.info(
-            f"✅ Done. Enriched {len(all_enriched)} crates in {duration:.2f}s")
-
+        logging.info(f"[OK] Done. Enriched {len(all_enriched)} crates in {duration:.2f}s")
         return all_enriched, dependency_analysis