rust-crate-pipeline 1.4.0-py3-none-any.whl → 1.4.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rust_crate_pipeline/__init__.py +18 -27
- rust_crate_pipeline/__main__.py +1 -0
- rust_crate_pipeline/ai_processing.py +718 -596
- rust_crate_pipeline/analysis.py +330 -363
- rust_crate_pipeline/azure_ai_processing.py +462 -0
- rust_crate_pipeline/config.py +46 -28
- rust_crate_pipeline/core/__init__.py +19 -0
- rust_crate_pipeline/core/canon_registry.py +133 -0
- rust_crate_pipeline/core/irl_engine.py +256 -0
- rust_crate_pipeline/core/sacred_chain.py +117 -0
- rust_crate_pipeline/crate_analysis.py +54 -0
- rust_crate_pipeline/crate_list.txt +424 -0
- rust_crate_pipeline/github_token_checker.py +108 -112
- rust_crate_pipeline/main.py +329 -109
- rust_crate_pipeline/network.py +317 -308
- rust_crate_pipeline/pipeline.py +317 -375
- rust_crate_pipeline/production_config.py +24 -27
- rust_crate_pipeline/progress_monitor.py +334 -0
- rust_crate_pipeline/scraping/__init__.py +13 -0
- rust_crate_pipeline/scraping/unified_scraper.py +259 -0
- rust_crate_pipeline/unified_llm_processor.py +637 -0
- rust_crate_pipeline/unified_pipeline.py +548 -0
- rust_crate_pipeline/utils/file_utils.py +32 -5
- rust_crate_pipeline/utils/logging_utils.py +21 -16
- rust_crate_pipeline/version.py +79 -47
- rust_crate_pipeline-1.4.2.dist-info/METADATA +515 -0
- rust_crate_pipeline-1.4.2.dist-info/RECORD +31 -0
- rust_crate_pipeline-1.4.0.dist-info/METADATA +0 -585
- rust_crate_pipeline-1.4.0.dist-info/RECORD +0 -19
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.2.dist-info}/WHEEL +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.2.dist-info}/entry_points.txt +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.2.dist-info}/licenses/LICENSE +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.2.dist-info}/top_level.txt +0 -0
rust_crate_pipeline/pipeline.py
CHANGED
@@ -4,462 +4,404 @@ import time
 import logging
 import json
 import asyncio
-from typing import …
+from typing import Any, Union, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from typing import Dict, List, Optional
+
 from .config import PipelineConfig, CrateMetadata, EnrichedCrate
 from .network import CrateAPIClient, GitHubBatchClient
 from .ai_processing import LLMEnricher
-from .analysis import …
+from .analysis import DependencyAnalyzer
+from .crate_analysis import CrateAnalyzer
+
+# Import Azure OpenAI enricher
+try:
+    from .azure_ai_processing import AzureOpenAIEnricher
+    AZURE_OPENAI_AVAILABLE = True
+except ImportError:
+    AZURE_OPENAI_AVAILABLE = False
+    AzureOpenAIEnricher = None
 
 # Import enhanced scraping capabilities
 try:
-    import …
-    from enhanced_scraping import CrateDocumentationScraper, EnhancedScrapingResult
-    enhanced_scraping_available = True
+    from .scraping.unified_scraper import UnifiedScraper, ScrapingResult
+    ENHANCED_SCRAPING_AVAILABLE = True
 except ImportError:
-    …
+    ENHANCED_SCRAPING_AVAILABLE = False
+    UnifiedScraper = None  # type: ignore[assignment,misc]
+    ScrapingResult = None  # type: ignore[assignment,misc]
     logging.warning("Enhanced scraping not available - using basic methods")
 
 
+class CustomJSONEncoder(json.JSONEncoder):
+    """Custom JSON encoder to handle non-serializable objects"""
+    def default(self, obj):
+        if hasattr(obj, 'to_dict'):
+            return obj.to_dict()
+        elif hasattr(obj, '__dict__'):
+            return obj.__dict__
+        else:
+            return str(obj)
+
+
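The new `CustomJSONEncoder` is what lets checkpoints and final outputs serialize `CrateMetadata`/`EnrichedCrate` objects through plain `json.dumps`. A minimal usage sketch (the `Crate` class below is a hypothetical stand-in, not part of the package):

```python
import json

from rust_crate_pipeline.pipeline import CustomJSONEncoder


class Crate:
    """Hypothetical stand-in for an object with no to_dict(); the encoder falls back to __dict__."""

    def __init__(self, name: str, downloads: int) -> None:
        self.name = name
        self.downloads = downloads


print(json.dumps(Crate("serde", 250_000_000), cls=CustomJSONEncoder))
# -> {"name": "serde", "downloads": 250000000}
```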
 class CrateDataPipeline:
-    …
+    """Orchestrates the entire data collection, enrichment, and analysis pipeline."""
+
+    def __init__(self, config: PipelineConfig, crate_list: "List[str] | None" = None, **kwargs) -> None:
         self.config = config
         self.api_client = CrateAPIClient(config)
         self.github_client = GitHubBatchClient(config)
-        …
-        self.enhanced_scraper = None
-        if enhanced_scraping_available and CrateDocumentationScraper is not None and hasattr(config, 'enable_crawl4ai'):
+
+        # Initialize the appropriate AI enricher based on configuration
+        if config.use_azure_openai and AZURE_OPENAI_AVAILABLE and AzureOpenAIEnricher is not None:
             try:
-                self. …
-                logging.info("✅ Enhanced scraping with Crawl4AI enabled")
+                self.enricher = AzureOpenAIEnricher(config)
+                logging.info("[OK] Using Azure OpenAI enricher")
             except Exception as e:
-                logging.warning( …
-                …
+                logging.warning(f"[WARN] Failed to initialize Azure OpenAI enricher: {e}")
+                logging.info("[INFO] Falling back to local LLM enricher")
+                self.enricher = LLMEnricher(config)
+        else:
+            if config.use_azure_openai and not AZURE_OPENAI_AVAILABLE:
+                logging.warning("[WARN] Azure OpenAI requested but not available")
+            self.enricher = LLMEnricher(config)
+            logging.info("[OK] Using local LLM enricher")
+
+        # Initialize cargo analyzer
+        self.cargo_analyzer = CrateAnalyzer(".")
+
+        # Use provided crate_list or load from file
+        if crate_list:
+            self.crates = crate_list
+            logging.info(f"Using provided crate list: {len(crate_list)} crates")
+        else:
+            self.crates = self._get_crate_list()
+
+        self.output_dir = self._create_output_dir()
+        self.enhanced_scraper: Any = (
+            self._initialize_enhanced_scraper()
+        )
+
+    def _initialize_enhanced_scraper(self) -> Any:
+        """Initializes the CrateDocumentationScraper if available and enabled."""
+        if (
+            not ENHANCED_SCRAPING_AVAILABLE
+            or not self.config.enable_crawl4ai
+            or UnifiedScraper is None
+        ):
+            return None
+        try:
+            scraper = UnifiedScraper()
+            logging.info("[OK] Enhanced scraping with Crawl4AI enabled")
+            return scraper
+        except Exception as e:
+            logging.warning(f"[ERROR] Failed to initialize enhanced scraping: {e}")
+            return None
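The constructor now selects the enricher purely from configuration: Azure OpenAI when `config.use_azure_openai` is set and the optional `azure_ai_processing` import succeeded, otherwise the local `LLMEnricher`. A hedged construction sketch, assuming `PipelineConfig` accepts these fields as keyword arguments (its actual signature lives in `config.py` and is not shown in this diff):

```python
from rust_crate_pipeline.config import PipelineConfig
from rust_crate_pipeline.pipeline import CrateDataPipeline

# Field names taken from how pipeline.py reads the config; keyword-style
# construction and defaults are assumptions for this sketch.
config = PipelineConfig(
    use_azure_openai=False,  # fall back to the local LLM enricher
    enable_crawl4ai=True,    # attempt to start the UnifiedScraper
    batch_size=10,
    output_path="./output",
)

pipeline = CrateDataPipeline(config, crate_list=["serde", "tokio", "rand"])
```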
 
     def _create_output_dir(self) -> str:
+        """Creates a timestamped output directory for pipeline results."""
         timestamp = time.strftime("%Y%m%d-%H%M%S")
-        output_dir = f"crate_data_{timestamp}"
+        output_dir = os.path.join(self.config.output_path, f"crate_data_{timestamp}")
         os.makedirs(output_dir, exist_ok=True)
         return output_dir
 
-    def …
-        """ …
-        crates …
-        …
-            # Utilities and general purpose
-            "rand", "uuid", "itertools", "num", "cfg-i", "bytes", "mime",
-            "form_urlencoded", "csv", "once_cell", "base64", "flate2", "tar", "dirs",
-            "walkdir", "glob", "bitflags", "indexmap", "smallvec", "arrayvec", "tinyvec",
-            "ahash", "fxhash", "rustc-hash", "seahash", "siphasher", "wyhash", "xxhash-rust",
-            "getrandom", "fastrand", "nanorand", "url", "percent-encoding", "unicode-segmentation",
-            "unicode-normalization", "unicode-width", "memchr", "aho-corasick", "bstr",
-            # HTTP clients and servers
-            "reqwest", "hyper", "sur", "ureq", "attohttpc", "isahc", "curl", "libcurl-sys",
-            "http", "http-body", "httparse", "hyper-tls", "hyper-rustls", "native-tls",
-            "webpki", "webpki-roots",
-
-            # Database and storage
-            "sqlx", "diesel", "postgres", "rusqlite", "mysql", "mongodb", "redis",
-            "tokio-postgres", "deadpool-postgres", "bb8", "r2d2", "sea-orm", "rbatis",
-            "sled", "rocksdb", "lmdb", "redb", "pickledb", "persy", "heed", "fjall",
-            # Concurrency and parallelism
-            "rayon", "crossbeam", "crossbeam-channel", "crossbeam-utils", "crossbeam-epoch",
-            "crossbeam-deque", "parking_lot", "spin", "atomic", "arc-swap", "dashmap",
-            "flume", "kanal", "tokio-util", "futures-concurrency",
-            # Protocol buffers, gRPC, and messaging
-            "prost", "tonic", "protobu", "grpcio", "tarpc", "capnp", "rmp",
-            "zmq", "nanomsg", "nats", "rdkafka", "pulsar", "lapin", "amqp", "rumqttc",
-            # Procedural macros and metaprogramming
-            "syn", "quote", "proc-macro2", "proc-macro-crate", "proc-macro-error",
-            "darling", "derive_builder", "strum", "strum_macros",
-            "enum-iterator", "num-derive", "num-traits", "paste", "lazy_static",
-
-            # Cryptography and security
-            "ring", "rustls", "openssl", "sha2", "sha3", "blake2", "blake3", "md5",
-            "hmac", "pbkdf2", "scrypt", "argon2", "bcrypt", "chacha20poly1305",
-            "aes-gcm", "rsa", "ed25519-dalek", "x25519-dalek", "curve25519-dalek",
-            "secp256k1", "k256", "p256", "ecdsa", "signature", "rand_core",
-
-            # Game development and graphics
-            "bevy", "macroquad", "ggez", "piston", "winit", "wgpu", "vulkano", "glium",
-            "three-d", "kiss3d", "nalgebra", "cgmath", "glam", "ultraviolet", "mint",
-            "image", "imageproc", "resvg", "tiny-skia", "lyon", "femtovg", "skulpin",
-            # Networking and protocols
-            "socket2", "mio", "polling", "async-io", "calloop", "quinn",
-            "rustls-pemfile", "trust-dns", "hickory-dns", "async-h1", "h2", "h3",
-            "websocket", "tokio-tungstenite", "tungstenite", "ws", "warp-ws",
-
-            # Text processing and parsing
-            "regex", "regex-syntax", "pest", "pest_derive", "nom", "combine", "winnow",
-            "lalrpop", "chumsky", "logos", "lex", "yacc", "tree-sitter", "syntect",
-            "pulldown-cmark", "comrak", "markdown", "ammonia", "scraper", "kuchiki",
-
-            # System programming and OS interfaces
-            "libc", "winapi", "windows", "nix", "users", "sysinfo", "procfs", "psutil",
-            "notify", "inotify", "hotwatch", "signal-hook", "ctrlc", "daemonize",
-            "fork", "shared_memory", "memmap2", "mlock", "caps", "uzers",
-            # Testing and development tools
-            "criterion", "proptest", "quickcheck", "rstest", "serial_test", "mockall",
-            "httpmock", "assert_cmd", "assert_fs", "predicates", "tempfile",
-            "insta", "goldenfile", "similar", "difference", "pretty_assertions",
-
-            # Configuration and environment
-            "config", "figment", "envy", "dotenv", "confy", "directories", "app_dirs",
-            "etcetera", "platform-dirs", "home", "which", "dunce", "normpath",
-
-            # Logging and observability
-            "log", "env_logger", "tracing", "tracing-subscriber", "tracing-futures",
-            "tracing-actix-web", "tracing-log", "slog", "fern", "flexi_logger",
-            "log4rs", "simplelog", "stderrlog", "pretty_env_logger", "fast_log",
-
-            # Time and date
-            "chrono", "time", "humantime", "chrono-tz", "chrono-english", "ical",
-            "cron", "tokio-cron-scheduler", "job_scheduler", "delay_timer",
-
-            # Machine Learning & AI
-            "tokenizers", "safetensors", "linfa", "ndarray", "smartcore", "burn",
-            "tract-core", "tract-onnx", "tract-hir", "tract-linalg", "tract-data",
-            "tract-nne", "tract-onnx-opl", "tract-pulse", "tract-pulse-opl",
-            "tract-nnef-resources", "tch", "torch-sys", "ort", "ort-sys", "candle-core",
-            "candle-nn", "candle-transformers", "candle-kernels", "candle-onnx",
-            "candle-metal-kernels", "tiktoken-rs", "tensorflow", "tensorflow-sys",
-            "onnxruntime", "onnxruntime-sys", "onnx-protobu", "llama-cpp-2",
-            "llama-cpp-sys-2", "llm", "llm-samplers", "llm-chain", "llm-chain-openai", "llama-core", "llamaedge", "openai", "openai-api-rs", "openai_dive",
-            "genai", "aleph-alpha-client", "llm_api_access", "ollama-rs",
-            "rust-bert", "fastembed", "hf-hub", "whisper-rs-sys", "toktrie",
-            "toktrie_hf_tokenizers", "toktrie_hf_downloader", "rust_tokenizers",
-        ]
-
-        if limit is not None:
-            return crates[:limit]
-        return crates
-
-    async def fetch_metadata_batch(
-            self,
-            crate_names: List[str]) -> List[CrateMetadata]:
-        """Fetch metadata for a batch of crates using asyncio-based parallel processing
+    def _get_crate_list(self) -> "List[str]":
+        """
+        Loads the list of crates to process from an external file.
+        This approach is more modular and easier to maintain than a hardcoded list.
+        """
+        crate_list_path = os.path.join(os.path.dirname(__file__), "crate_list.txt")
+        try:
+            with open(crate_list_path) as f:
+                crates = [line.strip() for line in f if line.strip()]
+            logging.info(f"Loaded {len(crates)} crates from {crate_list_path}")
+            if not crates:
+                logging.warning(f"Crate list at {crate_list_path} is empty.")
+            return crates
+        except FileNotFoundError:
+            logging.error(f"Crate list file not found at: {crate_list_path}")
+            return []
+
+    def get_crate_list(self) -> "List[str]":
+        """
+        Public method to get the list of crates.
+        Returns the already loaded crate list or loads it if not available.
+        """
+        if hasattr(self, "crates") and self.crates:
+            return self.crates
+        else:
+            return self._get_crate_list()
 
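`_get_crate_list()` replaces the old hardcoded list with the bundled `crate_list.txt` (424 entries, per the file list above): one crate name per line, blank lines ignored. A small sketch of the same parsing rule:

```python
# Mirrors the parsing in _get_crate_list(): strip each line, skip empty ones.
sample = "serde\ntokio\n\nrand\n"
crates = [line.strip() for line in sample.splitlines() if line.strip()]
assert crates == ["serde", "tokio", "rand"]
```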
-    …
+    async def fetch_metadata_batch(self, crate_names: "List[str]") -> "List[CrateMetadata]":
+        """
+        Fetches metadata for a batch of crates using asyncio-based parallel processing.
         """
-        results = []
 
-        async def fetch_single_crate_safe(
+        async def fetch_single_crate_safe(
+            crate_name: str,
+        ) -> Union[CrateMetadata, None]:
             try:
-                …
+                loop = asyncio.get_running_loop()
+                data = await loop.run_in_executor(
+                    None, self.api_client.fetch_crate_metadata, crate_name
+                )
+                if not data:
+                    return None
+
+                return CrateMetadata(
+                    name=data.get("name", ""),
+                    version=data.get("version", ""),
+                    description=data.get("description", ""),
+                    repository=data.get("repository", ""),
+                    keywords=data.get("keywords", []),
+                    categories=data.get("categories", []),
+                    readme=data.get("readme", ""),
+                    downloads=data.get("downloads", 0),
+                    github_stars=data.get("github_stars", 0),
+                    dependencies=data.get("dependencies", []),
+                    features=data.get("features", {}),
+                    code_snippets=data.get("code_snippets", []),
+                    readme_sections=data.get("readme_sections", {}),
+                    librs_downloads=data.get("librs_downloads"),
+                    source=data.get("source", "crates.io"),
+                )
+
             except Exception as e:
-                logging.error(f"Error fetching {crate_name}: {e}")
+                logging.error(f"Error fetching metadata for {crate_name}: {e}")
                 return None
 
-        # Use asyncio.gather for parallel async processing
         tasks = [fetch_single_crate_safe(name) for name in crate_names]
         results_raw = await asyncio.gather(*tasks)
-        results = [r for r in results_raw if r …
+        results = [r for r in results_raw if r]
+        logging.info(
+            f"Fetched metadata for {len(results)} out of "
+            f"{len(crate_names)} requested crates."
+        )
         return results
 
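`fetch_metadata_batch` keeps the crates.io client synchronous but pushes each blocking `fetch_crate_metadata` call onto the default thread-pool executor, so a whole batch can be awaited with `asyncio.gather`. A self-contained sketch of that pattern using a dummy blocking function (not the real `CrateAPIClient`):

```python
import asyncio
import time


def fetch_crate_metadata(name: str) -> dict:
    """Stand-in for the blocking crates.io call."""
    time.sleep(0.1)  # simulate network latency
    return {"name": name, "version": "1.0.0"}


async def fetch_all(names: list[str]) -> list[dict]:
    loop = asyncio.get_running_loop()
    # Each blocking call runs in the default executor; gather awaits them concurrently.
    tasks = [loop.run_in_executor(None, fetch_crate_metadata, n) for n in names]
    return list(await asyncio.gather(*tasks))


print(asyncio.run(fetch_all(["serde", "tokio", "rand"])))
```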
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
async def enrich_batch(
|
228
|
-
self,
|
229
|
-
batch: List[CrateMetadata]) -> List[EnrichedCrate]:
|
230
|
-
"""Enrich a batch of crates with GitHub stats, enhanced scraping, and AI"""
|
231
|
-
# Add GitHub stats first
|
183
|
+
async def enrich_batch(self, batch: "List[CrateMetadata]") -> "List[EnrichedCrate]":
|
184
|
+
"""Enriches a batch of crates with GitHub stats, enhanced scraping, and AI."""
|
185
|
+
# Update GitHub stats
|
232
186
|
github_repos = [
|
233
|
-
c.repository for c in batch if "github.com" in c.repository
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
187
|
+
c.repository for c in batch if c.repository and "github.com" in c.repository
|
188
|
+
]
|
189
|
+
if github_repos:
|
190
|
+
repo_stats = self.github_client.batch_get_repo_stats(github_repos)
|
191
|
+
for crate in batch:
|
192
|
+
if crate.repository in repo_stats:
|
193
|
+
stats = repo_stats[crate.repository]
|
194
|
+
crate.github_stars = stats.get("stargazers_count", 0)
|
195
|
+
|
196
|
+
# Asynchronously enhance with scraping and AI
|
197
|
+
enrichment_tasks = [self._enrich_single_crate(crate) for crate in batch]
|
198
|
+
enriched_results = await asyncio.gather(*enrichment_tasks)
|
199
|
+
return [result for result in enriched_results if result]
|
200
|
+
|
201
|
+
async def _enrich_single_crate(self, crate: CrateMetadata) -> Union[EnrichedCrate, None]:
|
202
|
+
"""Helper to enrich a single crate with scraping, AI analysis, and cargo analysis."""
|
203
|
+
try:
|
204
|
+
# Enhanced scraping if available
|
205
|
+
if self.enhanced_scraper:
|
206
|
+
await self._enhance_with_scraping(crate)
|
207
|
+
|
208
|
+
# Now enrich with AI
|
209
|
+
enriched = self.enricher.enrich_crate(crate)
|
210
|
+
|
211
|
+
# Add cargo analysis if we have a local crate directory
|
212
|
+
# Note: This would require downloading/cloning the crate first
|
213
|
+
# For now, we'll add a placeholder for cargo analysis
|
214
|
+
enriched.source_analysis = {
|
215
|
+
"cargo_analysis_available": False,
|
216
|
+
"note": "Cargo analysis requires local crate source code"
|
217
|
+
}
|
218
|
+
|
219
|
+
logging.info(f"Enriched {crate.name}")
|
220
|
+
return enriched
|
221
|
+
except Exception as e:
|
222
|
+
logging.error(f"Failed to enrich {crate.name}: {e}")
|
223
|
+
# Return a partially enriched crate to avoid data loss
|
224
|
+
enriched_dict = crate.to_dict()
|
225
|
+
return EnrichedCrate(**enriched_dict)
|
226
|
+
|
227
|
+
async def _enhance_with_scraping(self, crate: CrateMetadata) -> None:
|
228
|
+
"""
|
229
|
+
Enhances a single crate with advanced web scraping data.
|
230
|
+
Modifies the crate object in place.
|
231
|
+
"""
|
232
|
+
if not self.enhanced_scraper:
|
233
|
+
return
|
275
234
|
|
235
|
+
try:
|
236
|
+
scraping_results = await self.enhanced_scraper.scrape_crate_documentation(crate.name)
|
237
|
+
if scraping_results:
|
238
|
+
self._integrate_scraping_results(crate, scraping_results)
|
276
239
|
logging.info(
|
277
|
-
f"Enhanced scraping for {crate.name}:
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
# Add enhanced scraping data
|
295
|
-
enhanced_crate.enhanced_scraping = {}
|
240
|
+
f"Enhanced scraping for {crate.name}: "
|
241
|
+
f"{len(scraping_results)} sources"
|
242
|
+
)
|
243
|
+
except Exception as e:
|
244
|
+
logging.warning(f"Enhanced scraping failed for {crate.name}: {e}")
|
245
|
+
|
246
|
+
def _integrate_scraping_results(
|
247
|
+
self,
|
248
|
+
crate: CrateMetadata,
|
249
|
+
scraping_results: "Dict[str, Any]",
|
250
|
+
) -> None:
|
251
|
+
"""
|
252
|
+
Integrates enhanced scraping results into the crate metadata.
|
253
|
+
Modifies the crate object in place.
|
254
|
+
"""
|
255
|
+
crate.enhanced_scraping = {}
|
296
256
|
|
297
257
|
for source, result in scraping_results.items():
|
298
|
-
if result.error:
|
258
|
+
if not result or result.error:
|
299
259
|
continue
|
300
260
|
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
}
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
logging.info(
|
314
|
-
f"Updated README for {crate.name} from {source}")
|
261
|
+
crate.enhanced_scraping[source] = {
|
262
|
+
"title": result.title,
|
263
|
+
"quality_score": result.quality_score,
|
264
|
+
"extraction_method": result.extraction_method,
|
265
|
+
"structured_data": result.structured_data,
|
266
|
+
"content_length": len(result.content),
|
267
|
+
}
|
268
|
+
# Update README if we got better content
|
269
|
+
if source == "docs_rs" and result.quality_score > 0.7:
|
270
|
+
if not crate.readme or len(result.content) > len(crate.readme):
|
271
|
+
crate.readme = result.content
|
272
|
+
logging.info(f"Updated README for {crate.name} from {source}")
|
315
273
|
|
316
274
|
# Extract additional metadata from structured data
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
def analyze_dependencies(self, crates: List[EnrichedCrate]) -> Dict:
|
334
|
-
"""Analyze dependencies between crates"""
|
275
|
+
structured_data = result.structured_data or {}
|
276
|
+
if "features" in structured_data and isinstance(
|
277
|
+
structured_data["features"], list
|
278
|
+
):
|
279
|
+
crate.enhanced_features = structured_data["features"]
|
280
|
+
if "dependencies" in structured_data and isinstance(
|
281
|
+
structured_data["dependencies"], list
|
282
|
+
):
|
283
|
+
crate.enhanced_dependencies = structured_data["dependencies"]
|
284
|
+
if "examples" in structured_data and isinstance(
|
285
|
+
structured_data["examples"], list
|
286
|
+
):
|
287
|
+
crate.code_snippets.extend(structured_data["examples"])
|
288
|
+
|
289
|
+
def analyze_dependencies(self, crates: "List[EnrichedCrate]") -> "Dict[str, Any]":
|
290
|
+
"""Analyze dependencies between crates."""
|
335
291
|
return DependencyAnalyzer.analyze_dependencies(crates)
|
336
292
|
|
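After `_integrate_scraping_results` runs, each crate carries an `enhanced_scraping` dict keyed by source, built from the `ScrapingResult` fields used above. An illustrative entry (values invented for the example; the `extraction_method` label depends on the scraper backend):

```python
enhanced_scraping_example = {
    "docs_rs": {
        "title": "serde - Rust",
        "quality_score": 0.85,            # docs_rs scores above 0.7 may replace the README
        "extraction_method": "crawl4ai",  # assumed label, backend-dependent
        "structured_data": {"features": ["derive"], "examples": []},
        "content_length": 12345,
    },
}
```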
-    def save_checkpoint(self, data: List[EnrichedCrate], prefix: str):
-        """ …
+    def save_checkpoint(self, data: "List[EnrichedCrate]", prefix: str) -> str:
+        """Saves a processing checkpoint to a file."""
         timestamp = time.strftime("%Y%m%d-%H%M%S")
         filename = os.path.join(self.output_dir, f"{prefix}_{timestamp}.jsonl")
 
         with open(filename, "w") as f:
             for item in data:
-                …
-                item_dict = item.__dict__.copy()
-                f.write(json.dumps(item_dict) + "\n")
-
-        # Save status metadata
-        status = {
-            "timestamp": timestamp,
-            "total_crates": len(data),
-            "processed_crates": sum(
-                1 for c in data if c.use_case is not None),
-            "advanced_analysis": sum(
-                1 for c in data if c.source_analysis is not None),
-            "checkpoint_file": filename}
-
-        status_file = os.path.join(
-            self.output_dir,
-            f"{prefix}_status_{timestamp}.json")
-        with open(status_file, "w") as f:
-            json.dump(status, f, indent=2)
+                f.write(json.dumps(item.to_dict(), cls=CustomJSONEncoder) + "\n")
 
         logging.info(f"Saved checkpoint to {filename}")
         return filename
 
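`save_checkpoint` now writes one JSON object per line via `to_dict()` and `CustomJSONEncoder`, and the old `*_status_*.json` sidecar is gone. Reading a checkpoint back is therefore a plain JSONL scan (the path below is only illustrative; the method returns the real filename):

```python
import json

# Illustrative path; save_checkpoint() returns the actual file it wrote.
checkpoint_path = "output/crate_data_20250101-120000/checkpoint_batch_1_20250101-120500.jsonl"

with open(checkpoint_path) as f:
    records = [json.loads(line) for line in f if line.strip()]

print(f"{len(records)} enriched crates in checkpoint")
```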
     def save_final_output(
-            …
-        """Save final enriched data and analysis"""
+        self, data: "List[EnrichedCrate]", dependency_data: "Dict[str, Any]"
+    ) -> None:
+        """Saves the final enriched data and analysis reports."""
         timestamp = time.strftime("%Y%m%d-%H%M%S")
 
         # Save main enriched data
-        …
-            self.output_dir,
-            …
-        with open( …
+        final_output_path = os.path.join(
+            self.output_dir, f"enriched_crate_metadata_{timestamp}.jsonl"
+        )
+        with open(final_output_path, "w") as f:
             for item in data:
-                …
-                f.write(json.dumps(item_dict) + "\n")
+                f.write(json.dumps(item.to_dict(), cls=CustomJSONEncoder) + "\n")
 
         # Save dependency analysis
-        …
-            self.output_dir,
-            …
-        with open( …
+        dep_file_path = os.path.join(
+            self.output_dir, f"dependency_analysis_{timestamp}.json"
+        )
+        with open(dep_file_path, "w") as f:
             json.dump(dependency_data, f, indent=2)
 
-        # Generate summary report
+        # Generate and save summary report
+        self._generate_summary_report(data, dependency_data, timestamp)
+
+        logging.info(f"Results saved to {self.output_dir}/")
+
+    def _generate_summary_report(
+        self,
+        data: "List[EnrichedCrate]",
+        dependency_data: "Dict[str, Any]",
+        timestamp: str,
+    ) -> None:
+        """Generates a summary report of the pipeline run."""
         summary = {
             "total_crates": len(data),
             "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
-            "most_popular": sorted(
-                …
+            "most_popular": sorted(
+                [
+                    {
+                        "name": c.name,
+                        "score": c.score or 0,
+                        "downloads": c.downloads,
+                        "github_stars": c.github_stars,
+                    }
+                    for c in data
+                ],
+                key=lambda x: x.get("score", 0),
+                reverse=True,
+            )[:10],
+            "most_depended_upon": dependency_data.get("most_depended", [])[:10],
         }
 
-        …
-            f"summary_report_{timestamp}.json")
-        with open(summary_file, "w") as f:
+        summary_path = os.path.join(self.output_dir, f"summary_report_{timestamp}.json")
+        with open(summary_path, "w") as f:
             json.dump(summary, f, indent=2)
 
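`_generate_summary_report` writes `summary_report_<timestamp>.json` with exactly the keys assembled above. A rough sketch of the resulting shape (values illustrative; the `most_depended_upon` element format comes from `DependencyAnalyzer`, which this diff does not show):

```python
summary_example = {
    "total_crates": 424,
    "timestamp": "2025-01-01 12:05:00",
    "most_popular": [
        {"name": "serde", "score": 9.7, "downloads": 250_000_000, "github_stars": 9_000},
        # ... up to 10 entries, sorted by score descending
    ],
    "most_depended_upon": [],  # up to 10 entries from dependency_data["most_depended"]
}
```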
-    …
-    async def run(self):
-        """Main pipeline execution flow (async)"""
+    async def run(self) -> Union["tuple[List[EnrichedCrate], Dict[str, Any]]", None]:
+        """Main pipeline execution flow."""
         start_time = time.time()
+        if not self.crates:
+            logging.error("No crates to process. Exiting.")
+            return None
+
         logging.info(f"Processing {len(self.crates)} crates...")
 
-        …
-        crate_batches = [
-            …
+        all_enriched: "List[EnrichedCrate]" = []
+        batch_size = self.config.batch_size
+        crate_batches = [
+            self.crates[i : i + batch_size]
+            for i in range(0, len(self.crates), batch_size)
+        ]
 
-        for …
+        for i, batch_names in enumerate(crate_batches):
             logging.info(
-                f"Processing batch { …
-                …
+                f"Processing batch {i + 1}/{len(crate_batches)} "
+                f"({len(batch_names)} crates)"
+            )
+
+            # Fetch metadata
+            metadata_batch = await self.fetch_metadata_batch(batch_names)
+            if not metadata_batch:
+                logging.warning(f"Batch {i+1} yielded no metadata. Skipping.")
+                continue
 
-            # Enrich the batch
-            enriched_batch = await self.enrich_batch( …
+            # Enrich the batch
+            enriched_batch = await self.enrich_batch(metadata_batch)
             all_enriched.extend(enriched_batch)
 
-            # Save checkpoint
-            self.save_checkpoint(all_enriched, " …
+            # Save checkpoint
+            self.save_checkpoint(all_enriched, f"checkpoint_batch_{i + 1}")
             logging.info(
-                f"Completed batch { …
-                …
-            try:
-                crate.source_analysis = SourceAnalyzer.analyze_crate_source(
-                    crate)
-                crate.security = SecurityAnalyzer.check_security_metrics(
-                    crate)
-                crate.user_behavior = UserBehaviorAnalyzer.fetch_user_behavior_data(
-                    crate)
-                logging.info(
-                    f"Advanced analysis completed for {crate.name}")
-            except Exception as e:
-                logging.warning(
-                    f"Advanced analysis failed for {crate.name}: {str(e)}")
-
-        # Step 3: Perform dependency analysis
+                f"Completed batch {i + 1}, "
+                f"processed {len(all_enriched)}/{len(self.crates)} crates"
+            )
+
+        # Final analysis and saving
         logging.info("Analyzing crate dependencies...")
         dependency_analysis = self.analyze_dependencies(all_enriched)
-
-        # Save final results
         self.save_final_output(all_enriched, dependency_analysis)
 
-        # Final summary
         duration = time.time() - start_time
-        logging.info(
-            f"✅ Done. Enriched {len(all_enriched)} crates in {duration:.2f}s")
-
+        logging.info(f"[OK] Done. Enriched {len(all_enriched)} crates in {duration:.2f}s")
         return all_enriched, dependency_analysis