rust-crate-pipeline 1.1.0 (rust_crate_pipeline-1.1.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rust_crate_pipeline/__init__.py +52 -0
- rust_crate_pipeline/__main__.py +6 -0
- rust_crate_pipeline/ai_processing.py +396 -0
- rust_crate_pipeline/analysis.py +435 -0
- rust_crate_pipeline/config.py +46 -0
- rust_crate_pipeline/main.py +177 -0
- rust_crate_pipeline/network.py +307 -0
- rust_crate_pipeline/pipeline.py +260 -0
- rust_crate_pipeline/utils/file_utils.py +72 -0
- rust_crate_pipeline/utils/logging_utils.py +66 -0
- rust_crate_pipeline/version.py +13 -0
- rust_crate_pipeline-1.1.0.dist-info/METADATA +473 -0
- rust_crate_pipeline-1.1.0.dist-info/RECORD +17 -0
- rust_crate_pipeline-1.1.0.dist-info/WHEEL +5 -0
- rust_crate_pipeline-1.1.0.dist-info/entry_points.txt +2 -0
- rust_crate_pipeline-1.1.0.dist-info/licenses/LICENSE +21 -0
- rust_crate_pipeline-1.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,307 @@
+# network.py
+import os
+import re
+import time
+import logging
+import requests
+from requests_cache import CachedSession
+from bs4 import BeautifulSoup
+from typing import Dict, List, Optional
+from .config import PipelineConfig
+
+class GitHubBatchClient:
+    def __init__(self, config: PipelineConfig):
+        self.config = config
+        self.headers = {"Accept": "application/vnd.github.v3+json"}
+        if config.github_token:
+            self.headers["Authorization"] = f"token {config.github_token}"
+
+        self.session = CachedSession(
+            'github_cache',
+            expire_after=config.cache_ttl * 2  # Longer cache for GitHub
+        )
+        self.remaining_calls = 5000
+        self.reset_time = 0
+
+    def check_rate_limit(self):
+        """Check and update current rate limit status"""
+        try:
+            response = self.session.get("https://api.github.com/rate_limit", headers=self.headers)
+            if response.ok:
+                data = response.json()
+                self.remaining_calls = data["resources"]["core"]["remaining"]
+                self.reset_time = data["resources"]["core"]["reset"]
+
+                if self.remaining_calls < 100:
+                    reset_in = self.reset_time - time.time()
+                    logging.warning(f"GitHub API rate limit low: {self.remaining_calls} remaining. Resets in {reset_in/60:.1f} minutes")
+        except Exception:
+            pass
+
+    def get_repo_stats(self, owner: str, repo: str) -> Dict:
+        """Get repository statistics"""
+        try:
+            url = f"https://api.github.com/repos/{owner}/{repo}"
+            response = self.session.get(url, headers=self.headers)
+            if response.ok:
+                return response.json()
+            else:
+                logging.warning(f"Failed to get repo stats for {owner}/{repo}: {response.status_code}")
+                return {}
+        except Exception as e:
+            logging.error(f"Error fetching repo stats: {str(e)}")
+            return {}
+
+    def batch_get_repo_stats(self, repo_list: List[str]) -> Dict[str, Dict]:
+        """Get statistics for multiple repositories in a batch"""
+        self.check_rate_limit()
+
+        results = {}
+        for repo_url in repo_list:
+            # Extract owner/repo from URL
+            match = re.search(r"github\.com/([^/]+)/([^/\.]+)", repo_url)
+            if not match:
+                continue
+
+            owner, repo = match.groups()
+            repo = repo.split('.')[0]  # Remove .git extension if present
+
+            # Get stats
+            stats = self.get_repo_stats(owner, repo)
+            results[repo_url] = stats
+
+            # Be nice to GitHub API
+            time.sleep(0.1)
+
+        return results
+
+class CrateAPIClient:
+    def __init__(self, config: PipelineConfig):
+        self.config = config
+        self.session = CachedSession('crate_cache', expire_after=config.cache_ttl)
+
+    def fetch_crate_metadata(self, crate_name: str) -> Optional[Dict]:
+        """Fetch metadata with retry logic"""
+        for attempt in range(self.config.max_retries):
+            try:
+                return self._fetch_metadata(crate_name)
+            except Exception as e:
+                logging.warning(f"Attempt {attempt+1} failed for {crate_name}: {str(e)}")
+                wait = 2 ** attempt
+                time.sleep(wait)
+        return None
+
+    def _fetch_metadata(self, crate_name: str) -> Optional[Dict]:
+        """Enhanced metadata fetching that tries multiple sources"""
+        # First try crates.io (primary source)
+        try:
+            r = self.session.get(f"https://crates.io/api/v1/crates/{crate_name}")
+            if r.ok:
+                data = r.json()
+                crate_data = data["crate"]
+                latest = crate_data["newest_version"]
+
+                # Get readme
+                readme_response = self.session.get(f"https://crates.io/api/v1/crates/{crate_name}/readme")
+                readme = readme_response.text if readme_response.ok else ""
+
+                # Get dependencies
+                deps_response = self.session.get(f"https://crates.io/api/v1/crates/{crate_name}/{latest}/dependencies")
+                deps = deps_response.json().get("dependencies", []) if deps_response.ok else []
+
+                # Get features - using the versions endpoint
+                features = []
+                versions_response = self.session.get(f"https://crates.io/api/v1/crates/{crate_name}/{latest}")
+                if versions_response.ok:
+                    version_data = versions_response.json().get("version", {})
+                    features_dict = version_data.get("features", {})
+                    features = [{"name": k, "dependencies": v} for k, v in features_dict.items()]
+
+                # Repository info and GitHub stars
+                repo = crate_data.get("repository", "")
+                gh_stars = 0
+
+                # Check if it's a GitHub repo
+                if "github.com" in repo and self.config.github_token:
+                    match = re.search(r"github.com/([^/]+)/([^/]+)", repo)
+                    if match:
+                        owner, repo_name = match.groups()
+                        repo_name = repo_name.split('.')[0]  # Handle .git extensions
+                        gh_url = f"https://api.github.com/repos/{owner}/{repo_name}"
+                        gh_headers = {"Authorization": f"token {self.config.github_token}"} if self.config.github_token else {}
+                        gh = self.session.get(gh_url, headers=gh_headers)
+                        if gh.ok:
+                            gh_data = gh.json()
+                            gh_stars = gh_data.get("stargazers_count", 0)
+
+                # Check if it's hosted on lib.rs
+                lib_rs_data = {}
+                if "lib.rs" in repo:
+                    lib_rs_url = f"https://lib.rs/crates/{crate_name}"
+                    lib_rs_response = self.session.get(lib_rs_url)
+                    if lib_rs_response.ok:
+                        soup = BeautifulSoup(lib_rs_response.text, 'html.parser')
+                        # Get README from lib.rs if not already available
+                        if not readme:
+                            readme_div = soup.find('div', class_='readme')
+                            if readme_div:
+                                readme = readme_div.get_text(strip=True)
+
+                        # Get lib.rs specific stats
+                        stats_div = soup.find('div', class_='crate-stats')
+                        if stats_div:
+                            downloads_text = stats_div.find(string=re.compile(r'[\d,]+ downloads'))
+                            if downloads_text:
+                                lib_rs_data["librs_downloads"] = int(re.sub(r'[^\d]', '', downloads_text))
+
+                # Extract code snippets from readme
+                code_snippets = self.extract_code_snippets(readme)
+
+                # Extract sections from readme
+                readme_sections = self.extract_readme_sections(readme) if readme else {}
+
+                result = {
+                    "name": crate_name,
+                    "version": latest,
+                    "description": crate_data.get("description", ""),
+                    "repository": repo,
+                    "keywords": crate_data.get("keywords", []),
+                    "categories": crate_data.get("categories", []),
+                    "readme": readme,
+                    "downloads": crate_data.get("downloads", 0),
+                    "github_stars": gh_stars,
+                    "dependencies": deps,
+                    "code_snippets": code_snippets,
+                    "features": features,
+                    "readme_sections": readme_sections,
+                    **lib_rs_data
+                }
+
+                return result
+
+        except Exception as e:
+            logging.error(f"Failed fetching metadata for {crate_name}: {str(e)}")
+            raise
+
+        # If crates.io fails, try lib.rs
+        try:
+            r = self.session.get(f"https://lib.rs/crates/{crate_name}")
+            if r.ok:
+                soup = BeautifulSoup(r.text, 'html.parser')
+
+                # Extract metadata from lib.rs page
+                name = soup.select_one('h1').text.strip() if soup.select_one('h1') else crate_name
+
+                # Find description
+                desc_elem = soup.select_one('.description')
+                description = desc_elem.text.strip() if desc_elem else ""
+
+                # Find repository link
+                repo_link = None
+                for a in soup.select('a'):
+                    if 'github.com' in a.get('href', ''):
+                        repo_link = a['href']
+                        break
+
+                # Basic metadata from lib.rs
+                return {
+                    "name": name,
+                    "version": "latest",  # lib.rs doesn't easily expose version
+                    "description": description,
+                    "repository": repo_link or "",
+                    "keywords": [],
+                    "categories": [],
+                    "readme": "",
+                    "downloads": 0,
+                    "github_stars": 0,
+                    "dependencies": [],
+                    "code_snippets": [],
+                    "features": [],
+                    "readme_sections": {},
+                    "source": "lib.rs",
+                }
+        except Exception:
+            pass
+
+        # Finally, try GitHub search
+        try:
+            # This is a simplification - GitHub's search API requires authentication
+            headers = {}
+            if self.config.github_token:
+                headers["Authorization"] = f"token {self.config.github_token}"
+
+            search_url = f"https://api.github.com/search/repositories?q={crate_name}+language:rust"
+            r = requests.get(search_url, headers=headers)
+
+            if r.ok:
+                results = r.json().get("items", [])
+                if results:
+                    repo = results[0]  # Take first match
+
+                    # Basic metadata from GitHub
+                    return {
+                        "name": crate_name,
+                        "version": "unknown",
+                        "description": repo.get("description", ""),
+                        "repository": repo.get("html_url", ""),
+                        "keywords": [],
+                        "categories": [],
+                        "readme": "",
+                        "downloads": 0,
+                        "github_stars": repo.get("stargazers_count", 0),
+                        "dependencies": [],
+                        "code_snippets": [],
+                        "features": [],
+                        "readme_sections": {},
+                        "source": "github",
+                    }
+        except Exception:
+            pass
+
+        # If all sources fail
+        return None
+
+    def extract_code_snippets(self, readme: str) -> List[str]:
+        """Extract code snippets from markdown README"""
+        snippets = []
+        if not readme:
+            return snippets
+
+        # Find Rust code blocks
+        pattern = r"```(?:rust|(?:no_run|ignore|compile_fail|mdbook-runnable)?)\s*([\s\S]*?)```"
+        matches = re.findall(pattern, readme)
+
+        for code in matches:
+            if len(code.strip()) > 10:  # Only include non-trivial snippets
+                snippets.append(code.strip())
+
+        return snippets[:5]  # Limit to 5 snippets
+
+    def extract_readme_sections(self, readme: str) -> Dict[str, str]:
+        """Extract sections from README based on markdown headers"""
+        if not readme:
+            return {}
+
+        sections = {}
+        lines = readme.split('\n')
+        current_section = ""
+        current_content = []
+
+        for line in lines:
+            if re.match(r'^#+\s+', line):  # It's a header
+                # Save previous section
+                if current_section and current_content:
+                    sections[current_section] = '\n'.join(current_content).strip()
+
+                # Start new section
+                current_section = re.sub(r'^#+\s+', '', line).strip()
+                current_content = []
+            else:
+                if current_section:  # Only collect content if we have a section
+                    current_content.append(line)
+
+        # Don't forget the last section
+        if current_section and current_content:
+            sections[current_section] = '\n'.join(current_content).strip()
+
+        return sections
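For orientation, a minimal sketch of how these two clients might be exercised on their own. This is not part of the packaged code; it assumes PipelineConfig can be constructed with defaults and exposes the github_token, cache_ttl, and max_retries fields that network.py reads.

    from rust_crate_pipeline.config import PipelineConfig
    from rust_crate_pipeline.network import CrateAPIClient, GitHubBatchClient

    # Assumption: PipelineConfig() supplies usable defaults (cache_ttl, max_retries, github_token).
    config = PipelineConfig()

    # Fetch crates.io metadata; _fetch_metadata falls back to lib.rs, then GitHub search.
    client = CrateAPIClient(config)
    meta = client.fetch_crate_metadata("serde")
    if meta:
        print(meta["name"], meta["version"], len(meta.get("dependencies", [])))

    # Batch GitHub stats for a list of repository URLs.
    gh = GitHubBatchClient(config)
    stats = gh.batch_get_repo_stats(["https://github.com/serde-rs/serde"])
    print(stats)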
@@ -0,0 +1,260 @@
+# pipeline.py
+import os
+import time
+import logging
+import json
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from tqdm import tqdm
+from typing import List, Dict, Optional
+from .config import PipelineConfig, CrateMetadata, EnrichedCrate
+from .network import CrateAPIClient, GitHubBatchClient
+from .ai_processing import LLMEnricher
+from .analysis import SourceAnalyzer, SecurityAnalyzer, UserBehaviorAnalyzer, DependencyAnalyzer
+
+class CrateDataPipeline:
+    def __init__(self, config: PipelineConfig):
+        self.config = config
+        self.api_client = CrateAPIClient(config)
+        self.github_client = GitHubBatchClient(config)
+        self.enricher = LLMEnricher(config)
+        self.crates = self.get_crate_list()
+        self.output_dir = self._create_output_dir()
+
+    def _create_output_dir(self) -> str:
+        timestamp = time.strftime("%Y%m%d-%H%M%S")
+        output_dir = f"crate_data_{timestamp}"
+        os.makedirs(output_dir, exist_ok=True)
+        return output_dir
+
+    def get_crate_list(self, limit: Optional[int] = None) -> List[str]:
+        """Return a comprehensive list of all high-value crates to process"""
+        crates = [
+            # Web frameworks
+            "actix-web", "rocket", "axum", "warp", "tower",
+
+            # Async runtimes and utilities
+            "tokio", "tokio-stream", "async-trait", "futures",
+
+            # Serialization/deserialization
+            "serde", "serde_json", "serde_yaml", "bincode",
+
+            # Error handling
+            "anyhow", "thiserror",
+
+            # Utilities
+            "rand", "uuid", "chrono", "regex", "log", "env_logger", "clap", "crossterm",
+            "itertools", "num", "cfg-if", "bytes", "mime", "form_urlencoded", "parking_lot",
+            "csv", "lazy_static", "once_cell", "tracing", "base64", "sha2", "flate2", "tar",
+
+            # HTTP clients and servers
+            "reqwest", "hyper",
+
+            # Database
+            "sqlx", "diesel", "postgres", "rusqlite",
+
+            # Concurrency
+            "rayon",
+
+            # Protocol buffers and gRPC
+            "prost", "tonic",
+
+            # Procedural macros
+            "syn", "quote", "proc-macro2",
+
+            # Machine Learning & AI
+            "tokenizers", "safetensors", "linfa", "ndarray", "smartcore", "burn",
+            "tract-core", "tract-onnx", "tract-hir", "tract-linalg", "tract-data",
+            "tract-nnef", "tract-onnx-opl", "tract-pulse", "tract-pulse-opl",
+            "tract-nnef-resources", "tch", "torch-sys", "ort", "ort-sys", "candle-core",
+            "candle-nn", "candle-transformers", "candle-kernels", "candle-onnx",
+            "candle-metal-kernels", "tiktoken-rs", "tensorflow", "tensorflow-sys",
+            "onnxruntime", "onnxruntime-sys", "onnx-protobuf", "llama-cpp-2",
+            "llama-cpp-sys-2", "llm", "llm-samplers", "llm-chain", "llm-chain-openai",
+            "llama-core", "llamaedge", "openai", "openai-api-rs", "openai_dive",
+            "genai", "aleph-alpha-client", "llm_api_access", "ollama-rs",
+            "rust-bert", "fastembed", "hf-hub", "whisper-rs-sys", "toktrie",
+            "toktrie_hf_tokenizers", "toktrie_hf_downloader", "rust_tokenizers",
+        ]
+
+        if limit is not None:
+            return crates[:limit]
+        return crates
+
+    def fetch_metadata_batch(self, crate_names: List[str]) -> List[CrateMetadata]:
+        """Fetch metadata for a batch of crates in parallel"""
+        with ThreadPoolExecutor(max_workers=self.config.n_workers) as executor:
+            futures = {executor.submit(self.api_client.fetch_crate_metadata, name): name
+                       for name in crate_names}
+
+            results = []
+            for future in as_completed(futures):
+                crate_name = futures[future]
+                try:
+                    data = future.result()
+                    if data:
+                        # Convert dict to CrateMetadata
+                        crate_metadata = CrateMetadata(
+                            name=data.get("name", ""),
+                            version=data.get("version", ""),
+                            description=data.get("description", ""),
+                            repository=data.get("repository", ""),
+                            keywords=data.get("keywords", []),
+                            categories=data.get("categories", []),
+                            readme=data.get("readme", ""),
+                            downloads=data.get("downloads", 0),
+                            github_stars=data.get("github_stars", 0),
+                            dependencies=data.get("dependencies", []),
+                            features=data.get("features", []),
+                            code_snippets=data.get("code_snippets", []),
+                            readme_sections=data.get("readme_sections", {}),
+                            librs_downloads=data.get("librs_downloads"),
+                            source=data.get("source", "crates.io")
+                        )
+                        results.append(crate_metadata)
+                        logging.info(f"Fetched metadata for {crate_name}")
+                except Exception as e:
+                    logging.error(f"Failed to fetch metadata for {crate_name}: {str(e)}")
+
+            return results
+
+    def enrich_batch(self, batch: List[CrateMetadata]) -> List[EnrichedCrate]:
+        """Enrich a batch of crates with GitHub stats and AI"""
+        # Add GitHub stats first
+        github_repos = [c.repository for c in batch if "github.com" in c.repository]
+        repo_stats = self.github_client.batch_get_repo_stats(github_repos)
+
+        # Update crates with GitHub info
+        for crate in batch:
+            repo_url = crate.repository
+            if repo_url in repo_stats:
+                stats = repo_stats[repo_url]
+                crate.github_stars = stats.get("stargazers_count", 0)
+
+        # Now enrich with AI
+        enriched_batch = []
+        for crate in batch:
+            try:
+                enriched = self.enricher.enrich_crate(crate)
+                enriched_batch.append(enriched)
+                logging.info(f"Enriched {crate.name}")
+            except Exception as e:
+                logging.error(f"Failed to enrich {crate.name}: {str(e)}")
+                # Add the crate with just the fields we have
+                enriched_dict = crate.__dict__.copy()
+                enriched_batch.append(EnrichedCrate(**enriched_dict))
+
+        return enriched_batch
+
+    def analyze_dependencies(self, crates: List[EnrichedCrate]) -> Dict:
+        """Analyze dependencies between crates"""
+        return DependencyAnalyzer.analyze_dependencies(crates)
+
+    def save_checkpoint(self, data: List[EnrichedCrate], prefix: str):
+        """Save processing checkpoint with status metadata"""
+        timestamp = time.strftime("%Y%m%d-%H%M%S")
+        filename = os.path.join(self.output_dir, f"{prefix}_{timestamp}.jsonl")
+
+        with open(filename, "w") as f:
+            for item in data:
+                # Convert to dict for serialization
+                item_dict = item.__dict__.copy()
+                f.write(json.dumps(item_dict) + "\n")
+
+        # Save status metadata
+        status = {
+            "timestamp": timestamp,
+            "total_crates": len(data),
+            "processed_crates": sum(1 for c in data if c.use_case is not None),
+            "advanced_analysis": sum(1 for c in data if c.source_analysis is not None),
+            "checkpoint_file": filename
+        }
+
+        status_file = os.path.join(self.output_dir, f"{prefix}_status_{timestamp}.json")
+        with open(status_file, "w") as f:
+            json.dump(status, f, indent=2)
+
+        logging.info(f"Saved checkpoint to {filename}")
+        return filename
+
+    def save_final_output(self, data: List[EnrichedCrate], dependency_data: Dict):
+        """Save final enriched data and analysis"""
+        timestamp = time.strftime("%Y%m%d-%H%M%S")
+
+        # Save main enriched data
+        final_output = os.path.join(self.output_dir, f"enriched_crate_metadata_{timestamp}.jsonl")
+        with open(final_output, "w") as f:
+            for item in data:
+                item_dict = item.__dict__.copy()
+                f.write(json.dumps(item_dict) + "\n")
+
+        # Save dependency analysis
+        dep_file = os.path.join(self.output_dir, f"dependency_analysis_{timestamp}.json")
+        with open(dep_file, "w") as f:
+            json.dump(dependency_data, f, indent=2)
+
+        # Generate summary report
+        summary = {
+            "total_crates": len(data),
+            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
+            "most_popular": sorted([{
+                "name": c.name,
+                "score": c.score or 0,
+                "downloads": c.downloads,
+                "github_stars": c.github_stars
+            } for c in data], key=lambda x: x["score"], reverse=True)[:5],
+            "most_depended_upon": dependency_data.get("most_depended", [])[:5]
+        }
+
+        summary_file = os.path.join(self.output_dir, f"summary_report_{timestamp}.json")
+        with open(summary_file, "w") as f:
+            json.dump(summary, f, indent=2)
+
+        logging.info(f"Results saved to {self.output_dir}/")
+
+    def run(self):
+        """Main pipeline execution flow"""
+        start_time = time.time()
+        logging.info(f"Processing {len(self.crates)} crates...")
+
+        # Process in batches
+        all_enriched = []
+        crate_batches = [self.crates[i:i+self.config.batch_size]
+                         for i in range(0, len(self.crates), self.config.batch_size)]
+
+        for batch_num, batch in enumerate(crate_batches):
+            logging.info(f"Processing batch {batch_num+1}/{len(crate_batches)} ({len(batch)} crates)")
+
+            # Fetch metadata
+            batch_data = self.fetch_metadata_batch(batch)
+
+            # Enrich the batch
+            enriched_batch = self.enrich_batch(batch_data)
+            all_enriched.extend(enriched_batch)
+
+            # Save checkpoint after each batch
+            self.save_checkpoint(all_enriched, "batch_checkpoint")
+            logging.info(f"Completed batch {batch_num+1}, processed {len(all_enriched)}/{len(self.crates)} crates so far")
+
+            # Optional: Add source analysis for some crates
+            if batch_num < 2:  # Only do detailed analysis for first 2 batches
+                for crate in enriched_batch:
+                    try:
+                        crate.source_analysis = SourceAnalyzer.analyze_crate_source(crate)
+                        crate.security = SecurityAnalyzer.check_security_metrics(crate)
+                        crate.user_behavior = UserBehaviorAnalyzer.fetch_user_behavior_data(crate)
+                        logging.info(f"Advanced analysis completed for {crate.name}")
+                    except Exception as e:
+                        logging.warning(f"Advanced analysis failed for {crate.name}: {str(e)}")
+
+        # Step 3: Perform dependency analysis
+        logging.info("Analyzing crate dependencies...")
+        dependency_analysis = self.analyze_dependencies(all_enriched)
+
+        # Save final results
+        self.save_final_output(all_enriched, dependency_analysis)
+
+        # Final summary
+        duration = time.time() - start_time
+        logging.info(f"✅ Done. Enriched {len(all_enriched)} crates in {duration:.2f}s")
+
+        return all_enriched, dependency_analysis
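A minimal driver sketch, again not part of the diff, assuming PipelineConfig defaults cover the n_workers and batch_size fields the pipeline reads; the package's real entry point is rust_crate_pipeline/main.py, listed above but not shown in this section.

    import logging
    from rust_crate_pipeline.config import PipelineConfig
    from rust_crate_pipeline.pipeline import CrateDataPipeline

    logging.basicConfig(level=logging.INFO)

    # Assumption: default config; a GitHub token and a working LLMEnricher backend
    # may be required in practice for full enrichment.
    config = PipelineConfig()
    pipeline = CrateDataPipeline(config)

    # run() batches the crate list, enriches each batch, checkpoints after every
    # batch, and writes the final JSONL plus dependency analysis to output_dir.
    enriched, dependency_analysis = pipeline.run()
    print(f"Enriched {len(enriched)} crates; output in {pipeline.output_dir}/")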
@@ -0,0 +1,72 @@
+# rust_crate_pipeline/utils/file_utils.py
+import os
+import json
+import shutil
+from datetime import datetime
+from typing import List, Dict, Any
+
+def create_output_dir(base_name: str = "crate_data") -> str:
+    """
+    Create timestamped output directory
+
+    Args:
+        base_name: Base name for output directory
+
+    Returns:
+        Path to created directory
+    """
+    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+    output_dir = f"{base_name}_{timestamp}"
+    os.makedirs(output_dir, exist_ok=True)
+    return output_dir
+
+def save_checkpoint(data: List[Dict], prefix: str, output_dir: str) -> str:
+    """
+    Save processing checkpoint with status metadata
+
+    Args:
+        data: List of crate dictionaries
+        prefix: File name prefix
+        output_dir: Target directory
+
+    Returns:
+        Path to saved checkpoint file
+    """
+    timestamp = datetime.now().isoformat()
+    filename = os.path.join(output_dir, f"{prefix}_{timestamp}.jsonl")
+
+    with open(filename, "w") as f:
+        for item in data:
+            f.write(json.dumps(item) + "\n")
+
+    # Save status metadata
+    status = {
+        "timestamp": timestamp,
+        "total_items": len(data),
+        "checkpoint_file": filename
+    }
+
+    status_file = os.path.join(output_dir, f"{prefix}_status_{timestamp}.json")
+    with open(status_file, "w") as f:
+        json.dump(status, f, indent=2)
+
+    return filename
+
+def safe_file_cleanup(path: str):
+    """Safely remove files or directories"""
+    try:
+        if os.path.isfile(path):
+            os.remove(path)
+        elif os.path.isdir(path):
+            shutil.rmtree(path)
+    except Exception as e:
+        print(f"Failed to cleanup {path}: {str(e)}")
+
+def disk_space_check(min_free_gb: float = 1.0) -> bool:
+    """Check if sufficient disk space is available"""
+    try:
+        free_bytes = shutil.disk_usage(".").free
+        free_gb = free_bytes / (1024 ** 3)
+        return free_gb >= min_free_gb
+    except Exception:
+        return True  # Assume OK if check fails
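A short usage sketch for these helpers, not part of the packaged code; the checkpoint payload and prefix shown are hypothetical.

    from rust_crate_pipeline.utils.file_utils import (
        create_output_dir, save_checkpoint, safe_file_cleanup, disk_space_check
    )

    # Only proceed when at least 0.5 GB is free in the working directory.
    if disk_space_check(min_free_gb=0.5):
        out_dir = create_output_dir("crate_data")
        path = save_checkpoint([{"name": "serde", "downloads": 1}], "demo", out_dir)
        print(f"Checkpoint written to {path}")
        safe_file_cleanup(out_dir)  # remove the demo directory and its contents again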