rust-crate-pipeline 1.1.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,307 @@
+ # network.py
+ import os
+ import re
+ import time
+ import logging
+ import requests
+ from requests_cache import CachedSession
+ from bs4 import BeautifulSoup
+ from typing import Dict, List, Optional
+ from .config import PipelineConfig
+
+ class GitHubBatchClient:
+     def __init__(self, config: PipelineConfig):
+         self.config = config
+         self.headers = {"Accept": "application/vnd.github.v3+json"}
+         if config.github_token:
+             self.headers["Authorization"] = f"token {config.github_token}"
+
+         self.session = CachedSession(
+             'github_cache',
+             expire_after=config.cache_ttl * 2  # Longer cache for GitHub
+         )
+         self.remaining_calls = 5000
+         self.reset_time = 0
+
+     def check_rate_limit(self):
+         """Check and update current rate limit status"""
+         try:
+             response = self.session.get("https://api.github.com/rate_limit", headers=self.headers)
+             if response.ok:
+                 data = response.json()
+                 self.remaining_calls = data["resources"]["core"]["remaining"]
+                 self.reset_time = data["resources"]["core"]["reset"]
+
+                 if self.remaining_calls < 100:
+                     reset_in = self.reset_time - time.time()
+                     logging.warning(f"GitHub API rate limit low: {self.remaining_calls} remaining. Resets in {reset_in/60:.1f} minutes")
+         except Exception as e:
+             logging.debug(f"Rate limit check failed: {e}")  # Best-effort check; don't fail the pipeline
+
+     def get_repo_stats(self, owner: str, repo: str) -> Dict:
+         """Get repository statistics"""
+         try:
+             url = f"https://api.github.com/repos/{owner}/{repo}"
+             response = self.session.get(url, headers=self.headers)
+             if response.ok:
+                 return response.json()
+             else:
+                 logging.warning(f"Failed to get repo stats for {owner}/{repo}: {response.status_code}")
+                 return {}
+         except Exception as e:
+             logging.error(f"Error fetching repo stats: {str(e)}")
+             return {}
+
+     def batch_get_repo_stats(self, repo_list: List[str]) -> Dict[str, Dict]:
+         """Get statistics for multiple repositories in a batch"""
+         self.check_rate_limit()
+
+         results = {}
+         for repo_url in repo_list:
+             # Extract owner/repo from URL
+             match = re.search(r"github\.com/([^/]+)/([^/\.]+)", repo_url)
+             if not match:
+                 continue
+
+             owner, repo = match.groups()
+             repo = repo.split('.')[0]  # Remove .git extension if present
+
+             # Get stats
+             stats = self.get_repo_stats(owner, repo)
+             results[repo_url] = stats
+
+             # Be nice to GitHub API
+             time.sleep(0.1)
+
+         return results
+
+ class CrateAPIClient:
+     def __init__(self, config: PipelineConfig):
+         self.config = config
+         self.session = CachedSession('crate_cache', expire_after=config.cache_ttl)
+
+     def fetch_crate_metadata(self, crate_name: str) -> Optional[Dict]:
+         """Fetch metadata with retry logic"""
+         for attempt in range(self.config.max_retries):
+             try:
+                 return self._fetch_metadata(crate_name)
+             except Exception as e:
+                 logging.warning(f"Attempt {attempt+1} failed for {crate_name}: {str(e)}")
+                 wait = 2 ** attempt
+                 time.sleep(wait)
+         return None
+
+     def _fetch_metadata(self, crate_name: str) -> Optional[Dict]:
+         """Enhanced metadata fetching that tries multiple sources"""
+         # First try crates.io (primary source)
+         try:
+             r = self.session.get(f"https://crates.io/api/v1/crates/{crate_name}")
+             if r.ok:
+                 data = r.json()
+                 crate_data = data["crate"]
+                 latest = crate_data["newest_version"]
+
+                 # Get readme
+                 readme_response = self.session.get(f"https://crates.io/api/v1/crates/{crate_name}/readme")
+                 readme = readme_response.text if readme_response.ok else ""
+
+                 # Get dependencies
+                 deps_response = self.session.get(f"https://crates.io/api/v1/crates/{crate_name}/{latest}/dependencies")
+                 deps = deps_response.json().get("dependencies", []) if deps_response.ok else []
+
+                 # Get features - using the versions endpoint
+                 features = []
+                 versions_response = self.session.get(f"https://crates.io/api/v1/crates/{crate_name}/{latest}")
+                 if versions_response.ok:
+                     version_data = versions_response.json().get("version", {})
+                     features_dict = version_data.get("features", {})
+                     features = [{"name": k, "dependencies": v} for k, v in features_dict.items()]
+
+                 # Repository info and GitHub stars
+                 repo = crate_data.get("repository", "")
+                 gh_stars = 0
+
+                 # Check if it's a GitHub repo
+                 if "github.com" in repo and self.config.github_token:
+                     match = re.search(r"github\.com/([^/]+)/([^/]+)", repo)
+                     if match:
+                         owner, repo_name = match.groups()
+                         repo_name = repo_name.split('.')[0]  # Handle .git extensions
+                         gh_url = f"https://api.github.com/repos/{owner}/{repo_name}"
+                         gh_headers = {"Authorization": f"token {self.config.github_token}"} if self.config.github_token else {}
+                         gh = self.session.get(gh_url, headers=gh_headers)
+                         if gh.ok:
+                             gh_data = gh.json()
+                             gh_stars = gh_data.get("stargazers_count", 0)
+
+                 # Check if it's hosted on lib.rs
+                 lib_rs_data = {}
+                 if "lib.rs" in repo:
+                     lib_rs_url = f"https://lib.rs/crates/{crate_name}"
+                     lib_rs_response = self.session.get(lib_rs_url)
+                     if lib_rs_response.ok:
+                         soup = BeautifulSoup(lib_rs_response.text, 'html.parser')
+                         # Get README from lib.rs if not already available
+                         if not readme:
+                             readme_div = soup.find('div', class_='readme')
+                             if readme_div:
+                                 readme = readme_div.get_text(strip=True)
+
+                         # Get lib.rs specific stats
+                         stats_div = soup.find('div', class_='crate-stats')
+                         if stats_div:
+                             downloads_text = stats_div.find(string=re.compile(r'[\d,]+ downloads'))
+                             if downloads_text:
+                                 lib_rs_data["librs_downloads"] = int(re.sub(r'[^\d]', '', downloads_text))
+
+                 # Extract code snippets from readme
+                 code_snippets = self.extract_code_snippets(readme)
+
+                 # Extract sections from readme
+                 readme_sections = self.extract_readme_sections(readme) if readme else {}
+
+                 result = {
+                     "name": crate_name,
+                     "version": latest,
+                     "description": crate_data.get("description", ""),
+                     "repository": repo,
+                     "keywords": crate_data.get("keywords", []),
+                     "categories": crate_data.get("categories", []),
+                     "readme": readme,
+                     "downloads": crate_data.get("downloads", 0),
+                     "github_stars": gh_stars,
+                     "dependencies": deps,
+                     "code_snippets": code_snippets,
+                     "features": features,
+                     "readme_sections": readme_sections,
+                     **lib_rs_data
+                 }
+
+                 return result
+
+         except Exception as e:
+             logging.error(f"Failed fetching metadata for {crate_name}: {str(e)}")
+             raise
+
+         # If crates.io fails, try lib.rs
+         try:
+             r = self.session.get(f"https://lib.rs/crates/{crate_name}")
+             if r.ok:
+                 soup = BeautifulSoup(r.text, 'html.parser')
+
+                 # Extract metadata from lib.rs page
+                 name = soup.select_one('h1').text.strip() if soup.select_one('h1') else crate_name
+
+                 # Find description
+                 desc_elem = soup.select_one('.description')
+                 description = desc_elem.text.strip() if desc_elem else ""
+
+                 # Find repository link
+                 repo_link = None
+                 for a in soup.select('a'):
+                     if 'github.com' in a.get('href', ''):
+                         repo_link = a['href']
+                         break
+
+                 # Basic metadata from lib.rs
+                 return {
+                     "name": name,
+                     "version": "latest",  # lib.rs doesn't easily expose version
+                     "description": description,
+                     "repository": repo_link or "",
+                     "keywords": [],
+                     "categories": [],
+                     "readme": "",
+                     "downloads": 0,
+                     "github_stars": 0,
+                     "dependencies": [],
+                     "code_snippets": [],
+                     "features": [],
+                     "readme_sections": {},
+                     "source": "lib.rs",
+                 }
+         except Exception:
+             pass
+
+         # Finally, try GitHub search
+         try:
+             # This is a simplification - GitHub's search API requires authentication
+             headers = {}
+             if self.config.github_token:
+                 headers["Authorization"] = f"token {self.config.github_token}"
+
+             search_url = f"https://api.github.com/search/repositories?q={crate_name}+language:rust"
+             r = requests.get(search_url, headers=headers)
+
+             if r.ok:
+                 results = r.json().get("items", [])
+                 if results:
+                     repo = results[0]  # Take first match
+
+                     # Basic metadata from GitHub
+                     return {
+                         "name": crate_name,
+                         "version": "unknown",
+                         "description": repo.get("description", ""),
+                         "repository": repo.get("html_url", ""),
+                         "keywords": [],
+                         "categories": [],
+                         "readme": "",
+                         "downloads": 0,
+                         "github_stars": repo.get("stargazers_count", 0),
+                         "dependencies": [],
+                         "code_snippets": [],
+                         "features": [],
+                         "readme_sections": {},
+                         "source": "github",
+                     }
+         except Exception:
+             pass
+
+         # If all sources fail
+         return None
+
+     def extract_code_snippets(self, readme: str) -> List[str]:
+         """Extract code snippets from markdown README"""
+         snippets = []
+         if not readme:
+             return snippets
+
+         # Find Rust code blocks
+         pattern = r"```(?:rust|(?:no_run|ignore|compile_fail|mdbook-runnable)?)\s*([\s\S]*?)```"
+         matches = re.findall(pattern, readme)
+
+         for code in matches:
+             if len(code.strip()) > 10:  # Only include non-trivial snippets
+                 snippets.append(code.strip())
+
+         return snippets[:5]  # Limit to 5 snippets
+
+     def extract_readme_sections(self, readme: str) -> Dict[str, str]:
+         """Extract sections from README based on markdown headers"""
+         if not readme:
+             return {}
+
+         sections = {}
+         lines = readme.split('\n')
+         current_section = ""
+         current_content = []
+
+         for line in lines:
+             if re.match(r'^#+\s+', line):  # It's a header
+                 # Save previous section
+                 if current_section and current_content:
+                     sections[current_section] = '\n'.join(current_content).strip()
+
+                 # Start new section
+                 current_section = re.sub(r'^#+\s+', '', line).strip()
+                 current_content = []
+             else:
+                 if current_section:  # Only collect content if we have a section
+                     current_content.append(line)
+
+         # Don't forget the last section
+         if current_section and current_content:
+             sections[current_section] = '\n'.join(current_content).strip()
+
+         return sections
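
For orientation, here is a minimal usage sketch of the two clients defined in network.py. It is illustrative only: config.py is not part of this diff, so the assumption that `PipelineConfig` can be constructed with the fields the code above reads (`github_token`, `cache_ttl`, `max_retries`) is mine, and the module paths are inferred from the file headers.

```python
# Illustrative sketch, not part of the package. Assumes PipelineConfig accepts
# the fields network.py reads: github_token, cache_ttl, max_retries.
from rust_crate_pipeline.config import PipelineConfig
from rust_crate_pipeline.network import CrateAPIClient, GitHubBatchClient

config = PipelineConfig(github_token="<token>", cache_ttl=3600, max_retries=3)

# Single-crate metadata using the crates.io -> lib.rs -> GitHub fallback chain
client = CrateAPIClient(config)
meta = client.fetch_crate_metadata("serde")
if meta:
    print(meta["name"], meta["version"], meta["github_stars"])

    # Batched GitHub stats for the repository discovered in the metadata
    gh = GitHubBatchClient(config)
    stats = gh.batch_get_repo_stats([meta["repository"]])
    print(stats.get(meta["repository"], {}).get("stargazers_count"))
```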
@@ -0,0 +1,260 @@
+ # pipeline.py
+ import os
+ import time
+ import logging
+ import json
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from tqdm import tqdm
+ from typing import List, Dict, Optional
+ from .config import PipelineConfig, CrateMetadata, EnrichedCrate
+ from .network import CrateAPIClient, GitHubBatchClient
+ from .ai_processing import LLMEnricher
+ from .analysis import SourceAnalyzer, SecurityAnalyzer, UserBehaviorAnalyzer, DependencyAnalyzer
+
+ class CrateDataPipeline:
+     def __init__(self, config: PipelineConfig):
+         self.config = config
+         self.api_client = CrateAPIClient(config)
+         self.github_client = GitHubBatchClient(config)
+         self.enricher = LLMEnricher(config)
+         self.crates = self.get_crate_list()
+         self.output_dir = self._create_output_dir()
+
+     def _create_output_dir(self) -> str:
+         timestamp = time.strftime("%Y%m%d-%H%M%S")
+         output_dir = f"crate_data_{timestamp}"
+         os.makedirs(output_dir, exist_ok=True)
+         return output_dir
+
+     def get_crate_list(self, limit: Optional[int] = None) -> List[str]:
+         """Return a comprehensive list of all high-value crates to process"""
+         crates = [
+             # Web frameworks
+             "actix-web", "rocket", "axum", "warp", "tower",
+
+             # Async runtimes and utilities
+             "tokio", "tokio-stream", "async-trait", "futures",
+
+             # Serialization/deserialization
+             "serde", "serde_json", "serde_yaml", "bincode",
+
+             # Error handling
+             "anyhow", "thiserror",
+
+             # Utilities
+             "rand", "uuid", "chrono", "regex", "log", "env_logger", "clap", "crossterm",
+             "itertools", "num", "cfg-if", "bytes", "mime", "form_urlencoded", "parking_lot",
+             "csv", "lazy_static", "once_cell", "tracing", "base64", "sha2", "flate2", "tar",
+
+             # HTTP clients and servers
+             "reqwest", "hyper",
+
+             # Database
+             "sqlx", "diesel", "postgres", "rusqlite",
+
+             # Concurrency
+             "rayon",
+
+             # Protocol buffers and gRPC
+             "prost", "tonic",
+
+             # Procedural macros
+             "syn", "quote", "proc-macro2",
+
+             # Machine Learning & AI
+             "tokenizers", "safetensors", "linfa", "ndarray", "smartcore", "burn",
+             "tract-core", "tract-onnx", "tract-hir", "tract-linalg", "tract-data",
+             "tract-nnef", "tract-onnx-opl", "tract-pulse", "tract-pulse-opl",
+             "tract-nnef-resources", "tch", "torch-sys", "ort", "ort-sys", "candle-core",
+             "candle-nn", "candle-transformers", "candle-kernels", "candle-onnx",
+             "candle-metal-kernels", "tiktoken-rs", "tensorflow", "tensorflow-sys",
+             "onnxruntime", "onnxruntime-sys", "onnx-protobuf", "llama-cpp-2",
+             "llama-cpp-sys-2", "llm", "llm-samplers", "llm-chain", "llm-chain-openai",
+             "llama-core", "llamaedge", "openai", "openai-api-rs", "openai_dive",
+             "genai", "aleph-alpha-client", "llm_api_access", "ollama-rs",
+             "rust-bert", "fastembed", "hf-hub", "whisper-rs-sys", "toktrie",
+             "toktrie_hf_tokenizers", "toktrie_hf_downloader", "rust_tokenizers",
+         ]
+
+         if limit is not None:
+             return crates[:limit]
+         return crates
+
+     def fetch_metadata_batch(self, crate_names: List[str]) -> List[CrateMetadata]:
+         """Fetch metadata for a batch of crates in parallel"""
+         with ThreadPoolExecutor(max_workers=self.config.n_workers) as executor:
+             futures = {executor.submit(self.api_client.fetch_crate_metadata, name): name
+                        for name in crate_names}
+
+             results = []
+             for future in as_completed(futures):
+                 crate_name = futures[future]
+                 try:
+                     data = future.result()
+                     if data:
+                         # Convert dict to CrateMetadata
+                         crate_metadata = CrateMetadata(
+                             name=data.get("name", ""),
+                             version=data.get("version", ""),
+                             description=data.get("description", ""),
+                             repository=data.get("repository", ""),
+                             keywords=data.get("keywords", []),
+                             categories=data.get("categories", []),
+                             readme=data.get("readme", ""),
+                             downloads=data.get("downloads", 0),
+                             github_stars=data.get("github_stars", 0),
+                             dependencies=data.get("dependencies", []),
+                             features=data.get("features", []),
+                             code_snippets=data.get("code_snippets", []),
+                             readme_sections=data.get("readme_sections", {}),
+                             librs_downloads=data.get("librs_downloads"),
+                             source=data.get("source", "crates.io")
+                         )
+                         results.append(crate_metadata)
+                         logging.info(f"Fetched metadata for {crate_name}")
+                 except Exception as e:
+                     logging.error(f"Failed to fetch metadata for {crate_name}: {str(e)}")
+
+         return results
+
+     def enrich_batch(self, batch: List[CrateMetadata]) -> List[EnrichedCrate]:
+         """Enrich a batch of crates with GitHub stats and AI"""
+         # Add GitHub stats first
+         github_repos = [c.repository for c in batch if "github.com" in c.repository]
+         repo_stats = self.github_client.batch_get_repo_stats(github_repos)
+
+         # Update crates with GitHub info
+         for crate in batch:
+             repo_url = crate.repository
+             if repo_url in repo_stats:
+                 stats = repo_stats[repo_url]
+                 crate.github_stars = stats.get("stargazers_count", 0)
+
+         # Now enrich with AI
+         enriched_batch = []
+         for crate in batch:
+             try:
+                 enriched = self.enricher.enrich_crate(crate)
+                 enriched_batch.append(enriched)
+                 logging.info(f"Enriched {crate.name}")
+             except Exception as e:
+                 logging.error(f"Failed to enrich {crate.name}: {str(e)}")
+                 # Add the crate with just the fields we have
+                 enriched_dict = crate.__dict__.copy()
+                 enriched_batch.append(EnrichedCrate(**enriched_dict))
+
+         return enriched_batch
+
+     def analyze_dependencies(self, crates: List[EnrichedCrate]) -> Dict:
+         """Analyze dependencies between crates"""
+         return DependencyAnalyzer.analyze_dependencies(crates)
+
+     def save_checkpoint(self, data: List[EnrichedCrate], prefix: str):
+         """Save processing checkpoint with status metadata"""
+         timestamp = time.strftime("%Y%m%d-%H%M%S")
+         filename = os.path.join(self.output_dir, f"{prefix}_{timestamp}.jsonl")
+
+         with open(filename, "w") as f:
+             for item in data:
+                 # Convert to dict for serialization
+                 item_dict = item.__dict__.copy()
+                 f.write(json.dumps(item_dict) + "\n")
+
+         # Save status metadata
+         status = {
+             "timestamp": timestamp,
+             "total_crates": len(data),
+             "processed_crates": sum(1 for c in data if c.use_case is not None),
+             "advanced_analysis": sum(1 for c in data if c.source_analysis is not None),
+             "checkpoint_file": filename
+         }
+
+         status_file = os.path.join(self.output_dir, f"{prefix}_status_{timestamp}.json")
+         with open(status_file, "w") as f:
+             json.dump(status, f, indent=2)
+
+         logging.info(f"Saved checkpoint to {filename}")
+         return filename
+
+     def save_final_output(self, data: List[EnrichedCrate], dependency_data: Dict):
+         """Save final enriched data and analysis"""
+         timestamp = time.strftime("%Y%m%d-%H%M%S")
+
+         # Save main enriched data
+         final_output = os.path.join(self.output_dir, f"enriched_crate_metadata_{timestamp}.jsonl")
+         with open(final_output, "w") as f:
+             for item in data:
+                 item_dict = item.__dict__.copy()
+                 f.write(json.dumps(item_dict) + "\n")
+
+         # Save dependency analysis
+         dep_file = os.path.join(self.output_dir, f"dependency_analysis_{timestamp}.json")
+         with open(dep_file, "w") as f:
+             json.dump(dependency_data, f, indent=2)
+
+         # Generate summary report
+         summary = {
+             "total_crates": len(data),
+             "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
+             "most_popular": sorted([{
+                 "name": c.name,
+                 "score": c.score or 0,
+                 "downloads": c.downloads,
+                 "github_stars": c.github_stars
+             } for c in data], key=lambda x: x["score"], reverse=True)[:5],
+             "most_depended_upon": dependency_data.get("most_depended", [])[:5]
+         }
+
+         summary_file = os.path.join(self.output_dir, f"summary_report_{timestamp}.json")
+         with open(summary_file, "w") as f:
+             json.dump(summary, f, indent=2)
+
+         logging.info(f"Results saved to {self.output_dir}/")
+
+     def run(self):
+         """Main pipeline execution flow"""
+         start_time = time.time()
+         logging.info(f"Processing {len(self.crates)} crates...")
+
+         # Process in batches
+         all_enriched = []
+         crate_batches = [self.crates[i:i+self.config.batch_size]
+                          for i in range(0, len(self.crates), self.config.batch_size)]
+
+         for batch_num, batch in enumerate(crate_batches):
+             logging.info(f"Processing batch {batch_num+1}/{len(crate_batches)} ({len(batch)} crates)")
+
+             # Fetch metadata
+             batch_data = self.fetch_metadata_batch(batch)
+
+             # Enrich the batch
+             enriched_batch = self.enrich_batch(batch_data)
+             all_enriched.extend(enriched_batch)
+
+             # Save checkpoint after each batch
+             self.save_checkpoint(all_enriched, "batch_checkpoint")
+             logging.info(f"Completed batch {batch_num+1}, processed {len(all_enriched)}/{len(self.crates)} crates so far")
+
+             # Optional: Add source analysis for some crates
+             if batch_num < 2:  # Only do detailed analysis for first 2 batches
+                 for crate in enriched_batch:
+                     try:
+                         crate.source_analysis = SourceAnalyzer.analyze_crate_source(crate)
+                         crate.security = SecurityAnalyzer.check_security_metrics(crate)
+                         crate.user_behavior = UserBehaviorAnalyzer.fetch_user_behavior_data(crate)
+                         logging.info(f"Advanced analysis completed for {crate.name}")
+                     except Exception as e:
+                         logging.warning(f"Advanced analysis failed for {crate.name}: {str(e)}")
+
+         # Perform dependency analysis across all enriched crates
+         logging.info("Analyzing crate dependencies...")
+         dependency_analysis = self.analyze_dependencies(all_enriched)
+
+         # Save final results
+         self.save_final_output(all_enriched, dependency_analysis)
+
+         # Final summary
+         duration = time.time() - start_time
+         logging.info(f"✅ Done. Enriched {len(all_enriched)} crates in {duration:.2f}s")
+
+         return all_enriched, dependency_analysis
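
A corresponding end-to-end sketch for the pipeline class above. Again illustrative: `PipelineConfig`'s real signature lives in config.py, which is not shown in this diff, so the keyword arguments below simply name the fields that pipeline.py and network.py read.

```python
# Illustrative sketch, not part of the package; constructor keywords are assumed
# from the attributes the pipeline reads (batch_size, n_workers, cache_ttl, ...).
import logging
from rust_crate_pipeline.config import PipelineConfig
from rust_crate_pipeline.pipeline import CrateDataPipeline

logging.basicConfig(level=logging.INFO)

config = PipelineConfig(
    github_token="<token>",   # GitHub stats and search fallback
    cache_ttl=3600,           # seconds; HTTP responses are cached via requests-cache
    max_retries=3,            # crates.io retries with exponential backoff
    n_workers=4,              # parallel metadata fetches per batch
    batch_size=10,            # crates per batch between checkpoints
)

pipeline = CrateDataPipeline(config)
enriched, dependency_analysis = pipeline.run()
print(f"Enriched {len(enriched)} crates; results written to {pipeline.output_dir}/")
```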
@@ -0,0 +1,72 @@
+ # rust_crate_pipeline/utils/file_utils.py
+ import os
+ import json
+ import shutil
+ from datetime import datetime
+ from typing import List, Dict, Any
+
+ def create_output_dir(base_name: str = "crate_data") -> str:
+     """
+     Create timestamped output directory
+
+     Args:
+         base_name: Base name for output directory
+
+     Returns:
+         Path to created directory
+     """
+     timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+     output_dir = f"{base_name}_{timestamp}"
+     os.makedirs(output_dir, exist_ok=True)
+     return output_dir
+
+ def save_checkpoint(data: List[Dict], prefix: str, output_dir: str) -> str:
+     """
+     Save processing checkpoint with status metadata
+
+     Args:
+         data: List of crate dictionaries
+         prefix: File name prefix
+         output_dir: Target directory
+
+     Returns:
+         Path to saved checkpoint file
+     """
+     timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")  # filesystem-safe (isoformat() contains ':')
+     filename = os.path.join(output_dir, f"{prefix}_{timestamp}.jsonl")
+
+     with open(filename, "w") as f:
+         for item in data:
+             f.write(json.dumps(item) + "\n")
+
+     # Save status metadata
+     status = {
+         "timestamp": timestamp,
+         "total_items": len(data),
+         "checkpoint_file": filename
+     }
+
+     status_file = os.path.join(output_dir, f"{prefix}_status_{timestamp}.json")
+     with open(status_file, "w") as f:
+         json.dump(status, f, indent=2)
+
+     return filename
+
+ def safe_file_cleanup(path: str):
+     """Safely remove files or directories"""
+     try:
+         if os.path.isfile(path):
+             os.remove(path)
+         elif os.path.isdir(path):
+             shutil.rmtree(path)
+     except Exception as e:
+         print(f"Failed to cleanup {path}: {str(e)}")
+
+ def disk_space_check(min_free_gb: float = 1.0) -> bool:
+     """Check if sufficient disk space is available"""
+     try:
+         free_bytes = shutil.disk_usage(".").free
+         free_gb = free_bytes / (1024 ** 3)
+         return free_gb >= min_free_gb
+     except Exception:
+         return True  # Assume OK if check fails
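
Finally, a small standalone example of the helpers above; the caller is hypothetical and not shipped with the package, with the module path taken from the file's own header comment.

```python
# Hypothetical caller for the utilities above; the import path follows the
# header comment rust_crate_pipeline/utils/file_utils.py.
from rust_crate_pipeline.utils.file_utils import (
    create_output_dir,
    save_checkpoint,
    disk_space_check,
    safe_file_cleanup,
)

if disk_space_check(min_free_gb=1.0):
    out_dir = create_output_dir("crate_data")
    checkpoint = save_checkpoint([{"name": "serde", "downloads": 1}], "demo", out_dir)
    print(f"Checkpoint written to {checkpoint}")
    # Clean up once the data has been copied elsewhere
    safe_file_cleanup(out_dir)
```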