rust-crate-pipeline 1.4.0-py3-none-any.whl → 1.4.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. rust_crate_pipeline/__init__.py +18 -27
  2. rust_crate_pipeline/__main__.py +1 -0
  3. rust_crate_pipeline/ai_processing.py +718 -596
  4. rust_crate_pipeline/analysis.py +330 -363
  5. rust_crate_pipeline/azure_ai_processing.py +462 -0
  6. rust_crate_pipeline/config.py +46 -28
  7. rust_crate_pipeline/core/__init__.py +19 -0
  8. rust_crate_pipeline/core/canon_registry.py +133 -0
  9. rust_crate_pipeline/core/irl_engine.py +256 -0
  10. rust_crate_pipeline/core/sacred_chain.py +117 -0
  11. rust_crate_pipeline/crate_analysis.py +54 -0
  12. rust_crate_pipeline/crate_list.txt +424 -0
  13. rust_crate_pipeline/github_token_checker.py +108 -112
  14. rust_crate_pipeline/main.py +329 -109
  15. rust_crate_pipeline/network.py +317 -308
  16. rust_crate_pipeline/pipeline.py +300 -375
  17. rust_crate_pipeline/production_config.py +24 -27
  18. rust_crate_pipeline/progress_monitor.py +334 -0
  19. rust_crate_pipeline/scraping/__init__.py +13 -0
  20. rust_crate_pipeline/scraping/unified_scraper.py +259 -0
  21. rust_crate_pipeline/unified_llm_processor.py +637 -0
  22. rust_crate_pipeline/unified_pipeline.py +548 -0
  23. rust_crate_pipeline/utils/file_utils.py +32 -5
  24. rust_crate_pipeline/utils/logging_utils.py +21 -16
  25. rust_crate_pipeline/version.py +76 -47
  26. rust_crate_pipeline-1.4.1.dist-info/METADATA +515 -0
  27. rust_crate_pipeline-1.4.1.dist-info/RECORD +31 -0
  28. rust_crate_pipeline-1.4.0.dist-info/METADATA +0 -585
  29. rust_crate_pipeline-1.4.0.dist-info/RECORD +0 -19
  30. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/WHEEL +0 -0
  31. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/entry_points.txt +0 -0
  32. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/licenses/LICENSE +0 -0
  33. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/top_level.txt +0 -0
@@ -1,447 +1,414 @@
  # analysis.py
+ from __future__ import annotations
+
+ import io
+ import re
+ import tarfile
+ import requests
+ import logging
+ import tempfile
+ from typing import Any, Dict, List, Optional, Union
  import os
  import sys
- import re
- import io
  import time
- import tarfile
  import subprocess
- import requests
- from bs4 import BeautifulSoup
-
- # Import utilities with fallback
- try:
-     # Add the parent directory to the path to import utils
-     sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
-     from utils.rust_code_analyzer import RustCodeAnalyzer
- except ImportError:
-     # Fallback implementation for when utils are not available
-     class RustCodeAnalyzer:
-         def __init__(self, code_content):
-             self.code_content = code_content
-
-         def analyze(self):
-             return {
-                 "functions": [],
-                 "structs": [],
-                 "enums": [],
-                 "traits": [],
-                 "complexity": 0,
-                 "lines_of_code": len(self.code_content.split('\n'))
-             }
- from typing import Dict, List
+ from dataclasses import dataclass
+
  from .config import EnrichedCrate

- # Import atomic utilities for code reuse
- import sys
- sys.path.append(os.path.dirname(os.path.dirname(__file__)))
+ # Create a fallback RustCodeAnalyzer that doesn't depend on external utils
+ class RustCodeAnalyzer:
+     """Fallback Rust code analyzer for when the full analyzer is not available."""
+
+     def __init__(self, code_content: str) -> None:
+         self.code_content = code_content

+     def analyze(self) -> dict[str, Any]:
+         """Basic analysis of Rust code content."""
+         lines = self.code_content.split('\n')
+         return {
+             "functions": self._count_functions(),
+             "structs": self._count_structs(),
+             "enums": self._count_enums(),
+             "traits": self._count_traits(),
+             "complexity": self._calculate_complexity(),
+             "lines_of_code": len(lines),
+         }

- class SourceAnalyzer:
-     @staticmethod
-     def analyze_crate_source(crate: EnrichedCrate) -> Dict:
-         """Orchestrate source analysis from multiple sources"""
-         crate_name = crate.name
-         version = crate.version
-         repo_url = crate.repository
+     def _count_functions(self) -> int:
+         """Count function definitions."""
+         return len(re.findall(r'fn\s+\w+\s*\(', self.code_content))

-         # Method 1: Try to download from crates.io
-         try:
-             url = f"https://crates.io/api/v1/crates/{crate_name}/{version}/download"
-             response = requests.get(url, stream=True)
+     def _count_structs(self) -> int:
+         """Count struct definitions."""
+         return len(re.findall(r'struct\s+\w+', self.code_content))

-             if response.ok:
-                 # We got the tarball, analyze it
-                 return SourceAnalyzer.analyze_crate_tarball(response.content)
-         except Exception as e:
-             print(f"Failed to download from crates.io: {str(e)}")
+     def _count_enums(self) -> int:
+         """Count enum definitions."""
+         return len(re.findall(r'enum\s+\w+', self.code_content))

-         # Method 2: Try GitHub if we have a GitHub URL
-         if "github.com" in repo_url:
-             try:
-                 # Extract owner/repo from URL
-                 match = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
-                 if match:
-                     owner, repo_name = match.groups()
-                     repo_name = repo_name.split(
-                         '.')[0]  # Remove .git extension
-
-                     # Try to download tarball from GitHub
-                     github_url = f"https://api.github.com/repos/{owner}/{repo_name}/tarball"
-                     response = requests.get(github_url)
-
-                     if response.ok:
-                         return SourceAnalyzer.analyze_github_tarball(
-                             response.content)
-             except Exception as e:
-                 print(f"Failed to analyze from GitHub: {str(e)}")
+     def _count_traits(self) -> int:
+         """Count trait definitions."""
+         return len(re.findall(r'trait\s+\w+', self.code_content))

-         # Method 3: Try lib.rs
-         try:
-             # lib.rs doesn't have a direct download API, but redirects to crates.io or
-             # GitHub
-             url = f"https://lib.rs/crates/{crate_name}"
-             response = requests.get(url)
-
-             if response.ok:
-                 soup = BeautifulSoup(response.text, 'html.parser')
-
-                 # Look for repository links
-                 repo_links = soup.select('a[href*="github.com"]')
-                 if repo_links:
-                     repo_url = repo_links[0]['href']
-
-                     # We found a GitHub link, now analyze it
-                     return SourceAnalyzer.analyze_crate_source_from_repo(
-                         crate_name, version, repo_url)
-         except Exception as e:
-             print(f"Failed to analyze from lib.rs: {str(e)}")
+     def _calculate_complexity(self) -> int:
+         """Calculate basic cyclomatic complexity."""
+         complexity = 0
+         complexity += len(re.findall(r'\bif\b', self.code_content))
+         complexity += len(re.findall(r'\bfor\b', self.code_content))
+         complexity += len(re.findall(r'\bwhile\b', self.code_content))
+         complexity += len(re.findall(r'\bmatch\b', self.code_content))
+         return complexity

-         # If we get here, we failed to analyze from any source
+     @staticmethod
+     def create_empty_metrics() -> dict[str, Any]:
+         """Create empty metrics structure."""
          return {
-             "error": "Could not analyze crate from any source",
-             "attempted_sources": ["crates.io", "github", "lib.rs"],
+             "functions": 0,
+             "structs": 0,
+             "enums": 0,
+             "traits": 0,
+             "complexity": 0,
+             "lines_of_code": 0,
              "file_count": 0,
-             "loc": 0
-         } @ staticmethod
+         }

-     def analyze_crate_tarball(content: bytes) -> Dict:
-         """Analyze a .crate tarball from crates.io - refactored to use atomic utilities"""
-         metrics = RustCodeAnalyzer.create_empty_metrics()
+     @staticmethod
+     def detect_project_structure(files: list[str]) -> dict[str, bool]:
+         """Detect basic project structure."""
+         return {
+             "has_cargo_toml": any("Cargo.toml" in f for f in files),
+             "has_src": any("/src/" in f for f in files),
+             "has_tests": any("/tests/" in f for f in files),
+             "has_examples": any("/examples/" in f for f in files),
+         }

-         try:
-             # Open the tar file from the content
-             tar_content = io.BytesIO(content)
-             with tarfile.open(fileobj=tar_content, mode='r:gz') as tar:
-                 # Get list of Rust files
-                 rust_files = [f for f in tar.getnames() if f.endswith('.rs')]
-                 metrics["file_count"] = len(rust_files)
+     @staticmethod
+     def analyze_rust_content(content: str) -> dict[str, Any]:
+         """Analyze Rust content."""
+         analyzer = RustCodeAnalyzer(content)
+         return analyzer.analyze()

-                 # Check for test/example/bench directories using atomic utility
-                 all_files = tar.getnames()
-                 structure = RustCodeAnalyzer.detect_project_structure(
-                     all_files)
-
-                 # Analyze each Rust file using atomic utility
-                 for filename in rust_files:
-                     try:
-                         member = tar.getmember(filename)
-                         if member.isfile():
-                             file_content = tar.extractfile(member)
-                             if file_content:
-                                 content_str = file_content.read().decode('utf-8', errors='ignore')
-
-                                 # Use atomic content analysis
-                                 content_analysis = RustCodeAnalyzer.analyze_rust_content(
-                                     content_str)
-                                 metrics = RustCodeAnalyzer.aggregate_metrics(
-                                     metrics, content_analysis, structure)
+     @staticmethod
+     def aggregate_metrics(
+         metrics: dict[str, Any],
+         content_analysis: dict[str, Any],
+         structure: dict[str, bool],
+     ) -> dict[str, Any]:
+         """Aggregate metrics from multiple sources."""
+         for key, value in content_analysis.items():
+             if isinstance(value, (int, float)):
+                 metrics[key] = metrics.get(key, 0) + value
+             elif isinstance(value, list):
+                 if key not in metrics:
+                     metrics[key] = []
+                 metrics[key].extend(value)
+
+         # Add structure information
+         metrics.update(structure)
+         return metrics

-                     except Exception as e:
-                         print(f"Error analyzing file {filename}: {str(e)}")

-         except Exception as e:
-             metrics["error"] = str(e)
+ # Constants for URLs and paths
+ CRATES_IO_API_URL = "https://crates.io/api/v1/crates"
+ GITHUB_API_URL = "https://api.github.com/repos"
+ LIB_RS_URL = "https://lib.rs/crates"

-         return metrics @ staticmethod

-     def analyze_github_tarball(content: bytes) -> Dict:
-         """Analyze a GitHub tarball - refactored to use atomic utilities"""
-         metrics = RustCodeAnalyzer.create_empty_metrics()
+ class SourceAnalyzer:
+     @staticmethod
+     def analyze_crate_source(crate: EnrichedCrate) -> dict[str, Any]:
+         """Orchestrate source analysis from multiple sources."""
+         repo_url = crate.repository

+         # Method 1: Try to download from crates.io
          try:
-             # GitHub tarballs are typically gzipped tar files
-             tar_content = io.BytesIO(content)
-             with tarfile.open(fileobj=tar_content, mode='r:gz') as tar:
-                 # GitHub tarballs include the repo name and commit as the top dir
-                 # So we need to handle the different structure
-                 rust_files = [f for f in tar.getnames() if f.endswith('.rs')]
-                 metrics["file_count"] = len(rust_files)
+             url = f"{CRATES_IO_API_URL}/{crate.name}/{crate.version}/download"
+             response = requests.get(url, stream=True, timeout=30)
+             response.raise_for_status()
+             logging.info(f"Successfully downloaded {crate.name} from crates.io")
+             return SourceAnalyzer.analyze_crate_tarball(response.content)
+         except requests.RequestException as e:
+             logging.warning(f"Failed to download from crates.io: {e}")

-                 # Check for test/example/bench directories using atomic utility
-                 all_files = tar.getnames()
-                 structure = RustCodeAnalyzer.detect_project_structure(
-                     all_files)
-
-                 # Analyze each Rust file using atomic utility (same as crate
-                 # tarball)
-                 for filename in rust_files:
-                     try:
-                         member = tar.getmember(filename)
-                         if member.isfile():
-                             file_content = tar.extractfile(member)
-                             if file_content:
-                                 content_str = file_content.read().decode('utf-8', errors='ignore')
-
-                                 # Use atomic content analysis
-                                 content_analysis = RustCodeAnalyzer.analyze_rust_content(
-                                     content_str)
-                                 metrics = RustCodeAnalyzer.aggregate_metrics(
-                                     metrics, content_analysis, structure)
+         # Method 2: Try GitHub if we have a GitHub URL
+         if repo_url and "github.com" in repo_url:
+             match = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
+             if match:
+                 owner, repo_name = match.groups()
+                 repo_name = repo_name.replace(".git", "")
+                 try:
+                     github_url = f"{GITHUB_API_URL}/{owner}/{repo_name}/tarball"
+                     response = requests.get(github_url, timeout=30)
+                     response.raise_for_status()
+                     logging.info(f"Successfully downloaded {crate.name} from GitHub")
+                     return SourceAnalyzer.analyze_github_tarball(response.content)
+                 except requests.RequestException as e:
+                     logging.warning(f"Failed to analyze from GitHub: {e}")
+
+         # Method 3: Fallback to cloning from the repository directly
+         if repo_url:
+             try:
+                 logging.info(f"Attempting to clone repository for {crate.name}")
+                 return SourceAnalyzer.analyze_crate_source_from_repo(repo_url)
+             except Exception as e:
+                 logging.error(f"Failed to clone and analyze repository {repo_url}: {e}")

-                     except Exception as e:
-                         print(f"Error analyzing file {filename}: {str(e)}")
+         return {
+             "error": "Could not analyze crate from any available source.",
+             "attempted_sources": ["crates.io", "github", "git_clone"],
+             "file_count": 0,
+             "loc": 0,
+         }

-         except Exception as e:
-             metrics["error"] = str(e)
+     @staticmethod
+     def _analyze_tarball_content(content: bytes) -> dict[str, Any]:
+         """Shared logic to analyze tarball content from any source."""
+         metrics = RustCodeAnalyzer.create_empty_metrics()
+         try:
+             with io.BytesIO(content) as tar_content, tarfile.open(
+                 fileobj=tar_content, mode="r:gz"
+             ) as tar:
+                 rust_files = [f for f in tar.getnames() if f.endswith(".rs")]
+                 metrics["file_count"] = len(rust_files)
+                 structure = RustCodeAnalyzer.detect_project_structure(tar.getnames())
+
+                 for member in tar.getmembers():
+                     if member.isfile() and member.name.endswith(".rs"):
+                         file_content = tar.extractfile(member)
+                         if file_content:
+                             try:
+                                 content_str = file_content.read().decode("utf-8")
+                                 analysis = RustCodeAnalyzer.analyze_rust_content(
+                                     content_str
+                                 )
+                                 metrics = RustCodeAnalyzer.aggregate_metrics(
+                                     metrics, analysis, structure
+                                 )
+                             except UnicodeDecodeError:
+                                 logging.warning(
+                                     f"Skipping non-UTF-8 file: {member.name}"
+                                 )
+         except tarfile.TarError as e:
+             metrics["error"] = f"Failed to read tarball: {e}"
+             logging.error(metrics["error"])
+         return metrics
+
+     @staticmethod
+     def analyze_crate_tarball(content: bytes) -> dict[str, Any]:
+         """Analyze a .crate tarball from crates.io."""
+         return SourceAnalyzer._analyze_tarball_content(content)

-         return metrics @ staticmethod
+     @staticmethod
+     def analyze_github_tarball(content: bytes) -> dict[str, Any]:
+         """Analyze a GitHub tarball."""
+         return SourceAnalyzer._analyze_tarball_content(content)

-     def analyze_local_directory(directory: str) -> Dict:
-         """Analyze source code from a local directory - refactored to use atomic utilities"""
+     @staticmethod
+     def analyze_local_directory(directory: str) -> dict[str, Any]:
+         """Analyze source code from a local directory."""
          metrics = RustCodeAnalyzer.create_empty_metrics()
-
          try:
-             # Find all Rust files
-             rust_files = []
-             for root, _, files in os.walk(directory):
-                 if "target" in root or ".git" in root:  # Skip build dirs and git
-                     continue
-                 rust_files.extend([os.path.join(root, f)
-                                    for f in files if f.endswith(".rs")])
+             rust_files: list[str] = []
+             all_paths: list[str] = []
+             for root, dirs, files in os.walk(directory):
+                 # Exclude target and .git directories
+                 dirs[:] = [d for d in dirs if d not in ["target", ".git"]]
+                 for file in files:
+                     full_path = os.path.join(root, file)
+                     all_paths.append(full_path)
+                     if file.endswith(".rs"):
+                         rust_files.append(full_path)

              metrics["file_count"] = len(rust_files)
+             structure = RustCodeAnalyzer.detect_project_structure(all_paths)

-             # Check if the crate has tests/examples/benchmarks using atomic
-             # utility
-             project_dirs = [
-                 d for d in os.listdir(directory) if os.path.isdir(
-                     os.path.join(
-                         directory, d))]
-             structure = RustCodeAnalyzer.detect_project_structure(
-                 project_dirs + ["tests", "examples", "benches"])
-
-             # Override with actual directory checks
-             structure["has_tests"] = any(
-                 os.path.exists(
-                     os.path.join(
-                         directory,
-                         d)) for d in [
-                 "tests",
-                 "test"])
-             structure["has_examples"] = os.path.exists(
-                 os.path.join(directory, "examples"))
-             structure["has_benchmarks"] = os.path.exists(
-                 os.path.join(directory, "benches"))
-
-             # Analyze each Rust file using atomic utility
              for file_path in rust_files:
                  try:
-                     with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                     with open(file_path, encoding="utf-8", errors="ignore") as f:
                          content = f.read()
-
-                         # Use atomic content analysis
-                         content_analysis = RustCodeAnalyzer.analyze_rust_content(
-                             content)
+                     analysis = RustCodeAnalyzer.analyze_rust_content(content)
                      metrics = RustCodeAnalyzer.aggregate_metrics(
-                         metrics, content_analysis, structure)
-
+                         metrics, analysis, structure
+                     )
                  except Exception as e:
-                     print(f"Error analyzing file {file_path}: {str(e)}")
-
+                     logging.warning(f"Error analyzing file {file_path}: {e}")
          except Exception as e:
-             metrics["error"] = str(e)
-
+             metrics["error"] = f"Failed to analyze local directory {directory}: {e}"
+             logging.error(metrics["error"])
          return metrics

      @staticmethod
-     def analyze_crate_source_from_repo(
-             crate_name: str,
-             version: str,
-             repo_url: str) -> Dict:
-         """Clone and analyze a crate's source code from repository"""
-         temp_dir = f"/tmp/rust_analysis/{crate_name}"
-         os.makedirs(temp_dir, exist_ok=True)
-
-         try:
-             # Clone repository
-             if not os.path.exists(f"{temp_dir}/.git"):
-                 subprocess.run(["git",
-                                 "clone",
-                                 "--depth=1",
-                                 repo_url,
-                                 temp_dir],
-                                capture_output=True,
-                                text=True,
-                                check=True)
-
-             return SourceAnalyzer.analyze_local_directory(temp_dir)
-
-         except Exception as e:
-             return {
-                 "error": f"Failed to clone and analyze repository: {str(e)}",
-                 "file_count": 0,
-                 "loc": 0
-             }
-         finally:
-             # Clean up (optional)
-             # subprocess.run(["rm", "-r", temp_dir], capture_output=True)
-             pass
+     def analyze_crate_source_from_repo(repo_url: str) -> dict[str, Any]:
+         """Clone and analyze a crate's source code from a repository."""
+         with tempfile.TemporaryDirectory() as temp_dir:
+             try:
+                 logging.info(f"Cloning {repo_url} into {temp_dir}")
+                 subprocess.run(
+                     ["git", "clone", "--depth=1", repo_url, temp_dir],
+                     capture_output=True,
+                     text=True,
+                     check=True,
+                     timeout=120,
+                 )
+                 return SourceAnalyzer.analyze_local_directory(temp_dir)
+             except (
+                 subprocess.CalledProcessError,
+                 subprocess.TimeoutExpired,
+             ) as e:
+                 error_output = ""
+                 if hasattr(e, "stderr") and e.stderr:
+                     error_output = e.stderr.decode("utf-8", "ignore")
+                 else:
+                     error_output = str(e)
+                 logging.error(f"Failed to clone repository {repo_url}: {error_output}")
+                 return {
+                     "error": f"Failed to clone repository: {error_output}",
+                     "file_count": 0,
+                     "loc": 0,
+                 }


  class SecurityAnalyzer:
      @staticmethod
-     def check_security_metrics(crate: EnrichedCrate) -> Dict:
-         """Check security metrics for a crate"""
-         security_data = {
+     def check_security_metrics(crate: EnrichedCrate) -> dict[str, Any]:
+         """Check security metrics for a crate (placeholder)."""
+         security_data: dict[str, Any] = {
              "advisories": [],
              "vulnerability_count": 0,
              "cargo_audit": None,
-             "clippy_warnings": 0,
-             "test_coverage": None
+             "unsafe_blocks": 0,
          }
-
-         crate_name = crate.name
-         version = crate.version
-
-         # Check RustSec Advisory Database
-         try:
-             # This would require the RustSec advisory database
-             # For now, just return placeholder data
-             advisories_url = f"https://rustsec.org/advisories/{crate_name}.json"
-             response = requests.get(advisories_url)
-             if response.ok:
-                 advisories = response.json()
-                 security_data["advisories"] = advisories
-                 security_data["vulnerability_count"] = len(advisories)
-         except Exception:
-             pass
-
-         # Check for common security patterns in code
-         try:
-             # This would analyze the source code for unsafe blocks, etc.
-             # Placeholder for now
-             security_data["unsafe_blocks"] = 0
-             security_data["security_patterns"] = []
-         except Exception:
-             pass
-
+         # In a real implementation, this would run tools like `cargo-audit`
+         # and parse the output. For now, it remains a placeholder.
+         logging.info(f"Running placeholder security check for {crate.name}")
          return security_data


  class UserBehaviorAnalyzer:
      @staticmethod
-     def fetch_user_behavior_data(crate: EnrichedCrate) -> Dict:
-         """Fetch user behavior data from GitHub and crates.io"""
-         result = {
+     def _get_github_headers() -> dict[str, str]:
+         """Get headers for GitHub API requests, including auth if available."""
+         headers = {"Accept": "application/vnd.github.v3+json"}
+         if token := os.environ.get("GITHUB_TOKEN"):
+             headers["Authorization"] = f"token {token}"
+         return headers
+
+     @staticmethod
+     def fetch_user_behavior_data(crate: EnrichedCrate) -> dict[str, Any]:
+         """Fetch user behavior data from GitHub and crates.io."""
+         result: dict[str, Any] = {
              "issues": [],
              "pull_requests": [],
              "version_adoption": {},
-             "community_metrics": {}
+             "community_metrics": {},
          }
-
-         crate_name = crate.name
          repo_url = crate.repository
-
-         # Extract owner/repo from URL
          if not repo_url or "github.com" not in repo_url:
              return result

-         parts = repo_url.rstrip('/').split('/')
-         if len(parts) < 2:
+         match = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
+         if not match:
              return result
-         owner, repo = parts[-2], parts[-1]
+         owner, repo = match.groups()
+         repo = repo.replace(".git", "")

-         # Setup GitHub API access - use token if available
-         headers = {"Accept": "application/vnd.github.v3+json"}
-         if os.environ.get("GITHUB_TOKEN"):
-             headers["Authorization"] = f"token {
-                 os.environ.get('GITHUB_TOKEN')}"
+         headers = UserBehaviorAnalyzer._get_github_headers()
+         UserBehaviorAnalyzer._fetch_github_activity(owner, repo, headers, result)
+         UserBehaviorAnalyzer._fetch_crates_io_versions(crate.name, result)

-         # Fetch recent issues and PRs
-         try:
-             # Get issues (last 30)
-             issues_url = f"https://api.github.com/repos/{owner}/{repo}/issues?state=all&per_page=30"
-             issues_resp = requests.get(issues_url, headers=headers)
-             if issues_resp.ok:
-                 issues_data = issues_resp.json()
-
-                 # Process issue data
-                 for issue in issues_data:
-                     if "pull_request" in issue:
-                         # This is a PR, not an issue
-                         result["pull_requests"].append({
-                             "number": issue["number"],
-                             "title": issue["title"],
-                             "state": issue["state"],
-                             "created_at": issue["created_at"],
-                             "closed_at": issue["closed_at"],
-                             "url": issue["html_url"]
-                         })
-                     else:
-                         # Regular issue
-                         result["issues"].append({
-                             "number": issue["number"],
-                             "title": issue["title"],
-                             "state": issue["state"],
-                             "created_at": issue["created_at"],
-                             "closed_at": issue["closed_at"],
-                             "url": issue["html_url"]
-                         })
-
-             # Fetch commit activity for the past year
-             commits_url = f"https://api.github.com/repos/{owner}/{repo}/stats/commit_activity"
-             commits_resp = requests.get(commits_url, headers=headers)
-             if commits_resp.ok:
-                 result["community_metrics"]["commit_activity"] = commits_resp.json()
-
-             # Rate limiting - be nice to GitHub API
-             time.sleep(1)
-         except Exception as e:
-             print(f"Error fetching GitHub data: {str(e)}")
+         return result

-         # Get version adoption data from crates.io
+     @staticmethod
+     def _fetch_github_activity(
+         owner: str, repo: str, headers: dict[str, str], result: dict[str, Any]
+     ) -> None:
+         """Fetch issues, PRs, and commit activity from GitHub."""
          try:
-             versions_url = f"https://crates.io/api/v1/crates/{crate_name}/versions"
-             versions_resp = requests.get(versions_url)
-             if versions_resp.ok:
-                 versions_data = versions_resp.json()
-                 versions = versions_data.get("versions", [])
-
-                 # Process version data
-                 for version in versions[:10]:  # Top 10 versions
-                     version_num = version["num"]
-                     downloads = version["downloads"]
-                     created_at = version["created_at"]
-
-                     result["version_adoption"][version_num] = {
-                         "downloads": downloads,
-                         "created_at": created_at
+             issues_url = f"{GITHUB_API_URL}/{owner}/{repo}/issues?state=all&per_page=30"
+             issues_resp = requests.get(issues_url, headers=headers, timeout=30)
+             issues_resp.raise_for_status()
+
+             for item in issues_resp.json():
+                 is_pr = "pull_request" in item
+                 data_list = result["pull_requests"] if is_pr else result["issues"]
+                 data_list.append(
+                     {
+                         "number": item["number"],
+                         "title": item["title"],
+                         "state": item["state"],
+                         "created_at": item["created_at"],
+                         "closed_at": item["closed_at"],
+                         "url": item["html_url"],
                      }
-         except Exception as e:
-             print(f"Error fetching crates.io version data: {str(e)}")
+                 )
+
+             # Fetch commit activity (retries on 202)
+             activity_url = f"{GITHUB_API_URL}/{owner}/{repo}/stats/commit_activity"
+             for _ in range(3):  # Retry up to 3 times
+                 activity_resp = requests.get(activity_url, headers=headers, timeout=60)
+                 if activity_resp.status_code == 200:
+                     result["community_metrics"][
+                         "commit_activity"
+                     ] = activity_resp.json()
+                     break
+                 elif activity_resp.status_code == 202:
+                     logging.info(
+                         f"GitHub is calculating stats for {owner}/{repo}, waiting..."
+                     )
+                     time.sleep(2)
+                 else:
+                     activity_resp.raise_for_status()
+
+         except requests.RequestException as e:
+             logging.warning(f"Error fetching GitHub data for {owner}/{repo}: {e}")

-         return result
+     @staticmethod
+     def _fetch_crates_io_versions(crate_name: str, result: dict[str, Any]) -> None:
+         """Fetch version adoption data from crates.io."""
+         try:
+             versions_url = f"{CRATES_IO_API_URL}/{crate_name}/versions"
+             versions_resp = requests.get(versions_url, timeout=30)
+             versions_resp.raise_for_status()
+             versions_data = versions_resp.json().get("versions", [])
+
+             for version in versions_data[:10]:  # Top 10 versions
+                 result["version_adoption"][version["num"]] = {
+                     "downloads": version["downloads"],
+                     "created_at": version["created_at"],
+                 }
+         except requests.RequestException as e:
+             logging.warning(
+                 f"Error fetching crates.io version data for {crate_name}: {e}"
+             )


  class DependencyAnalyzer:
      @staticmethod
-     def analyze_dependencies(crates: List[EnrichedCrate]) -> Dict:
-         """Analyze dependencies between crates"""
-         dependency_graph = {}
+     def analyze_dependencies(crates: list[EnrichedCrate]) -> dict[str, Any]:
+         """Analyze dependencies within a given list of crates."""
          crate_names = {crate.name for crate in crates}
+         dependency_graph: dict[str, list[str]] = {
+             crate.name: [
+                 dep_id
+                 for dep in crate.dependencies
+                 if (dep_id := dep.get("crate_id")) and dep_id in crate_names
+             ]
+             for crate in crates
+         }

-         for crate in crates:
-             deps = []
-             for dep in crate.dependencies:
-                 if dep.get("crate_id") in crate_names:
-                     deps.append(dep.get("crate_id"))
-             dependency_graph[crate.name] = deps
-
-         # Find most depended-upon crates
-         reverse_deps = {}
+         reverse_deps: dict[str, list[str]] = {}
          for crate_name, deps in dependency_graph.items():
              for dep in deps:
-                 if dep not in reverse_deps:
-                     reverse_deps[dep] = []
-                 reverse_deps[dep].append(crate_name)
+                 if dep:  # Ensure dep is not None
+                     reverse_deps.setdefault(dep, []).append(crate_name)
+
+         most_depended = sorted(
+             reverse_deps.items(), key=lambda item: len(item[1]), reverse=True
+         )[:10]

          return {
              "dependency_graph": dependency_graph,
              "reverse_dependencies": reverse_deps,
-             "most_depended": sorted(
-                 reverse_deps.items(),
-                 key=lambda x: len(
-                     x[1]),
-                 reverse=True)[
-                 :10]}
+             "most_depended": most_depended,
+         }
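
The 1.4.1 rewrite exposes the analyzers as static methods with explicit timeouts and logging. A minimal usage sketch of the new SourceAnalyzer entry point, assuming EnrichedCrate (the dataclass imported from config.py) can be constructed with just name, version, and repository; the field set shown here is illustrative and the real signature may require more:

    import logging
    from rust_crate_pipeline.config import EnrichedCrate
    from rust_crate_pipeline.analysis import SourceAnalyzer

    logging.basicConfig(level=logging.INFO)

    # Hypothetical construction; the actual EnrichedCrate dataclass in
    # config.py may define additional required fields.
    crate = EnrichedCrate(
        name="serde",
        version="1.0.0",
        repository="https://github.com/serde-rs/serde",
    )

    # Tries the crates.io download API, then the GitHub tarball API,
    # then a shallow git clone, and returns the aggregated metrics dict.
    metrics = SourceAnalyzer.analyze_crate_source(crate)
    print(metrics.get("file_count"), metrics.get("lines_of_code"))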