rust-crate-pipeline 1.2.6__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. rust_crate_pipeline/__init__.py +25 -25
  2. rust_crate_pipeline/__main__.py +1 -0
  3. rust_crate_pipeline/ai_processing.py +309 -200
  4. rust_crate_pipeline/analysis.py +304 -368
  5. rust_crate_pipeline/azure_ai_processing.py +453 -0
  6. rust_crate_pipeline/config.py +57 -19
  7. rust_crate_pipeline/core/__init__.py +19 -0
  8. rust_crate_pipeline/core/canon_registry.py +133 -0
  9. rust_crate_pipeline/core/irl_engine.py +256 -0
  10. rust_crate_pipeline/core/sacred_chain.py +117 -0
  11. rust_crate_pipeline/crate_analysis.py +54 -0
  12. rust_crate_pipeline/crate_list.txt +424 -0
  13. rust_crate_pipeline/github_token_checker.py +42 -36
  14. rust_crate_pipeline/main.py +386 -102
  15. rust_crate_pipeline/network.py +153 -133
  16. rust_crate_pipeline/pipeline.py +340 -264
  17. rust_crate_pipeline/production_config.py +35 -32
  18. rust_crate_pipeline/scraping/__init__.py +13 -0
  19. rust_crate_pipeline/scraping/unified_scraper.py +259 -0
  20. rust_crate_pipeline/unified_llm_processor.py +637 -0
  21. rust_crate_pipeline/unified_pipeline.py +548 -0
  22. rust_crate_pipeline/utils/file_utils.py +45 -14
  23. rust_crate_pipeline/utils/logging_utils.py +34 -17
  24. rust_crate_pipeline/version.py +53 -2
  25. rust_crate_pipeline-1.3.1.dist-info/METADATA +357 -0
  26. rust_crate_pipeline-1.3.1.dist-info/RECORD +30 -0
  27. rust_crate_pipeline-1.2.6.dist-info/METADATA +0 -573
  28. rust_crate_pipeline-1.2.6.dist-info/RECORD +0 -19
  29. {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.3.1.dist-info}/WHEEL +0 -0
  30. {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.3.1.dist-info}/entry_points.txt +0 -0
  31. {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.3.1.dist-info}/licenses/LICENSE +0 -0
  32. {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.3.1.dist-info}/top_level.txt +0 -0
@@ -1,436 +1,372 @@
1
1
  # analysis.py
2
- import os
3
- import re
4
2
  import io
5
- import json
6
- import time
3
+ import re
7
4
  import tarfile
5
+ import requests
6
+ import logging
8
7
  import tempfile
8
+ from typing import Any
9
+ import os
10
+ import sys
11
+ import time
9
12
  import subprocess
10
- import requests
11
- from datetime import datetime
12
- from dateutil.relativedelta import relativedelta
13
- from bs4 import BeautifulSoup
14
- from typing import Dict, Optional, List
13
+
15
14
  from .config import EnrichedCrate
16
15
 
16
+ # Add the project root to the path to ensure utils can be imported
17
+ # This is a common pattern in scripts to handle execution from different directories
18
+ project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
19
+ if project_root not in sys.path:
20
+ sys.path.insert(0, project_root)
21
+
22
+ try:
23
+ from utils.rust_code_analyzer import RustCodeAnalyzer # type: ignore
24
+ except ImportError as e:
25
+ logging.error(
26
+ f"Failed to import RustCodeAnalyzer: {e}. "
27
+ f"Ensure the utils directory is in the Python path."
28
+ )
29
+ # Provide a non-functional fallback to avoid crashing the entire application
30
+ # if the import fails, but ensure it logs the error.
31
+
32
+ class RustCodeAnalyzer: # type: ignore
33
+ def __init__(self, code_content: str) -> None:
34
+ logging.error(
35
+ "Using fallback RustCodeAnalyzer. Analysis will be incomplete."
36
+ )
37
+ self.code_content = code_content
38
+
39
+ def analyze(self) -> dict[str, Any]:
40
+ return {
41
+ "functions": [],
42
+ "structs": [],
43
+ "enums": [],
44
+ "traits": [],
45
+ "complexity": 0,
46
+ "lines_of_code": len(self.code_content.split("\n")),
47
+ }
48
+
49
+ @staticmethod
50
+ def create_empty_metrics() -> dict[str, Any]:
51
+ return {}
52
+
53
+ @staticmethod
54
+ def detect_project_structure(files: list[str]) -> dict[str, bool]:
55
+ return {}
56
+
57
+ @staticmethod
58
+ def analyze_rust_content(content: str) -> dict[str, Any]:
59
+ return {}
60
+
61
+ @staticmethod
62
+ def aggregate_metrics(
63
+ metrics: dict[str, Any],
64
+ content_analysis: dict[str, Any],
65
+ structure: dict[str, bool],
66
+ ) -> dict[str, Any]:
67
+ return metrics
68
+
69
+
70
+ # Constants for URLs and paths
71
+ CRATES_IO_API_URL = "https://crates.io/api/v1/crates"
72
+ GITHUB_API_URL = "https://api.github.com/repos"
73
+ LIB_RS_URL = "https://lib.rs/crates"
74
+
75
+
17
76
  class SourceAnalyzer:
18
77
  @staticmethod
19
- def analyze_crate_source(crate: EnrichedCrate) -> Dict:
20
- """Orchestrate source analysis from multiple sources"""
21
- crate_name = crate.name
22
- version = crate.version
78
+ def analyze_crate_source(crate: EnrichedCrate) -> dict[str, Any]:
79
+ """Orchestrate source analysis from multiple sources."""
23
80
  repo_url = crate.repository
24
-
81
+
25
82
  # Method 1: Try to download from crates.io
26
83
  try:
27
- url = f"https://crates.io/api/v1/crates/{crate_name}/{version}/download"
28
- response = requests.get(url, stream=True)
29
-
30
- if response.ok:
31
- # We got the tarball, analyze it
32
- return SourceAnalyzer.analyze_crate_tarball(response.content)
33
- except Exception as e:
34
- print(f"Failed to download from crates.io: {str(e)}")
35
-
84
+ url = f"{CRATES_IO_API_URL}/{crate.name}/{crate.version}/download"
85
+ response = requests.get(url, stream=True, timeout=30)
86
+ response.raise_for_status()
87
+ logging.info(f"Successfully downloaded {crate.name} from crates.io")
88
+ return SourceAnalyzer.analyze_crate_tarball(response.content)
89
+ except requests.RequestException as e:
90
+ logging.warning(f"Failed to download from crates.io: {e}")
91
+
36
92
  # Method 2: Try GitHub if we have a GitHub URL
37
- if "github.com" in repo_url:
93
+ if repo_url and "github.com" in repo_url:
94
+ match = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
95
+ if match:
96
+ owner, repo_name = match.groups()
97
+ repo_name = repo_name.replace(".git", "")
98
+ try:
99
+ github_url = f"{GITHUB_API_URL}/{owner}/{repo_name}/tarball"
100
+ response = requests.get(github_url, timeout=30)
101
+ response.raise_for_status()
102
+ logging.info(f"Successfully downloaded {crate.name} from GitHub")
103
+ return SourceAnalyzer.analyze_github_tarball(response.content)
104
+ except requests.RequestException as e:
105
+ logging.warning(f"Failed to analyze from GitHub: {e}")
106
+
107
+ # Method 3: Fallback to cloning from the repository directly
108
+ if repo_url:
38
109
  try:
39
- # Extract owner/repo from URL
40
- match = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
41
- if match:
42
- owner, repo_name = match.groups()
43
- repo_name = repo_name.split('.')[0] # Remove .git extension
44
-
45
- # Try to download tarball from GitHub
46
- github_url = f"https://api.github.com/repos/{owner}/{repo_name}/tarball"
47
- response = requests.get(github_url)
48
-
49
- if response.ok:
50
- return SourceAnalyzer.analyze_github_tarball(response.content)
110
+ logging.info(f"Attempting to clone repository for {crate.name}")
111
+ return SourceAnalyzer.analyze_crate_source_from_repo(repo_url)
51
112
  except Exception as e:
52
- print(f"Failed to analyze from GitHub: {str(e)}")
53
-
54
- # Method 3: Try lib.rs
55
- try:
56
- # lib.rs doesn't have a direct download API, but redirects to crates.io or GitHub
57
- url = f"https://lib.rs/crates/{crate_name}"
58
- response = requests.get(url)
59
-
60
- if response.ok:
61
- soup = BeautifulSoup(response.text, 'html.parser')
62
-
63
- # Look for repository links
64
- repo_links = soup.select('a[href*="github.com"]')
65
- if repo_links:
66
- repo_url = repo_links[0]['href']
67
-
68
- # We found a GitHub link, now analyze it
69
- return SourceAnalyzer.analyze_crate_source_from_repo(crate_name, version, repo_url)
70
- except Exception as e:
71
- print(f"Failed to analyze from lib.rs: {str(e)}")
72
-
73
- # If we get here, we failed to analyze from any source
113
+ logging.error(f"Failed to clone and analyze repository {repo_url}: {e}")
114
+
74
115
  return {
75
- "error": "Could not analyze crate from any source",
76
- "attempted_sources": ["crates.io", "github", "lib.rs"],
116
+ "error": "Could not analyze crate from any available source.",
117
+ "attempted_sources": ["crates.io", "github", "git_clone"],
77
118
  "file_count": 0,
78
- "loc": 0
119
+ "loc": 0,
79
120
  }
80
121
 
81
122
  @staticmethod
82
- def analyze_crate_tarball(content: bytes) -> Dict:
83
- """Analyze a .crate tarball from crates.io"""
84
- metrics = {
85
- "file_count": 0,
86
- "loc": 0,
87
- "complexity": [],
88
- "types": [],
89
- "traits": [],
90
- "functions": [],
91
- "has_tests": False,
92
- "has_examples": False,
93
- "has_benchmarks": False
94
- }
95
-
123
+ def _analyze_tarball_content(content: bytes) -> dict[str, Any]:
124
+ """Shared logic to analyze tarball content from any source."""
125
+ metrics = RustCodeAnalyzer.create_empty_metrics()
96
126
  try:
97
- # Open the tar file from the content
98
- tar_content = io.BytesIO(content)
99
- with tarfile.open(fileobj=tar_content, mode='r:gz') as tar:
100
- # Get list of Rust files
101
- rust_files = [f for f in tar.getnames() if f.endswith('.rs')]
127
+ with io.BytesIO(content) as tar_content, tarfile.open(
128
+ fileobj=tar_content, mode="r:gz"
129
+ ) as tar:
130
+ rust_files = [f for f in tar.getnames() if f.endswith(".rs")]
102
131
  metrics["file_count"] = len(rust_files)
103
-
104
- # Check for test/example/bench directories
105
- all_files = tar.getnames()
106
- metrics["has_tests"] = any('test' in f.lower() for f in all_files)
107
- metrics["has_examples"] = any('example' in f.lower() for f in all_files)
108
- metrics["has_benchmarks"] = any('bench' in f.lower() for f in all_files)
109
-
110
- # Analyze each Rust file
111
- for filename in rust_files:
112
- try:
113
- member = tar.getmember(filename)
114
- if member.isfile():
115
- file_content = tar.extractfile(member)
116
- if file_content:
117
- content_str = file_content.read().decode('utf-8', errors='ignore')
118
-
119
- # Count lines of code
120
- metrics["loc"] += len(content_str.splitlines())
121
-
122
- # Extract code elements
123
- fn_matches = re.findall(r'fn\s+([a-zA-Z0-9_]+)', content_str)
124
- struct_matches = re.findall(r'struct\s+([a-zA-Z0-9_]+)', content_str)
125
- trait_matches = re.findall(r'trait\s+([a-zA-Z0-9_]+)', content_str)
126
-
127
- metrics["functions"].extend(fn_matches)
128
- metrics["types"].extend(struct_matches)
129
- metrics["traits"].extend(trait_matches)
130
- except Exception as e:
131
- print(f"Error analyzing file {filename}: {str(e)}")
132
-
133
- except Exception as e:
134
- metrics["error"] = str(e)
135
-
132
+ structure = RustCodeAnalyzer.detect_project_structure(tar.getnames())
133
+
134
+ for member in tar.getmembers():
135
+ if member.isfile() and member.name.endswith(".rs"):
136
+ file_content = tar.extractfile(member)
137
+ if file_content:
138
+ try:
139
+ content_str = file_content.read().decode("utf-8")
140
+ analysis = RustCodeAnalyzer.analyze_rust_content(
141
+ content_str
142
+ )
143
+ metrics = RustCodeAnalyzer.aggregate_metrics(
144
+ metrics, analysis, structure
145
+ )
146
+ except UnicodeDecodeError:
147
+ logging.warning(
148
+ f"Skipping non-UTF-8 file: {member.name}"
149
+ )
150
+ except tarfile.TarError as e:
151
+ metrics["error"] = f"Failed to read tarball: {e}"
152
+ logging.error(metrics["error"])
136
153
  return metrics
137
154
 
138
155
  @staticmethod
139
- def analyze_github_tarball(content: bytes) -> Dict:
140
- """Analyze a GitHub tarball (which has a different structure)"""
141
- metrics = {
142
- "file_count": 0,
143
- "loc": 0,
144
- "complexity": [],
145
- "types": [],
146
- "traits": [],
147
- "functions": [],
148
- "has_tests": False,
149
- "has_examples": False,
150
- "has_benchmarks": False
151
- }
152
-
153
- try:
154
- # GitHub tarballs are typically gzipped tar files
155
- tar_content = io.BytesIO(content)
156
- with tarfile.open(fileobj=tar_content, mode='r:gz') as tar:
157
- # GitHub tarballs include the repo name and commit as the top dir
158
- # So we need to handle the different structure
159
- rust_files = [f for f in tar.getnames() if f.endswith('.rs')]
160
- metrics["file_count"] = len(rust_files)
161
-
162
- # Check for test/example/bench directories
163
- all_files = tar.getnames()
164
- metrics["has_tests"] = any('test' in f.lower() for f in all_files)
165
- metrics["has_examples"] = any('example' in f.lower() for f in all_files)
166
- metrics["has_benchmarks"] = any('bench' in f.lower() for f in all_files)
167
-
168
- # Analyze each Rust file (same as crate tarball)
169
- for filename in rust_files:
170
- try:
171
- member = tar.getmember(filename)
172
- if member.isfile():
173
- file_content = tar.extractfile(member)
174
- if file_content:
175
- content_str = file_content.read().decode('utf-8', errors='ignore')
176
-
177
- # Count lines of code
178
- metrics["loc"] += len(content_str.splitlines())
179
-
180
- # Extract code elements
181
- fn_matches = re.findall(r'fn\s+([a-zA-Z0-9_]+)', content_str)
182
- struct_matches = re.findall(r'struct\s+([a-zA-Z0-9_]+)', content_str)
183
- trait_matches = re.findall(r'trait\s+([a-zA-Z0-9_]+)', content_str)
184
-
185
- metrics["functions"].extend(fn_matches)
186
- metrics["types"].extend(struct_matches)
187
- metrics["traits"].extend(trait_matches)
188
- except Exception as e:
189
- print(f"Error analyzing file {filename}: {str(e)}")
190
-
191
- except Exception as e:
192
- metrics["error"] = str(e)
193
-
194
- return metrics
156
+ def analyze_crate_tarball(content: bytes) -> dict[str, Any]:
157
+ """Analyze a .crate tarball from crates.io."""
158
+ return SourceAnalyzer._analyze_tarball_content(content)
195
159
 
196
160
  @staticmethod
197
- def analyze_local_directory(directory: str) -> Dict:
198
- """Analyze source code from a local directory"""
199
- metrics = {
200
- "file_count": 0,
201
- "loc": 0,
202
- "complexity": [],
203
- "types": [],
204
- "traits": [],
205
- "functions": [],
206
- "has_tests": False,
207
- "has_examples": False,
208
- "has_benchmarks": False
209
- }
210
-
161
+ def analyze_github_tarball(content: bytes) -> dict[str, Any]:
162
+ """Analyze a GitHub tarball."""
163
+ return SourceAnalyzer._analyze_tarball_content(content)
164
+
165
+ @staticmethod
166
+ def analyze_local_directory(directory: str) -> dict[str, Any]:
167
+ """Analyze source code from a local directory."""
168
+ metrics = RustCodeAnalyzer.create_empty_metrics()
211
169
  try:
212
- # Find all Rust files
213
- rust_files = []
214
- for root, _, files in os.walk(directory):
215
- if "target" in root or ".git" in root: # Skip build dirs and git
216
- continue
217
- rust_files.extend([os.path.join(root, f) for f in files if f.endswith(".rs")])
218
-
170
+ rust_files: list[str] = []
171
+ all_paths: list[str] = []
172
+ for root, dirs, files in os.walk(directory):
173
+ # Exclude target and .git directories
174
+ dirs[:] = [d for d in dirs if d not in ["target", ".git"]]
175
+ for file in files:
176
+ full_path = os.path.join(root, file)
177
+ all_paths.append(full_path)
178
+ if file.endswith(".rs"):
179
+ rust_files.append(full_path)
180
+
219
181
  metrics["file_count"] = len(rust_files)
220
-
221
- # Check if the crate has tests/examples/benchmarks
222
- metrics["has_tests"] = any(os.path.exists(os.path.join(directory, d))
223
- for d in ["tests", "test"])
224
- metrics["has_examples"] = os.path.exists(os.path.join(directory, "examples"))
225
- metrics["has_benchmarks"] = os.path.exists(os.path.join(directory, "benches"))
226
-
227
- # Analyze each Rust file
182
+ structure = RustCodeAnalyzer.detect_project_structure(all_paths)
183
+
228
184
  for file_path in rust_files:
229
185
  try:
230
- with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
186
+ with open(file_path, encoding="utf-8", errors="ignore") as f:
231
187
  content = f.read()
232
-
233
- # Count lines of code
234
- metrics["loc"] += len(content.splitlines())
235
-
236
- # Extract code elements
237
- fn_matches = re.findall(r'fn\s+([a-zA-Z0-9_]+)', content)
238
- struct_matches = re.findall(r'struct\s+([a-zA-Z0-9_]+)', content)
239
- trait_matches = re.findall(r'trait\s+([a-zA-Z0-9_]+)', content)
240
-
241
- metrics["functions"].extend(fn_matches)
242
- metrics["types"].extend(struct_matches)
243
- metrics["traits"].extend(trait_matches)
244
-
188
+ analysis = RustCodeAnalyzer.analyze_rust_content(content)
189
+ metrics = RustCodeAnalyzer.aggregate_metrics(
190
+ metrics, analysis, structure
191
+ )
245
192
  except Exception as e:
246
- print(f"Error analyzing file {file_path}: {str(e)}")
247
-
193
+ logging.warning(f"Error analyzing file {file_path}: {e}")
248
194
  except Exception as e:
249
- metrics["error"] = str(e)
250
-
195
+ metrics["error"] = f"Failed to analyze local directory {directory}: {e}"
196
+ logging.error(metrics["error"])
251
197
  return metrics
252
198
 
253
199
  @staticmethod
254
- def analyze_crate_source_from_repo(crate_name: str, version: str, repo_url: str) -> Dict:
255
- """Clone and analyze a crate's source code from repository"""
256
- temp_dir = f"/tmp/rust_analysis/{crate_name}"
257
- os.makedirs(temp_dir, exist_ok=True)
258
-
259
- try:
260
- # Clone repository
261
- if not os.path.exists(f"{temp_dir}/.git"):
262
- subprocess.run(["git", "clone", "--depth=1", repo_url, temp_dir],
263
- capture_output=True, text=True, check=True)
264
-
265
- return SourceAnalyzer.analyze_local_directory(temp_dir)
266
-
267
- except Exception as e:
268
- return {
269
- "error": f"Failed to clone and analyze repository: {str(e)}",
270
- "file_count": 0,
271
- "loc": 0
272
- }
273
- finally:
274
- # Clean up (optional)
275
- # subprocess.run(["rm", "-rf", temp_dir], capture_output=True)
276
- pass
200
+ def analyze_crate_source_from_repo(repo_url: str) -> dict[str, Any]:
201
+ """Clone and analyze a crate's source code from a repository."""
202
+ with tempfile.TemporaryDirectory() as temp_dir:
203
+ try:
204
+ logging.info(f"Cloning {repo_url} into {temp_dir}")
205
+ subprocess.run(
206
+ ["git", "clone", "--depth=1", repo_url, temp_dir],
207
+ capture_output=True,
208
+ text=True,
209
+ check=True,
210
+ timeout=120,
211
+ )
212
+ return SourceAnalyzer.analyze_local_directory(temp_dir)
213
+ except (
214
+ subprocess.CalledProcessError,
215
+ subprocess.TimeoutExpired,
216
+ ) as e:
217
+ error_output = ""
218
+ if hasattr(e, "stderr") and e.stderr:
219
+ error_output = e.stderr.decode("utf-8", "ignore")
220
+ else:
221
+ error_output = str(e)
222
+ logging.error(f"Failed to clone repository {repo_url}: {error_output}")
223
+ return {
224
+ "error": f"Failed to clone repository: {error_output}",
225
+ "file_count": 0,
226
+ "loc": 0,
227
+ }
228
+
277
229
 
278
230
  class SecurityAnalyzer:
279
231
  @staticmethod
280
- def check_security_metrics(crate: EnrichedCrate) -> Dict:
281
- """Check security metrics for a crate"""
282
- security_data = {
232
+ def check_security_metrics(crate: EnrichedCrate) -> dict[str, Any]:
233
+ """Check security metrics for a crate (placeholder)."""
234
+ security_data: dict[str, Any] = {
283
235
  "advisories": [],
284
236
  "vulnerability_count": 0,
285
237
  "cargo_audit": None,
286
- "clippy_warnings": 0,
287
- "test_coverage": None
238
+ "unsafe_blocks": 0,
288
239
  }
289
-
290
- crate_name = crate.name
291
- version = crate.version
292
-
293
- # Check RustSec Advisory Database
294
- try:
295
- # This would require the RustSec advisory database
296
- # For now, just return placeholder data
297
- advisories_url = f"https://rustsec.org/advisories/{crate_name}.json"
298
- response = requests.get(advisories_url)
299
- if response.ok:
300
- advisories = response.json()
301
- security_data["advisories"] = advisories
302
- security_data["vulnerability_count"] = len(advisories)
303
- except Exception:
304
- pass
305
-
306
- # Check for common security patterns in code
307
- try:
308
- # This would analyze the source code for unsafe blocks, etc.
309
- # Placeholder for now
310
- security_data["unsafe_blocks"] = 0
311
- security_data["security_patterns"] = []
312
- except Exception:
313
- pass
314
-
240
+ # In a real implementation, this would run tools like `cargo-audit`
241
+ # and parse the output. For now, it remains a placeholder.
242
+ logging.info(f"Running placeholder security check for {crate.name}")
315
243
  return security_data
316
244
 
245
+
317
246
  class UserBehaviorAnalyzer:
318
247
  @staticmethod
319
- def fetch_user_behavior_data(crate: EnrichedCrate) -> Dict:
320
- """Fetch user behavior data from GitHub and crates.io"""
321
- result = {
248
+ def _get_github_headers() -> dict[str, str]:
249
+ """Get headers for GitHub API requests, including auth if available."""
250
+ headers = {"Accept": "application/vnd.github.v3+json"}
251
+ if token := os.environ.get("GITHUB_TOKEN"):
252
+ headers["Authorization"] = f"token {token}"
253
+ return headers
254
+
255
+ @staticmethod
256
+ def fetch_user_behavior_data(crate: EnrichedCrate) -> dict[str, Any]:
257
+ """Fetch user behavior data from GitHub and crates.io."""
258
+ result: dict[str, Any] = {
322
259
  "issues": [],
323
260
  "pull_requests": [],
324
261
  "version_adoption": {},
325
- "community_metrics": {}
262
+ "community_metrics": {},
326
263
  }
327
-
328
- crate_name = crate.name
329
264
  repo_url = crate.repository
330
-
331
- # Extract owner/repo from URL
332
265
  if not repo_url or "github.com" not in repo_url:
333
266
  return result
334
-
335
- parts = repo_url.rstrip('/').split('/')
336
- if len(parts) < 2:
267
+
268
+ match = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
269
+ if not match:
337
270
  return result
338
- owner, repo = parts[-2], parts[-1]
339
-
340
- # Setup GitHub API access - use token if available
341
- headers = {"Accept": "application/vnd.github.v3+json"}
342
- if os.environ.get("GITHUB_TOKEN"):
343
- headers["Authorization"] = f"token {os.environ.get('GITHUB_TOKEN')}"
344
-
345
- # Fetch recent issues and PRs
346
- try:
347
- # Get issues (last 30)
348
- issues_url = f"https://api.github.com/repos/{owner}/{repo}/issues?state=all&per_page=30"
349
- issues_resp = requests.get(issues_url, headers=headers)
350
- if issues_resp.ok:
351
- issues_data = issues_resp.json()
352
-
353
- # Process issue data
354
- for issue in issues_data:
355
- if "pull_request" in issue:
356
- # This is a PR, not an issue
357
- result["pull_requests"].append({
358
- "number": issue["number"],
359
- "title": issue["title"],
360
- "state": issue["state"],
361
- "created_at": issue["created_at"],
362
- "closed_at": issue["closed_at"],
363
- "url": issue["html_url"]
364
- })
365
- else:
366
- # Regular issue
367
- result["issues"].append({
368
- "number": issue["number"],
369
- "title": issue["title"],
370
- "state": issue["state"],
371
- "created_at": issue["created_at"],
372
- "closed_at": issue["closed_at"],
373
- "url": issue["html_url"]
374
- })
375
-
376
- # Fetch commit activity for the past year
377
- commits_url = f"https://api.github.com/repos/{owner}/{repo}/stats/commit_activity"
378
- commits_resp = requests.get(commits_url, headers=headers)
379
- if commits_resp.ok:
380
- result["community_metrics"]["commit_activity"] = commits_resp.json()
381
-
382
- # Rate limiting - be nice to GitHub API
383
- time.sleep(1)
384
- except Exception as e:
385
- print(f"Error fetching GitHub data: {str(e)}")
386
-
387
- # Get version adoption data from crates.io
271
+ owner, repo = match.groups()
272
+ repo = repo.replace(".git", "")
273
+
274
+ headers = UserBehaviorAnalyzer._get_github_headers()
275
+ UserBehaviorAnalyzer._fetch_github_activity(owner, repo, headers, result)
276
+ UserBehaviorAnalyzer._fetch_crates_io_versions(crate.name, result)
277
+
278
+ return result
279
+
280
+ @staticmethod
281
+ def _fetch_github_activity(
282
+ owner: str, repo: str, headers: dict[str, str], result: dict[str, Any]
283
+ ) -> None:
284
+ """Fetch issues, PRs, and commit activity from GitHub."""
388
285
  try:
389
- versions_url = f"https://crates.io/api/v1/crates/{crate_name}/versions"
390
- versions_resp = requests.get(versions_url)
391
- if versions_resp.ok:
392
- versions_data = versions_resp.json()
393
- versions = versions_data.get("versions", [])
394
-
395
- # Process version data
396
- for version in versions[:10]: # Top 10 versions
397
- version_num = version["num"]
398
- downloads = version["downloads"]
399
- created_at = version["created_at"]
400
-
401
- result["version_adoption"][version_num] = {
402
- "downloads": downloads,
403
- "created_at": created_at
286
+ issues_url = f"{GITHUB_API_URL}/{owner}/{repo}/issues?state=all&per_page=30"
287
+ issues_resp = requests.get(issues_url, headers=headers, timeout=30)
288
+ issues_resp.raise_for_status()
289
+
290
+ for item in issues_resp.json():
291
+ is_pr = "pull_request" in item
292
+ data_list = result["pull_requests"] if is_pr else result["issues"]
293
+ data_list.append(
294
+ {
295
+ "number": item["number"],
296
+ "title": item["title"],
297
+ "state": item["state"],
298
+ "created_at": item["created_at"],
299
+ "closed_at": item["closed_at"],
300
+ "url": item["html_url"],
404
301
  }
405
- except Exception as e:
406
- print(f"Error fetching crates.io version data: {str(e)}")
407
-
408
- return result
302
+ )
303
+
304
+ # Fetch commit activity (retries on 202)
305
+ activity_url = f"{GITHUB_API_URL}/{owner}/{repo}/stats/commit_activity"
306
+ for _ in range(3): # Retry up to 3 times
307
+ activity_resp = requests.get(activity_url, headers=headers, timeout=60)
308
+ if activity_resp.status_code == 200:
309
+ result["community_metrics"][
310
+ "commit_activity"
311
+ ] = activity_resp.json()
312
+ break
313
+ elif activity_resp.status_code == 202:
314
+ logging.info(
315
+ f"GitHub is calculating stats for {owner}/{repo}, waiting..."
316
+ )
317
+ time.sleep(2)
318
+ else:
319
+ activity_resp.raise_for_status()
320
+
321
+ except requests.RequestException as e:
322
+ logging.warning(f"Error fetching GitHub data for {owner}/{repo}: {e}")
323
+
324
+ @staticmethod
325
+ def _fetch_crates_io_versions(crate_name: str, result: dict[str, Any]) -> None:
326
+ """Fetch version adoption data from crates.io."""
327
+ try:
328
+ versions_url = f"{CRATES_IO_API_URL}/{crate_name}/versions"
329
+ versions_resp = requests.get(versions_url, timeout=30)
330
+ versions_resp.raise_for_status()
331
+ versions_data = versions_resp.json().get("versions", [])
332
+
333
+ for version in versions_data[:10]: # Top 10 versions
334
+ result["version_adoption"][version["num"]] = {
335
+ "downloads": version["downloads"],
336
+ "created_at": version["created_at"],
337
+ }
338
+ except requests.RequestException as e:
339
+ logging.warning(
340
+ f"Error fetching crates.io version data for {crate_name}: {e}"
341
+ )
342
+
409
343
 
410
344
  class DependencyAnalyzer:
411
345
  @staticmethod
412
- def analyze_dependencies(crates: List[EnrichedCrate]) -> Dict:
413
- """Analyze dependencies between crates"""
414
- dependency_graph = {}
346
+ def analyze_dependencies(crates: list[EnrichedCrate]) -> dict[str, Any]:
347
+ """Analyze dependencies within a given list of crates."""
415
348
  crate_names = {crate.name for crate in crates}
416
-
417
- for crate in crates:
418
- deps = []
419
- for dep in crate.dependencies:
420
- if dep.get("crate_id") in crate_names:
421
- deps.append(dep.get("crate_id"))
422
- dependency_graph[crate.name] = deps
423
-
424
- # Find most depended-upon crates
425
- reverse_deps = {}
349
+ dependency_graph: dict[str, list[str]] = {
350
+ crate.name: [
351
+ dep_id
352
+ for dep in crate.dependencies
353
+ if (dep_id := dep.get("crate_id")) and dep_id in crate_names
354
+ ]
355
+ for crate in crates
356
+ }
357
+
358
+ reverse_deps: dict[str, list[str]] = {}
426
359
  for crate_name, deps in dependency_graph.items():
427
360
  for dep in deps:
428
- if dep not in reverse_deps:
429
- reverse_deps[dep] = []
430
- reverse_deps[dep].append(crate_name)
431
-
361
+ if dep: # Ensure dep is not None
362
+ reverse_deps.setdefault(dep, []).append(crate_name)
363
+
364
+ most_depended = sorted(
365
+ reverse_deps.items(), key=lambda item: len(item[1]), reverse=True
366
+ )[:10]
367
+
432
368
  return {
433
369
  "dependency_graph": dependency_graph,
434
370
  "reverse_dependencies": reverse_deps,
435
- "most_depended": sorted(reverse_deps.items(), key=lambda x: len(x[1]), reverse=True)[:10]
371
+ "most_depended": most_depended,
436
372
  }