rust-crate-pipeline 1.2.6__py3-none-any.whl → 1.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rust_crate_pipeline/__init__.py +15 -6
- rust_crate_pipeline/ai_processing.py +260 -153
- rust_crate_pipeline/analysis.py +171 -160
- rust_crate_pipeline/config.py +23 -3
- rust_crate_pipeline/github_token_checker.py +30 -20
- rust_crate_pipeline/main.py +107 -45
- rust_crate_pipeline/network.py +109 -108
- rust_crate_pipeline/pipeline.py +269 -125
- rust_crate_pipeline/production_config.py +15 -9
- rust_crate_pipeline/utils/file_utils.py +14 -10
- rust_crate_pipeline/utils/logging_utils.py +25 -13
- rust_crate_pipeline/version.py +47 -2
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/METADATA +94 -9
- rust_crate_pipeline-1.5.1.dist-info/RECORD +19 -0
- rust_crate_pipeline-1.2.6.dist-info/RECORD +0 -19
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/WHEEL +0 -0
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/entry_points.txt +0 -0
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/licenses/LICENSE +0 -0
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/top_level.txt +0 -0
rust_crate_pipeline/analysis.py
CHANGED
@@ -1,19 +1,42 @@
|
|
1
1
|
# analysis.py
|
2
2
|
import os
|
3
|
+
import sys
|
3
4
|
import re
|
4
5
|
import io
|
5
|
-
import json
|
6
6
|
import time
|
7
7
|
import tarfile
|
8
|
-
import tempfile
|
9
8
|
import subprocess
|
10
9
|
import requests
|
11
|
-
from datetime import datetime
|
12
|
-
from dateutil.relativedelta import relativedelta
|
13
10
|
from bs4 import BeautifulSoup
|
14
|
-
|
11
|
+
|
12
|
+
# Import utilities with fallback
|
13
|
+
try:
|
14
|
+
# Add the parent directory to the path to import utils
|
15
|
+
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
16
|
+
from utils.rust_code_analyzer import RustCodeAnalyzer
|
17
|
+
except ImportError:
|
18
|
+
# Fallback implementation for when utils are not available
|
19
|
+
class RustCodeAnalyzer:
|
20
|
+
def __init__(self, code_content):
|
21
|
+
self.code_content = code_content
|
22
|
+
|
23
|
+
def analyze(self):
|
24
|
+
return {
|
25
|
+
"functions": [],
|
26
|
+
"structs": [],
|
27
|
+
"enums": [],
|
28
|
+
"traits": [],
|
29
|
+
"complexity": 0,
|
30
|
+
"lines_of_code": len(self.code_content.split('\n'))
|
31
|
+
}
|
32
|
+
from typing import Dict, List
|
15
33
|
from .config import EnrichedCrate
|
16
34
|
|
35
|
+
# Import atomic utilities for code reuse
|
36
|
+
import sys
|
37
|
+
sys.path.append(os.path.dirname(os.path.dirname(__file__)))
|
38
|
+
|
39
|
+
|
17
40
|
class SourceAnalyzer:
|
18
41
|
@staticmethod
|
19
42
|
def analyze_crate_source(crate: EnrichedCrate) -> Dict:
|
@@ -21,18 +44,18 @@ class SourceAnalyzer:
|
|
21
44
|
crate_name = crate.name
|
22
45
|
version = crate.version
|
23
46
|
repo_url = crate.repository
|
24
|
-
|
47
|
+
|
25
48
|
# Method 1: Try to download from crates.io
|
26
49
|
try:
|
27
50
|
url = f"https://crates.io/api/v1/crates/{crate_name}/{version}/download"
|
28
51
|
response = requests.get(url, stream=True)
|
29
|
-
|
52
|
+
|
30
53
|
if response.ok:
|
31
54
|
# We got the tarball, analyze it
|
32
55
|
return SourceAnalyzer.analyze_crate_tarball(response.content)
|
33
56
|
except Exception as e:
|
34
57
|
print(f"Failed to download from crates.io: {str(e)}")
|
35
|
-
|
58
|
+
|
36
59
|
# Method 2: Try GitHub if we have a GitHub URL
|
37
60
|
if "github.com" in repo_url:
|
38
61
|
try:
|
@@ -40,59 +63,52 @@ class SourceAnalyzer:
|
|
40
63
|
match = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
|
41
64
|
if match:
|
42
65
|
owner, repo_name = match.groups()
|
43
|
-
repo_name = repo_name.split(
|
44
|
-
|
66
|
+
repo_name = repo_name.split(
|
67
|
+
'.')[0] # Remove .git extension
|
68
|
+
|
45
69
|
# Try to download tarball from GitHub
|
46
70
|
github_url = f"https://api.github.com/repos/{owner}/{repo_name}/tarball"
|
47
71
|
response = requests.get(github_url)
|
48
|
-
|
72
|
+
|
49
73
|
if response.ok:
|
50
|
-
return SourceAnalyzer.analyze_github_tarball(
|
74
|
+
return SourceAnalyzer.analyze_github_tarball(
|
75
|
+
response.content)
|
51
76
|
except Exception as e:
|
52
77
|
print(f"Failed to analyze from GitHub: {str(e)}")
|
53
|
-
|
78
|
+
|
54
79
|
# Method 3: Try lib.rs
|
55
80
|
try:
|
56
|
-
# lib.rs doesn't have a direct download API, but redirects to crates.io or
|
81
|
+
# lib.rs doesn't have a direct download API, but redirects to crates.io or
|
82
|
+
# GitHub
|
57
83
|
url = f"https://lib.rs/crates/{crate_name}"
|
58
84
|
response = requests.get(url)
|
59
|
-
|
85
|
+
|
60
86
|
if response.ok:
|
61
87
|
soup = BeautifulSoup(response.text, 'html.parser')
|
62
|
-
|
88
|
+
|
63
89
|
# Look for repository links
|
64
90
|
repo_links = soup.select('a[href*="github.com"]')
|
65
91
|
if repo_links:
|
66
92
|
repo_url = repo_links[0]['href']
|
67
|
-
|
93
|
+
|
68
94
|
# We found a GitHub link, now analyze it
|
69
|
-
return SourceAnalyzer.analyze_crate_source_from_repo(
|
95
|
+
return SourceAnalyzer.analyze_crate_source_from_repo(
|
96
|
+
crate_name, version, repo_url)
|
70
97
|
except Exception as e:
|
71
98
|
print(f"Failed to analyze from lib.rs: {str(e)}")
|
72
|
-
|
99
|
+
|
73
100
|
# If we get here, we failed to analyze from any source
|
74
101
|
return {
|
75
102
|
"error": "Could not analyze crate from any source",
|
76
103
|
"attempted_sources": ["crates.io", "github", "lib.rs"],
|
77
104
|
"file_count": 0,
|
78
105
|
"loc": 0
|
79
|
-
}
|
106
|
+
} @ staticmethod
|
80
107
|
|
81
|
-
@staticmethod
|
82
108
|
def analyze_crate_tarball(content: bytes) -> Dict:
|
83
|
-
"""Analyze a .crate tarball from crates.io"""
|
84
|
-
metrics =
|
85
|
-
|
86
|
-
"loc": 0,
|
87
|
-
"complexity": [],
|
88
|
-
"types": [],
|
89
|
-
"traits": [],
|
90
|
-
"functions": [],
|
91
|
-
"has_tests": False,
|
92
|
-
"has_examples": False,
|
93
|
-
"has_benchmarks": False
|
94
|
-
}
|
95
|
-
|
109
|
+
"""Analyze a .crate tarball from crates.io - refactored to use atomic utilities"""
|
110
|
+
metrics = RustCodeAnalyzer.create_empty_metrics()
|
111
|
+
|
96
112
|
try:
|
97
113
|
# Open the tar file from the content
|
98
114
|
tar_content = io.BytesIO(content)
|
@@ -100,14 +116,13 @@ class SourceAnalyzer:
|
|
100
116
|
# Get list of Rust files
|
101
117
|
rust_files = [f for f in tar.getnames() if f.endswith('.rs')]
|
102
118
|
metrics["file_count"] = len(rust_files)
|
103
|
-
|
104
|
-
# Check for test/example/bench directories
|
119
|
+
|
120
|
+
# Check for test/example/bench directories using atomic utility
|
105
121
|
all_files = tar.getnames()
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
# Analyze each Rust file
|
122
|
+
structure = RustCodeAnalyzer.detect_project_structure(
|
123
|
+
all_files)
|
124
|
+
|
125
|
+
# Analyze each Rust file using atomic utility
|
111
126
|
for filename in rust_files:
|
112
127
|
try:
|
113
128
|
member = tar.getmember(filename)
|
@@ -115,41 +130,25 @@ class SourceAnalyzer:
|
|
115
130
|
file_content = tar.extractfile(member)
|
116
131
|
if file_content:
|
117
132
|
content_str = file_content.read().decode('utf-8', errors='ignore')
|
118
|
-
|
119
|
-
#
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
trait_matches = re.findall(r'trait\s+([a-zA-Z0-9_]+)', content_str)
|
126
|
-
|
127
|
-
metrics["functions"].extend(fn_matches)
|
128
|
-
metrics["types"].extend(struct_matches)
|
129
|
-
metrics["traits"].extend(trait_matches)
|
133
|
+
|
134
|
+
# Use atomic content analysis
|
135
|
+
content_analysis = RustCodeAnalyzer.analyze_rust_content(
|
136
|
+
content_str)
|
137
|
+
metrics = RustCodeAnalyzer.aggregate_metrics(
|
138
|
+
metrics, content_analysis, structure)
|
139
|
+
|
130
140
|
except Exception as e:
|
131
141
|
print(f"Error analyzing file {filename}: {str(e)}")
|
132
|
-
|
142
|
+
|
133
143
|
except Exception as e:
|
134
144
|
metrics["error"] = str(e)
|
135
|
-
|
136
|
-
return metrics
|
137
145
|
|
138
|
-
@staticmethod
|
146
|
+
return metrics @ staticmethod
|
147
|
+
|
139
148
|
def analyze_github_tarball(content: bytes) -> Dict:
|
140
|
-
"""Analyze a GitHub tarball
|
141
|
-
metrics =
|
142
|
-
|
143
|
-
"loc": 0,
|
144
|
-
"complexity": [],
|
145
|
-
"types": [],
|
146
|
-
"traits": [],
|
147
|
-
"functions": [],
|
148
|
-
"has_tests": False,
|
149
|
-
"has_examples": False,
|
150
|
-
"has_benchmarks": False
|
151
|
-
}
|
152
|
-
|
149
|
+
"""Analyze a GitHub tarball - refactored to use atomic utilities"""
|
150
|
+
metrics = RustCodeAnalyzer.create_empty_metrics()
|
151
|
+
|
153
152
|
try:
|
154
153
|
# GitHub tarballs are typically gzipped tar files
|
155
154
|
tar_content = io.BytesIO(content)
|
@@ -158,14 +157,14 @@ class SourceAnalyzer:
|
|
158
157
|
# So we need to handle the different structure
|
159
158
|
rust_files = [f for f in tar.getnames() if f.endswith('.rs')]
|
160
159
|
metrics["file_count"] = len(rust_files)
|
161
|
-
|
162
|
-
# Check for test/example/bench directories
|
160
|
+
|
161
|
+
# Check for test/example/bench directories using atomic utility
|
163
162
|
all_files = tar.getnames()
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
#
|
163
|
+
structure = RustCodeAnalyzer.detect_project_structure(
|
164
|
+
all_files)
|
165
|
+
|
166
|
+
# Analyze each Rust file using atomic utility (same as crate
|
167
|
+
# tarball)
|
169
168
|
for filename in rust_files:
|
170
169
|
try:
|
171
170
|
member = tar.getmember(filename)
|
@@ -173,97 +172,101 @@ class SourceAnalyzer:
|
|
173
172
|
file_content = tar.extractfile(member)
|
174
173
|
if file_content:
|
175
174
|
content_str = file_content.read().decode('utf-8', errors='ignore')
|
176
|
-
|
177
|
-
#
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
trait_matches = re.findall(r'trait\s+([a-zA-Z0-9_]+)', content_str)
|
184
|
-
|
185
|
-
metrics["functions"].extend(fn_matches)
|
186
|
-
metrics["types"].extend(struct_matches)
|
187
|
-
metrics["traits"].extend(trait_matches)
|
175
|
+
|
176
|
+
# Use atomic content analysis
|
177
|
+
content_analysis = RustCodeAnalyzer.analyze_rust_content(
|
178
|
+
content_str)
|
179
|
+
metrics = RustCodeAnalyzer.aggregate_metrics(
|
180
|
+
metrics, content_analysis, structure)
|
181
|
+
|
188
182
|
except Exception as e:
|
189
183
|
print(f"Error analyzing file {filename}: {str(e)}")
|
190
|
-
|
184
|
+
|
191
185
|
except Exception as e:
|
192
186
|
metrics["error"] = str(e)
|
193
|
-
|
194
|
-
return metrics
|
195
187
|
|
196
|
-
@staticmethod
|
188
|
+
return metrics @ staticmethod
|
189
|
+
|
197
190
|
def analyze_local_directory(directory: str) -> Dict:
|
198
|
-
"""Analyze source code from a local directory"""
|
199
|
-
metrics =
|
200
|
-
|
201
|
-
"loc": 0,
|
202
|
-
"complexity": [],
|
203
|
-
"types": [],
|
204
|
-
"traits": [],
|
205
|
-
"functions": [],
|
206
|
-
"has_tests": False,
|
207
|
-
"has_examples": False,
|
208
|
-
"has_benchmarks": False
|
209
|
-
}
|
210
|
-
|
191
|
+
"""Analyze source code from a local directory - refactored to use atomic utilities"""
|
192
|
+
metrics = RustCodeAnalyzer.create_empty_metrics()
|
193
|
+
|
211
194
|
try:
|
212
195
|
# Find all Rust files
|
213
196
|
rust_files = []
|
214
197
|
for root, _, files in os.walk(directory):
|
215
198
|
if "target" in root or ".git" in root: # Skip build dirs and git
|
216
199
|
continue
|
217
|
-
rust_files.extend([os.path.join(root, f)
|
218
|
-
|
200
|
+
rust_files.extend([os.path.join(root, f)
|
201
|
+
for f in files if f.endswith(".rs")])
|
202
|
+
|
219
203
|
metrics["file_count"] = len(rust_files)
|
220
|
-
|
221
|
-
# Check if the crate has tests/examples/benchmarks
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
204
|
+
|
205
|
+
# Check if the crate has tests/examples/benchmarks using atomic
|
206
|
+
# utility
|
207
|
+
project_dirs = [
|
208
|
+
d for d in os.listdir(directory) if os.path.isdir(
|
209
|
+
os.path.join(
|
210
|
+
directory, d))]
|
211
|
+
structure = RustCodeAnalyzer.detect_project_structure(
|
212
|
+
project_dirs + ["tests", "examples", "benches"])
|
213
|
+
|
214
|
+
# Override with actual directory checks
|
215
|
+
structure["has_tests"] = any(
|
216
|
+
os.path.exists(
|
217
|
+
os.path.join(
|
218
|
+
directory,
|
219
|
+
d)) for d in [
|
220
|
+
"tests",
|
221
|
+
"test"])
|
222
|
+
structure["has_examples"] = os.path.exists(
|
223
|
+
os.path.join(directory, "examples"))
|
224
|
+
structure["has_benchmarks"] = os.path.exists(
|
225
|
+
os.path.join(directory, "benches"))
|
226
|
+
|
227
|
+
# Analyze each Rust file using atomic utility
|
228
228
|
for file_path in rust_files:
|
229
229
|
try:
|
230
230
|
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
|
231
231
|
content = f.read()
|
232
|
-
|
233
|
-
#
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
trait_matches = re.findall(r'trait\s+([a-zA-Z0-9_]+)', content)
|
240
|
-
|
241
|
-
metrics["functions"].extend(fn_matches)
|
242
|
-
metrics["types"].extend(struct_matches)
|
243
|
-
metrics["traits"].extend(trait_matches)
|
244
|
-
|
232
|
+
|
233
|
+
# Use atomic content analysis
|
234
|
+
content_analysis = RustCodeAnalyzer.analyze_rust_content(
|
235
|
+
content)
|
236
|
+
metrics = RustCodeAnalyzer.aggregate_metrics(
|
237
|
+
metrics, content_analysis, structure)
|
238
|
+
|
245
239
|
except Exception as e:
|
246
240
|
print(f"Error analyzing file {file_path}: {str(e)}")
|
247
|
-
|
241
|
+
|
248
242
|
except Exception as e:
|
249
243
|
metrics["error"] = str(e)
|
250
|
-
|
244
|
+
|
251
245
|
return metrics
|
252
246
|
|
253
247
|
@staticmethod
|
254
|
-
def analyze_crate_source_from_repo(
|
248
|
+
def analyze_crate_source_from_repo(
|
249
|
+
crate_name: str,
|
250
|
+
version: str,
|
251
|
+
repo_url: str) -> Dict:
|
255
252
|
"""Clone and analyze a crate's source code from repository"""
|
256
253
|
temp_dir = f"/tmp/rust_analysis/{crate_name}"
|
257
254
|
os.makedirs(temp_dir, exist_ok=True)
|
258
|
-
|
255
|
+
|
259
256
|
try:
|
260
257
|
# Clone repository
|
261
258
|
if not os.path.exists(f"{temp_dir}/.git"):
|
262
|
-
subprocess.run(["git",
|
263
|
-
|
264
|
-
|
259
|
+
subprocess.run(["git",
|
260
|
+
"clone",
|
261
|
+
"--depth=1",
|
262
|
+
repo_url,
|
263
|
+
temp_dir],
|
264
|
+
capture_output=True,
|
265
|
+
text=True,
|
266
|
+
check=True)
|
267
|
+
|
265
268
|
return SourceAnalyzer.analyze_local_directory(temp_dir)
|
266
|
-
|
269
|
+
|
267
270
|
except Exception as e:
|
268
271
|
return {
|
269
272
|
"error": f"Failed to clone and analyze repository: {str(e)}",
|
@@ -272,9 +275,10 @@ class SourceAnalyzer:
|
|
272
275
|
}
|
273
276
|
finally:
|
274
277
|
# Clean up (optional)
|
275
|
-
# subprocess.run(["rm", "-
|
278
|
+
# subprocess.run(["rm", "-r", temp_dir], capture_output=True)
|
276
279
|
pass
|
277
280
|
|
281
|
+
|
278
282
|
class SecurityAnalyzer:
|
279
283
|
@staticmethod
|
280
284
|
def check_security_metrics(crate: EnrichedCrate) -> Dict:
|
@@ -286,10 +290,10 @@ class SecurityAnalyzer:
|
|
286
290
|
"clippy_warnings": 0,
|
287
291
|
"test_coverage": None
|
288
292
|
}
|
289
|
-
|
293
|
+
|
290
294
|
crate_name = crate.name
|
291
295
|
version = crate.version
|
292
|
-
|
296
|
+
|
293
297
|
# Check RustSec Advisory Database
|
294
298
|
try:
|
295
299
|
# This would require the RustSec advisory database
|
@@ -302,7 +306,7 @@ class SecurityAnalyzer:
|
|
302
306
|
security_data["vulnerability_count"] = len(advisories)
|
303
307
|
except Exception:
|
304
308
|
pass
|
305
|
-
|
309
|
+
|
306
310
|
# Check for common security patterns in code
|
307
311
|
try:
|
308
312
|
# This would analyze the source code for unsafe blocks, etc.
|
@@ -311,9 +315,10 @@ class SecurityAnalyzer:
|
|
311
315
|
security_data["security_patterns"] = []
|
312
316
|
except Exception:
|
313
317
|
pass
|
314
|
-
|
318
|
+
|
315
319
|
return security_data
|
316
320
|
|
321
|
+
|
317
322
|
class UserBehaviorAnalyzer:
|
318
323
|
@staticmethod
|
319
324
|
def fetch_user_behavior_data(crate: EnrichedCrate) -> Dict:
|
@@ -324,24 +329,25 @@ class UserBehaviorAnalyzer:
|
|
324
329
|
"version_adoption": {},
|
325
330
|
"community_metrics": {}
|
326
331
|
}
|
327
|
-
|
332
|
+
|
328
333
|
crate_name = crate.name
|
329
334
|
repo_url = crate.repository
|
330
|
-
|
335
|
+
|
331
336
|
# Extract owner/repo from URL
|
332
337
|
if not repo_url or "github.com" not in repo_url:
|
333
338
|
return result
|
334
|
-
|
339
|
+
|
335
340
|
parts = repo_url.rstrip('/').split('/')
|
336
341
|
if len(parts) < 2:
|
337
342
|
return result
|
338
343
|
owner, repo = parts[-2], parts[-1]
|
339
|
-
|
344
|
+
|
340
345
|
# Setup GitHub API access - use token if available
|
341
346
|
headers = {"Accept": "application/vnd.github.v3+json"}
|
342
347
|
if os.environ.get("GITHUB_TOKEN"):
|
343
|
-
headers["Authorization"] = f"token {
|
344
|
-
|
348
|
+
headers["Authorization"] = f"token {
|
349
|
+
os.environ.get('GITHUB_TOKEN')}"
|
350
|
+
|
345
351
|
# Fetch recent issues and PRs
|
346
352
|
try:
|
347
353
|
# Get issues (last 30)
|
@@ -349,7 +355,7 @@ class UserBehaviorAnalyzer:
|
|
349
355
|
issues_resp = requests.get(issues_url, headers=headers)
|
350
356
|
if issues_resp.ok:
|
351
357
|
issues_data = issues_resp.json()
|
352
|
-
|
358
|
+
|
353
359
|
# Process issue data
|
354
360
|
for issue in issues_data:
|
355
361
|
if "pull_request" in issue:
|
@@ -372,18 +378,18 @@ class UserBehaviorAnalyzer:
|
|
372
378
|
"closed_at": issue["closed_at"],
|
373
379
|
"url": issue["html_url"]
|
374
380
|
})
|
375
|
-
|
381
|
+
|
376
382
|
# Fetch commit activity for the past year
|
377
383
|
commits_url = f"https://api.github.com/repos/{owner}/{repo}/stats/commit_activity"
|
378
384
|
commits_resp = requests.get(commits_url, headers=headers)
|
379
385
|
if commits_resp.ok:
|
380
386
|
result["community_metrics"]["commit_activity"] = commits_resp.json()
|
381
|
-
|
387
|
+
|
382
388
|
# Rate limiting - be nice to GitHub API
|
383
389
|
time.sleep(1)
|
384
390
|
except Exception as e:
|
385
391
|
print(f"Error fetching GitHub data: {str(e)}")
|
386
|
-
|
392
|
+
|
387
393
|
# Get version adoption data from crates.io
|
388
394
|
try:
|
389
395
|
versions_url = f"https://crates.io/api/v1/crates/{crate_name}/versions"
|
@@ -391,36 +397,37 @@ class UserBehaviorAnalyzer:
|
|
391
397
|
if versions_resp.ok:
|
392
398
|
versions_data = versions_resp.json()
|
393
399
|
versions = versions_data.get("versions", [])
|
394
|
-
|
400
|
+
|
395
401
|
# Process version data
|
396
402
|
for version in versions[:10]: # Top 10 versions
|
397
403
|
version_num = version["num"]
|
398
404
|
downloads = version["downloads"]
|
399
405
|
created_at = version["created_at"]
|
400
|
-
|
406
|
+
|
401
407
|
result["version_adoption"][version_num] = {
|
402
408
|
"downloads": downloads,
|
403
409
|
"created_at": created_at
|
404
410
|
}
|
405
411
|
except Exception as e:
|
406
412
|
print(f"Error fetching crates.io version data: {str(e)}")
|
407
|
-
|
413
|
+
|
408
414
|
return result
|
409
415
|
|
416
|
+
|
410
417
|
class DependencyAnalyzer:
|
411
418
|
@staticmethod
|
412
419
|
def analyze_dependencies(crates: List[EnrichedCrate]) -> Dict:
|
413
420
|
"""Analyze dependencies between crates"""
|
414
421
|
dependency_graph = {}
|
415
422
|
crate_names = {crate.name for crate in crates}
|
416
|
-
|
423
|
+
|
417
424
|
for crate in crates:
|
418
425
|
deps = []
|
419
426
|
for dep in crate.dependencies:
|
420
427
|
if dep.get("crate_id") in crate_names:
|
421
428
|
deps.append(dep.get("crate_id"))
|
422
429
|
dependency_graph[crate.name] = deps
|
423
|
-
|
430
|
+
|
424
431
|
# Find most depended-upon crates
|
425
432
|
reverse_deps = {}
|
426
433
|
for crate_name, deps in dependency_graph.items():
|
@@ -428,9 +435,13 @@ class DependencyAnalyzer:
|
|
428
435
|
if dep not in reverse_deps:
|
429
436
|
reverse_deps[dep] = []
|
430
437
|
reverse_deps[dep].append(crate_name)
|
431
|
-
|
438
|
+
|
432
439
|
return {
|
433
440
|
"dependency_graph": dependency_graph,
|
434
441
|
"reverse_dependencies": reverse_deps,
|
435
|
-
"most_depended": sorted(
|
436
|
-
|
442
|
+
"most_depended": sorted(
|
443
|
+
reverse_deps.items(),
|
444
|
+
key=lambda x: len(
|
445
|
+
x[1]),
|
446
|
+
reverse=True)[
|
447
|
+
:10]}
|
rust_crate_pipeline/config.py
CHANGED
@@ -1,11 +1,21 @@
|
|
1
1
|
# config.py
|
2
2
|
import os
|
3
|
+
import warnings
|
3
4
|
from dataclasses import dataclass, field
|
4
5
|
from typing import Optional, Dict, Any, List
|
5
6
|
|
7
|
+
# Filter Pydantic deprecation warnings from dependencies
|
8
|
+
# Rule Zero Compliance: Suppress third-party warnings while maintaining awareness
|
9
|
+
warnings.filterwarnings("ignore",
|
10
|
+
message=".*Support for class-based `config` is deprecated.*",
|
11
|
+
category=DeprecationWarning,
|
12
|
+
module="pydantic._internal._config")
|
13
|
+
|
14
|
+
|
6
15
|
@dataclass
|
7
16
|
class PipelineConfig:
|
8
|
-
model_path: str = os.path.expanduser(
|
17
|
+
model_path: str = os.path.expanduser(
|
18
|
+
"~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf")
|
9
19
|
max_tokens: int = 256
|
10
20
|
model_token_limit: int = 4096
|
11
21
|
prompt_token_margin: int = 3000
|
@@ -14,7 +24,12 @@ class PipelineConfig:
|
|
14
24
|
github_token: str = os.getenv("GITHUB_TOKEN", "")
|
15
25
|
cache_ttl: int = 3600 # 1 hour
|
16
26
|
batch_size: int = 10
|
17
|
-
n_workers: int = 4
|
27
|
+
n_workers: int = 4 # Enhanced scraping configuration
|
28
|
+
enable_crawl4ai: bool = True
|
29
|
+
crawl4ai_model: str = os.path.expanduser(
|
30
|
+
"~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf")
|
31
|
+
crawl4ai_timeout: int = 30
|
32
|
+
|
18
33
|
|
19
34
|
@dataclass
|
20
35
|
class CrateMetadata:
|
@@ -33,6 +48,11 @@ class CrateMetadata:
|
|
33
48
|
readme_sections: Dict[str, str] = field(default_factory=dict)
|
34
49
|
librs_downloads: Optional[int] = None
|
35
50
|
source: str = "crates.io"
|
51
|
+
# Enhanced scraping fields
|
52
|
+
enhanced_scraping: Dict[str, Any] = field(default_factory=dict)
|
53
|
+
enhanced_features: List[str] = field(default_factory=list)
|
54
|
+
enhanced_dependencies: List[str] = field(default_factory=list)
|
55
|
+
|
36
56
|
|
37
57
|
@dataclass
|
38
58
|
class EnrichedCrate(CrateMetadata):
|
@@ -43,4 +63,4 @@ class EnrichedCrate(CrateMetadata):
|
|
43
63
|
factual_counterfactual: Optional[str] = None
|
44
64
|
source_analysis: Optional[Dict[str, Any]] = None
|
45
65
|
user_behavior: Optional[Dict[str, Any]] = None
|
46
|
-
security: Optional[Dict[str, Any]] = None
|
66
|
+
security: Optional[Dict[str, Any]] = None
|