rust_crate_pipeline-1.2.6-py3-none-any.whl → rust_crate_pipeline-1.5.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rust_crate_pipeline/__init__.py +15 -6
- rust_crate_pipeline/ai_processing.py +260 -153
- rust_crate_pipeline/analysis.py +171 -160
- rust_crate_pipeline/config.py +23 -3
- rust_crate_pipeline/github_token_checker.py +30 -20
- rust_crate_pipeline/main.py +107 -45
- rust_crate_pipeline/network.py +109 -108
- rust_crate_pipeline/pipeline.py +269 -125
- rust_crate_pipeline/production_config.py +15 -9
- rust_crate_pipeline/utils/file_utils.py +14 -10
- rust_crate_pipeline/utils/logging_utils.py +25 -13
- rust_crate_pipeline/version.py +47 -2
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/METADATA +94 -9
- rust_crate_pipeline-1.5.1.dist-info/RECORD +19 -0
- rust_crate_pipeline-1.2.6.dist-info/RECORD +0 -19
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/WHEEL +0 -0
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/entry_points.txt +0 -0
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/licenses/LICENSE +0 -0
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/top_level.txt +0 -0
rust_crate_pipeline/network.py
CHANGED
```diff
@@ -1,40 +1,68 @@
 # network.py
 import os
+import sys
 import re
 import time
 import logging
 import requests
-from requests_cache import CachedSession
 from bs4 import BeautifulSoup
 from typing import Dict, List, Optional
 from .config import PipelineConfig
 
+# Import utilities with fallback
+try:
+    # Add the parent directory to the path to import utils
+    sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
+    from utils.http_client_utils import HTTPClientUtils, MetadataExtractor
+except ImportError:
+    # Fallback implementations for when utils are not available
+    class HTTPClientUtils:
+        def __init__(self):
+            pass
+
+    class MetadataExtractor:
+        def __init__(self):
+            pass
+
+    # Import atomic utilities for code reuse
+    import sys
+    sys.path.append(os.path.dirname(os.path.dirname(__file__)))
+
+
 class GitHubBatchClient:
     def __init__(self, config: PipelineConfig):
         self.config = config
-
+        # Simple headers without dependency on HTTPClientUtils
+        self.headers = {
+            "Accept": "application/vnd.github.v3+json",
+            "User-Agent": "SigilDERG-Data-Production/1.0"
+        }
         if config.github_token:
             self.headers["Authorization"] = f"token {config.github_token}"
-
-
-
-
-            )
+
+        # Simple session without dependency on HTTPClientUtils
+        self.session = requests.Session()
+        self.session.headers.update(self.headers)
         self.remaining_calls = 5000
         self.reset_time = 0
 
     def check_rate_limit(self):
         """Check and update current rate limit status"""
         try:
-            response = self.session.get(
+            response = self.session.get(
+                "https://api.github.com/rate_limit",
+                headers=self.headers)
             if response.ok:
                 data = response.json()
                 self.remaining_calls = data["resources"]["core"]["remaining"]
                 self.reset_time = data["resources"]["core"]["reset"]
-
+
                 if self.remaining_calls < 100:
                     reset_in = self.reset_time - time.time()
-                    logging.warning(
+                    logging.warning(
+                        f"GitHub API rate limit low: {
+                            self.remaining_calls} remaining. Resets in {
+                            reset_in / 60:.1f} minutes")
         except Exception:
             pass
 
```
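The rewritten `check_rate_limit` reduces to a single GET against GitHub's `/rate_limit` endpoint. A minimal standalone sketch of the same check (the function name and session setup here are illustrative; the 100-call threshold and the `resources.core` fields mirror the diff):

```python
import time
import logging
import requests

def check_github_rate_limit(session: requests.Session) -> None:
    """Warn when the GitHub core-API quota is nearly exhausted."""
    resp = session.get("https://api.github.com/rate_limit")
    if resp.ok:
        core = resp.json()["resources"]["core"]
        remaining, reset_at = core["remaining"], core["reset"]
        if remaining < 100:  # same threshold the pipeline uses
            minutes = (reset_at - time.time()) / 60
            logging.warning(
                "GitHub rate limit low: %d calls left, resets in %.1f min",
                remaining, minutes)

# Works unauthenticated too, just against the lower anonymous quota.
check_github_rate_limit(requests.Session())
```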
```diff
@@ -46,7 +74,8 @@ class GitHubBatchClient:
             if response.ok:
                 return response.json()
             else:
-                logging.warning(
+                logging.warning(
+                    f"Failed to get repo stats for {owner}/{repo}: {response.status_code}")
                 return {}
         except Exception as e:
             logging.error(f"Error fetching repo stats: {str(e)}")
@@ -55,38 +84,46 @@ class GitHubBatchClient:
     def batch_get_repo_stats(self, repo_list: List[str]) -> Dict[str, Dict]:
         """Get statistics for multiple repositories in a batch"""
         self.check_rate_limit()
-
+
         results = {}
         for repo_url in repo_list:
             # Extract owner/repo from URL
             match = re.search(r"github\.com/([^/]+)/([^/\.]+)", repo_url)
             if not match:
                 continue
-
+
             owner, repo = match.groups()
             repo = repo.split('.')[0]  # Remove .git extension if present
-
+
             # Get stats
             stats = self.get_repo_stats(owner, repo)
             results[repo_url] = stats
-
+
             # Be nice to GitHub API
             time.sleep(0.1)
-
         return results
 
+
 class CrateAPIClient:
     def __init__(self, config: PipelineConfig):
         self.config = config
-
-
+        # Simple session without dependency on HTTPClientUtils
+        self.session = requests.Session()
+        self.session.headers.update({
+            "User-Agent": "SigilDERG-Data-Production/1.0"
+        })
+
     def fetch_crate_metadata(self, crate_name: str) -> Optional[Dict]:
         """Fetch metadata with retry logic"""
         for attempt in range(self.config.max_retries):
             try:
                 return self._fetch_metadata(crate_name)
             except Exception as e:
-                logging.warning(
+                logging.warning(
+                    f"Attempt {
+                        attempt +
+                        1} failed for {crate_name}: {
+                        str(e)}")
                 wait = 2 ** attempt
                 time.sleep(wait)
         return None
```
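`fetch_crate_metadata` keeps its retry loop: `_fetch_metadata` is attempted `max_retries` times with an exponential backoff of `2 ** attempt` seconds. The same pattern in isolation (a sketch; `fetch_with_retry` is a hypothetical name, not part of the package):

```python
import time
import logging

def fetch_with_retry(fetch, name: str, max_retries: int = 3):
    """Call fetch(name), retrying with exponential backoff like fetch_crate_metadata."""
    for attempt in range(max_retries):
        try:
            return fetch(name)
        except Exception as e:
            logging.warning("Attempt %d failed for %s: %s", attempt + 1, name, e)
            time.sleep(2 ** attempt)  # waits 1s, 2s, 4s, ...
    return None  # all attempts failed
```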
```diff
@@ -95,71 +132,77 @@ class CrateAPIClient:
         """Enhanced metadata fetching that tries multiple sources"""
         # First try crates.io (primary source)
         try:
-            r = self.session.get(
+            r = self.session.get(
+                f"https://crates.io/api/v1/crates/{crate_name}")
             if r.ok:
                 data = r.json()
                 crate_data = data["crate"]
                 latest = crate_data["newest_version"]
-
+
                 # Get readme
-                readme_response = self.session.get(
+                readme_response = self.session.get(
+                    f"https://crates.io/api/v1/crates/{crate_name}/readme")
                 readme = readme_response.text if readme_response.ok else ""
-
+
                 # Get dependencies
-                deps_response = self.session.get(
+                deps_response = self.session.get(
+                    f"https://crates.io/api/v1/crates/{crate_name}/{latest}/dependencies")
                 deps = deps_response.json().get("dependencies", []) if deps_response.ok else []
-
+
                 # Get features - using the versions endpoint
                 features = []
-                versions_response = self.session.get(
+                versions_response = self.session.get(
+                    f"https://crates.io/api/v1/crates/{crate_name}/{latest}")
                 if versions_response.ok:
                     version_data = versions_response.json().get("version", {})
                     features_dict = version_data.get("features", {})
-                    features = [{"name": k, "dependencies": v}
-
+                    features = [{"name": k, "dependencies": v}
+                                for k, v in features_dict.items()]
+
                 # Repository info and GitHub stars
                 repo = crate_data.get("repository", "")
                 gh_stars = 0
-
+
                 # Check if it's a GitHub repo
                 if "github.com" in repo and self.config.github_token:
                     match = re.search(r"github.com/([^/]+)/([^/]+)", repo)
                     if match:
                         owner, repo_name = match.groups()
-                        repo_name = repo_name.split(
+                        repo_name = repo_name.split(
+                            '.')[0]  # Handle .git extensions
                         gh_url = f"https://api.github.com/repos/{owner}/{repo_name}"
-                        gh_headers = {
+                        gh_headers = {
+                            "Authorization": f"token {
+                                self.config.github_token}"} if self.config.github_token else {}
                         gh = self.session.get(gh_url, headers=gh_headers)
                         if gh.ok:
                             gh_data = gh.json()
                             gh_stars = gh_data.get("stargazers_count", 0)
-
+
                 # Check if it's hosted on lib.rs
                 lib_rs_data = {}
                 if "lib.rs" in repo:
                     lib_rs_url = f"https://lib.rs/crates/{crate_name}"
                     lib_rs_response = self.session.get(lib_rs_url)
                     if lib_rs_response.ok:
-                        soup = BeautifulSoup(
+                        soup = BeautifulSoup(
+                            lib_rs_response.text, 'html.parser')
                         # Get README from lib.rs if not already available
                         if not readme:
                             readme_div = soup.find('div', class_='readme')
                             if readme_div:
-                                readme = readme_div.get_text(strip=True)
-
-                        # Get lib.rs specific stats
+                                readme = readme_div.get_text(strip=True)  # Get lib.rs specific stats
                         stats_div = soup.find('div', class_='crate-stats')
                         if stats_div:
-                            downloads_text = stats_div.find(
-
-                                lib_rs_data["librs_downloads"] = int(
-
-
-
-
-                            #
-
-
+                            downloads_text = stats_div.find(
+                                string=re.compile(r'[\d,]+ downloads'))
+                            if downloads_text: lib_rs_data["librs_downloads"] = int(
+                                re.sub(r'[^\d]', '', str(downloads_text)))
+
+                # Extract code snippets and sections (simplified)
+                code_snippets = []  # Simplified - would normally extract from readme
+                readme_sections = {}  # Simplified - would normally parse sections
+
                 result = {
                     "name": crate_name,
                     "version": latest,
```
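The crates.io branch above chains three public API endpoints: the crate record, its README, and the dependency list for the newest version. A condensed sketch of that sequence, assuming a plain `requests` session with a placeholder User-Agent (error handling and the GitHub/lib.rs fallbacks omitted):

```python
import requests

def crates_io_summary(crate_name: str) -> dict:
    """Fetch crate record, README, and dependencies from crates.io, as the diff does."""
    s = requests.Session()
    s.headers.update({"User-Agent": "example-client/0.1"})  # placeholder UA, not the package's
    crate = s.get(f"https://crates.io/api/v1/crates/{crate_name}").json()["crate"]
    latest = crate["newest_version"]
    readme = s.get(f"https://crates.io/api/v1/crates/{crate_name}/readme").text
    deps_resp = s.get(f"https://crates.io/api/v1/crates/{crate_name}/{latest}/dependencies")
    deps = deps_resp.json().get("dependencies", []) if deps_resp.ok else []
    return {"name": crate_name, "version": latest, "readme": readme, "dependencies": deps}

print(crates_io_summary("serde")["version"])
```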
```diff
@@ -176,33 +219,36 @@ class CrateAPIClient:
                     "readme_sections": readme_sections,
                     **lib_rs_data
                 }
-
+
                 return result
-
+
         except Exception as e:
-            logging.error(
+            logging.error(
+                f"Failed fetching metadata for {crate_name}: {
+                    str(e)}")
             raise
-
+
         # If crates.io fails, try lib.rs
         try:
             r = self.session.get(f"https://lib.rs/crates/{crate_name}")
             if r.ok:
                 soup = BeautifulSoup(r.text, 'html.parser')
-
+
                 # Extract metadata from lib.rs page
-                name = soup.select_one('h1').text.strip(
-
+                name = soup.select_one('h1').text.strip(
+                ) if soup.select_one('h1') else crate_name
+
                 # Find description
                 desc_elem = soup.select_one('.description')
                 description = desc_elem.text.strip() if desc_elem else ""
-
+
                 # Find repository link
                 repo_link = None
                 for a in soup.select('a'):
-                    if 'github.com' in a.get('
+                    if 'github.com' in a.get('href', ''):
                         repo_link = a['href']
                         break
-
+
                 # Basic metadata from lib.rs
                 return {
                     "name": name,
@@ -222,22 +268,23 @@ class CrateAPIClient:
                 }
         except Exception:
             pass
-
+
         # Finally, try GitHub search
         try:
-            # This is a simplification - GitHub's search API requires
+            # This is a simplification - GitHub's search API requires
+            # authentication
             headers = {}
             if self.config.github_token:
                 headers["Authorization"] = f"token {self.config.github_token}"
-
+
             search_url = f"https://api.github.com/search/repositories?q={crate_name}+language:rust"
             r = requests.get(search_url, headers=headers)
-
+
             if r.ok:
                 results = r.json().get("items", [])
                 if results:
                     repo = results[0]  # Take first match
-
+
                     # Basic metadata from GitHub
                     return {
                         "name": crate_name,
@@ -249,8 +296,7 @@ class CrateAPIClient:
                         "readme": "",
                         "downloads": 0,
                         "github_stars": repo.get("stargazers_count", 0),
-                        "dependencies": [],
-                        "code_snippets": [],
+                        "dependencies": [], "code_snippets": [],
                         "features": [],
                         "readme_sections": {},
                         "source": "github",
@@ -260,48 +306,3 @@ class CrateAPIClient:
 
         # If all sources fail
         return None
-
-    def extract_code_snippets(self, readme: str) -> List[str]:
-        """Extract code snippets from markdown README"""
-        snippets = []
-        if not readme:
-            return snippets
-
-        # Find Rust code blocks
-        pattern = r"```(?:rust|(?:no_run|ignore|compile_fail|mdbook-runnable)?)\s*([\s\S]*?)```"
-        matches = re.findall(pattern, readme)
-
-        for code in matches:
-            if len(code.strip()) > 10:  # Only include non-trivial snippets
-                snippets.append(code.strip())
-
-        return snippets[:5]  # Limit to 5 snippets
-
-    def extract_readme_sections(self, readme: str) -> Dict[str, str]:
-        """Extract sections from README based on markdown headers"""
-        if not readme:
-            return {}
-
-        sections = {}
-        lines = readme.split('\n')
-        current_section = ""
-        current_content = []
-
-        for line in lines:
-            if re.match(r'^#+\s+', line):  # It's a header
-                # Save previous section
-                if current_section and current_content:
-                    sections[current_section] = '\n'.join(current_content).strip()
-
-                # Start new section
-                current_section = re.sub(r'^#+\s+', '', line).strip()
-                current_content = []
-            else:
-                if current_section:  # Only collect content if we have a section
-                    current_content.append(line)
-
-        # Don't forget the last section
-        if current_section and current_content:
-            sections[current_section] = '\n'.join(current_content).strip()
-
-        return sections
```