rust-crate-pipeline 1.2.5-py3-none-any.whl → 1.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rust_crate_pipeline/__init__.py +25 -25
- rust_crate_pipeline/__main__.py +1 -0
- rust_crate_pipeline/ai_processing.py +309 -200
- rust_crate_pipeline/analysis.py +304 -368
- rust_crate_pipeline/azure_ai_processing.py +453 -0
- rust_crate_pipeline/config.py +57 -19
- rust_crate_pipeline/core/__init__.py +19 -0
- rust_crate_pipeline/core/canon_registry.py +133 -0
- rust_crate_pipeline/core/irl_engine.py +256 -0
- rust_crate_pipeline/core/sacred_chain.py +117 -0
- rust_crate_pipeline/crate_analysis.py +54 -0
- rust_crate_pipeline/crate_list.txt +424 -0
- rust_crate_pipeline/github_token_checker.py +42 -36
- rust_crate_pipeline/main.py +386 -102
- rust_crate_pipeline/network.py +153 -133
- rust_crate_pipeline/pipeline.py +340 -264
- rust_crate_pipeline/production_config.py +35 -32
- rust_crate_pipeline/scraping/__init__.py +13 -0
- rust_crate_pipeline/scraping/unified_scraper.py +259 -0
- rust_crate_pipeline/unified_llm_processor.py +637 -0
- rust_crate_pipeline/unified_pipeline.py +548 -0
- rust_crate_pipeline/utils/file_utils.py +45 -14
- rust_crate_pipeline/utils/logging_utils.py +34 -17
- rust_crate_pipeline/version.py +47 -2
- rust_crate_pipeline-1.3.0.dist-info/METADATA +331 -0
- rust_crate_pipeline-1.3.0.dist-info/RECORD +30 -0
- rust_crate_pipeline-1.2.5.dist-info/METADATA +0 -573
- rust_crate_pipeline-1.2.5.dist-info/RECORD +0 -19
- {rust_crate_pipeline-1.2.5.dist-info → rust_crate_pipeline-1.3.0.dist-info}/WHEEL +0 -0
- {rust_crate_pipeline-1.2.5.dist-info → rust_crate_pipeline-1.3.0.dist-info}/entry_points.txt +0 -0
- {rust_crate_pipeline-1.2.5.dist-info → rust_crate_pipeline-1.3.0.dist-info}/licenses/LICENSE +0 -0
- {rust_crate_pipeline-1.2.5.dist-info → rust_crate_pipeline-1.3.0.dist-info}/top_level.txt +0 -0
rust_crate_pipeline/network.py
CHANGED
@@ -1,44 +1,57 @@
 # network.py
 import os
+import sys
 import re
 import time
 import logging
 import requests
-from
-from
-from typing import Dict, List, Optional
+from bs4 import BeautifulSoup, Tag
+from typing import Any, Union
 from .config import PipelineConfig
 
+# Import utilities
+# Add the parent directory to the path to import utils
+sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
+
+
 class GitHubBatchClient:
-    def __init__(self, config: PipelineConfig):
+    def __init__(self, config: PipelineConfig) -> None:
         self.config = config
-
+        # Simple headers without dependency on HTTPClientUtils
+        self.headers = {
+            "Accept": "application/vnd.github.v3+json",
+            "User-Agent": "SigilDERG-Data-Production/1.0",
+        }
         if config.github_token:
             self.headers["Authorization"] = f"token {config.github_token}"
-
-
-
-
-            )
+
+        # Simple session without dependency on HTTPClientUtils
+        self.session = requests.Session()
+        self.session.headers.update(self.headers)
         self.remaining_calls = 5000
         self.reset_time = 0
 
-    def check_rate_limit(self):
+    def check_rate_limit(self) -> None:
         """Check and update current rate limit status"""
         try:
-            response = self.session.get(
+            response = self.session.get(
+                "https://api.github.com/rate_limit", headers=self.headers
+            )
             if response.ok:
                 data = response.json()
                 self.remaining_calls = data["resources"]["core"]["remaining"]
                 self.reset_time = data["resources"]["core"]["reset"]
-
+
                 if self.remaining_calls < 100:
                     reset_in = self.reset_time - time.time()
-                    logging.warning(
+                    logging.warning(
+                        f"GitHub API rate limit low: {self.remaining_calls} remaining. "
+                        f"Resets in {reset_in / 60:.1f} minutes"
+                    )
         except Exception:
             pass
 
-    def get_repo_stats(self, owner: str, repo: str) ->
+    def get_repo_stats(self, owner: str, repo: str) -> dict[str, Any]:
         """Get repository statistics"""
         try:
             url = f"https://api.github.com/repos/{owner}/{repo}"
@@ -46,52 +59,62 @@ class GitHubBatchClient:
             if response.ok:
                 return response.json()
             else:
-                logging.warning(
+                logging.warning(
+                    f"Failed to get repo stats for {owner}/{repo}: "
+                    f"{response.status_code}"
+                )
                 return {}
         except Exception as e:
             logging.error(f"Error fetching repo stats: {str(e)}")
             return {}
 
-    def batch_get_repo_stats(self, repo_list:
+    def batch_get_repo_stats(self, repo_list: list[str]) -> dict[str, dict[str, Any]]:
         """Get statistics for multiple repositories in a batch"""
         self.check_rate_limit()
-
-        results = {}
+
+        results: dict[str, dict[str, Any]] = {}
         for repo_url in repo_list:
             # Extract owner/repo from URL
             match = re.search(r"github\.com/([^/]+)/([^/\.]+)", repo_url)
             if not match:
                 continue
-
+
             owner, repo = match.groups()
-            repo = repo.split(
-
+            repo = repo.split(".")[0]  # Remove .git extension if present
+
             # Get stats
             stats = self.get_repo_stats(owner, repo)
             results[repo_url] = stats
-
+
             # Be nice to GitHub API
             time.sleep(0.1)
-
         return results
 
+
 class CrateAPIClient:
-    def __init__(self, config: PipelineConfig):
+    def __init__(self, config: PipelineConfig) -> None:
         self.config = config
-
-
-
+        # Simple session without dependency on HTTPClientUtils
+        self.session = requests.Session()
+        self.session.headers.update({"User-Agent": "SigilDERG-Data-Production/1.0"})
+
+    def fetch_crate_metadata(self, crate_name: str) -> dict[str, Any] | None:
         """Fetch metadata with retry logic"""
         for attempt in range(self.config.max_retries):
             try:
                 return self._fetch_metadata(crate_name)
             except Exception as e:
-                logging.warning(
-
+                logging.warning(
+                    f"Attempt {
+                        attempt +
+                        1} failed for {crate_name}: {
+                        str(e)}"
+                )
+                wait = 2**attempt
                 time.sleep(wait)
         return None
 
-    def _fetch_metadata(self, crate_name: str) ->
+    def _fetch_metadata(self, crate_name: str) -> dict[str, Any] | None:
         """Enhanced metadata fetching that tries multiple sources"""
         # First try crates.io (primary source)
         try:
@@ -100,67 +123,92 @@ class CrateAPIClient:
                 data = r.json()
                 crate_data = data["crate"]
                 latest = crate_data["newest_version"]
-
+
                 # Get readme
-                readme_response = self.session.get(
+                readme_response = self.session.get(
+                    f"https://crates.io/api/v1/crates/{crate_name}/readme"
+                )
                 readme = readme_response.text if readme_response.ok else ""
-
+
                 # Get dependencies
-
-
-
+                deps_url = (
+                    f"https://crates.io/api/v1/crates/{crate_name}/"
+                    f"{latest}/dependencies"
+                )
+                deps_response = self.session.get(deps_url)
+                deps: list[dict[str, Any]] = (
+                    deps_response.json().get("dependencies", [])
+                    if deps_response.ok
+                    else []
+                )
+
                 # Get features - using the versions endpoint
                 features = []
-                versions_response = self.session.get(
+                versions_response = self.session.get(
+                    f"https://crates.io/api/v1/crates/{crate_name}/{latest}"
+                )
                 if versions_response.ok:
                     version_data = versions_response.json().get("version", {})
                     features_dict = version_data.get("features", {})
-                    features = [
-
+                    features = [
+                        {"name": k, "dependencies": v} for k, v in features_dict.items()
+                    ]
+
                 # Repository info and GitHub stars
                 repo = crate_data.get("repository", "")
                 gh_stars = 0
-
+
                 # Check if it's a GitHub repo
                 if "github.com" in repo and self.config.github_token:
                     match = re.search(r"github.com/([^/]+)/([^/]+)", repo)
                     if match:
                         owner, repo_name = match.groups()
-                        repo_name = repo_name.split(
+                        repo_name = repo_name.split(".")[0]  # Handle .git extensions
                         gh_url = f"https://api.github.com/repos/{owner}/{repo_name}"
-                        gh_headers
+                        gh_headers: dict[str, str] = {}
+                        if self.config.github_token:
+                            gh_headers["Authorization"] = (
+                                f"token {self.config.github_token}"
+                            )
+
                         gh = self.session.get(gh_url, headers=gh_headers)
                         if gh.ok:
                             gh_data = gh.json()
                             gh_stars = gh_data.get("stargazers_count", 0)
-
+
                 # Check if it's hosted on lib.rs
                 lib_rs_data = {}
                 if "lib.rs" in repo:
                     lib_rs_url = f"https://lib.rs/crates/{crate_name}"
                     lib_rs_response = self.session.get(lib_rs_url)
                     if lib_rs_response.ok:
-                        soup = BeautifulSoup(lib_rs_response.text,
+                        soup = BeautifulSoup(lib_rs_response.text, "html.parser")
                         # Get README from lib.rs if not already available
                         if not readme:
-                            readme_div = soup.find(
+                            readme_div = soup.find("div", class_="readme")
                             if readme_div:
-                                readme = readme_div.get_text(
-
-
-                        stats_div = soup.find(
-                        if stats_div:
-                            downloads_text = stats_div.find(
+                                readme = readme_div.get_text(
+                                    strip=True
+                                )  # Get lib.rs specific stats
+                        stats_div = soup.find("div", class_="crate-stats")
+                        if isinstance(stats_div, Tag):
+                            downloads_text = stats_div.find(
+                                string=re.compile(r"[\d,]+ downloads")
+                            )
                             if downloads_text:
-                                lib_rs_data["librs_downloads"] = int(
-
-
-
-
-
-
-
-
+                                lib_rs_data["librs_downloads"] = int(
+                                    re.sub(r"[^\d]", "", str(downloads_text))
+                                )
+
+                # Extract code snippets and sections (simplified)
+                code_snippets: list[str] = (
+                    []
+                )  # Simplified - would normally extract from readme
+                readme_sections: dict[str, str] = (
+                    {}
+                )  # Simplified - would normally parse sections
+
+                result: dict[str, Any] = {
                     "name": crate_name,
                     "version": latest,
                     "description": crate_data.get("description", ""),
@@ -174,42 +222,55 @@ class CrateAPIClient:
                     "code_snippets": code_snippets,
                     "features": features,
                     "readme_sections": readme_sections,
-                    **lib_rs_data
+                    **lib_rs_data,
                 }
-
+
                 return result
-
+
         except Exception as e:
-            logging.error(
+            logging.error(
+                f"Failed fetching metadata for {crate_name}: {
+                    str(e)}"
+            )
             raise
-
+
         # If crates.io fails, try lib.rs
         try:
             r = self.session.get(f"https://lib.rs/crates/{crate_name}")
             if r.ok:
-                soup = BeautifulSoup(r.text,
-
+                soup = BeautifulSoup(r.text, "html.parser")
+
                 # Extract metadata from lib.rs page
-
-
+                h1 = soup.select_one("h1")
+                name = h1.text.strip() if h1 else crate_name
+
                 # Find description
-                desc_elem = soup.select_one(
+                desc_elem = soup.select_one(".description")
                 description = desc_elem.text.strip() if desc_elem else ""
-
+
                 # Find repository link
-                repo_link = None
-                for a in soup.select(
-
-
+                repo_link: Union[str, None] = None
+                for a in soup.select("a"):
+                    href = a.get("href")
+                    if href and isinstance(href, str) and "github.com" in href:
+                        repo_link = href
                         break
-
+
+                # Find keywords
+                keywords_elem = soup.select_one(".keywords")
+                keywords = (
+                    [k.text.strip() for k in keywords_elem.find_all("a")]
+                    if keywords_elem
+                    else []
+                )
+
                 # Basic metadata from lib.rs
                 return {
                     "name": name,
                     "version": "latest",  # lib.rs doesn't easily expose version
                     "description": description,
                     "repository": repo_link or "",
-                    "keywords":
+                    "keywords": keywords,
                     "categories": [],
                     "readme": "",
                     "downloads": 0,
@@ -222,22 +283,26 @@ class CrateAPIClient:
                 }
         except Exception:
             pass
-
+
         # Finally, try GitHub search
         try:
-            # This is a simplification - GitHub's search API requires
-
+            # This is a simplification - GitHub's search API requires
+            # authentication
+            gh_search_headers: dict[str, str] = {}
             if self.config.github_token:
-
-
-            search_url =
-
-
+                gh_search_headers["Authorization"] = f"token {self.config.github_token}"
+
+            search_url = (
+                f"https://api.github.com/search/repositories?"
+                f"q={crate_name}+language:rust"
+            )
+            r = requests.get(search_url, headers=gh_search_headers)
+
             if r.ok:
                 results = r.json().get("items", [])
                 if results:
                     repo = results[0]  # Take first match
-
+
                     # Basic metadata from GitHub
                     return {
                         "name": crate_name,
@@ -257,51 +322,6 @@ class CrateAPIClient:
                     }
         except Exception:
             pass
-
+
         # If all sources fail
         return None
-
-    def extract_code_snippets(self, readme: str) -> List[str]:
-        """Extract code snippets from markdown README"""
-        snippets = []
-        if not readme:
-            return snippets
-
-        # Find Rust code blocks
-        pattern = r"```(?:rust|(?:no_run|ignore|compile_fail|mdbook-runnable)?)\s*([\s\S]*?)```"
-        matches = re.findall(pattern, readme)
-
-        for code in matches:
-            if len(code.strip()) > 10:  # Only include non-trivial snippets
-                snippets.append(code.strip())
-
-        return snippets[:5]  # Limit to 5 snippets
-
-    def extract_readme_sections(self, readme: str) -> Dict[str, str]:
-        """Extract sections from README based on markdown headers"""
-        if not readme:
-            return {}
-
-        sections = {}
-        lines = readme.split('\n')
-        current_section = ""
-        current_content = []
-
-        for line in lines:
-            if re.match(r'^#+\s+', line):  # It's a header
-                # Save previous section
-                if current_section and current_content:
-                    sections[current_section] = '\n'.join(current_content).strip()
-
-                # Start new section
-                current_section = re.sub(r'^#+\s+', '', line).strip()
-                current_content = []
-            else:
-                if current_section:  # Only collect content if we have a section
-                    current_content.append(line)
-
-        # Don't forget the last section
-        if current_section and current_content:
-            sections[current_section] = '\n'.join(current_content).strip()
-
-        return sections