rust-crate-pipeline 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rust_crate_pipeline/__init__.py +18 -27
- rust_crate_pipeline/__main__.py +1 -0
- rust_crate_pipeline/ai_processing.py +718 -596
- rust_crate_pipeline/analysis.py +330 -363
- rust_crate_pipeline/azure_ai_processing.py +462 -0
- rust_crate_pipeline/config.py +46 -28
- rust_crate_pipeline/core/__init__.py +19 -0
- rust_crate_pipeline/core/canon_registry.py +133 -0
- rust_crate_pipeline/core/irl_engine.py +256 -0
- rust_crate_pipeline/core/sacred_chain.py +117 -0
- rust_crate_pipeline/crate_analysis.py +54 -0
- rust_crate_pipeline/crate_list.txt +424 -0
- rust_crate_pipeline/github_token_checker.py +108 -112
- rust_crate_pipeline/main.py +329 -109
- rust_crate_pipeline/network.py +317 -308
- rust_crate_pipeline/pipeline.py +300 -375
- rust_crate_pipeline/production_config.py +24 -27
- rust_crate_pipeline/progress_monitor.py +334 -0
- rust_crate_pipeline/scraping/__init__.py +13 -0
- rust_crate_pipeline/scraping/unified_scraper.py +259 -0
- rust_crate_pipeline/unified_llm_processor.py +637 -0
- rust_crate_pipeline/unified_pipeline.py +548 -0
- rust_crate_pipeline/utils/file_utils.py +32 -5
- rust_crate_pipeline/utils/logging_utils.py +21 -16
- rust_crate_pipeline/version.py +76 -47
- rust_crate_pipeline-1.4.1.dist-info/METADATA +515 -0
- rust_crate_pipeline-1.4.1.dist-info/RECORD +31 -0
- rust_crate_pipeline-1.4.0.dist-info/METADATA +0 -585
- rust_crate_pipeline-1.4.0.dist-info/RECORD +0 -19
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/WHEEL +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/entry_points.txt +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/licenses/LICENSE +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/top_level.txt +0 -0
rust_crate_pipeline/network.py
CHANGED
@@ -1,308 +1,317 @@
|
|
1
|
-
# network.py
|
2
|
-
import os
|
3
|
-
import
|
4
|
-
import
|
5
|
-
import time
|
6
|
-
import logging
|
7
|
-
import requests
|
8
|
-
from
|
9
|
-
from
|
10
|
-
from .config import PipelineConfig
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
def
|
50
|
-
"""
|
51
|
-
try:
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
def
|
109
|
-
|
110
|
-
#
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
#
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
if
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
"
|
208
|
-
"
|
209
|
-
"
|
210
|
-
"
|
211
|
-
"
|
212
|
-
"
|
213
|
-
"
|
214
|
-
"
|
215
|
-
"
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
#
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
# Find
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
"
|
260
|
-
"
|
261
|
-
"
|
262
|
-
"
|
263
|
-
"
|
264
|
-
"
|
265
|
-
"
|
266
|
-
"
|
267
|
-
"
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
"
|
299
|
-
"
|
300
|
-
"
|
301
|
-
"
|
302
|
-
"
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
1
|
+
# network.py
|
2
|
+
import os
|
3
|
+
import re
|
4
|
+
import sys
|
5
|
+
import time
|
6
|
+
import logging
|
7
|
+
import requests
|
8
|
+
from typing import Any, Dict, List, Optional, Union
|
9
|
+
from bs4 import BeautifulSoup, Tag
|
10
|
+
from .config import PipelineConfig
|
11
|
+
|
12
|
+
|
13
|
+
class GitHubBatchClient:
    """Minimal GitHub REST client for fetching repository statistics in bulk.

    The client keeps one ``requests.Session`` carrying the GitHub ``Accept``
    and ``User-Agent`` headers, plus an ``Authorization`` header when the
    pipeline configuration provides ``github_token``.

    Attributes:
        config: The pipeline configuration passed to the constructor.
        headers: Headers sent with every GitHub request.
        session: Shared HTTP session.
        remaining_calls: Remaining core API calls as last reported by
            GitHub (starts at 5000 until ``check_rate_limit`` succeeds).
        reset_time: Epoch seconds at which the rate limit resets, as last
            reported by GitHub (0 until ``check_rate_limit`` succeeds).
    """

    # Seconds to wait on a single HTTP request; without a timeout a stalled
    # connection would block the whole pipeline indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, config: "PipelineConfig") -> None:
        self.config = config
        # Simple headers without dependency on HTTPClientUtils
        self.headers = {
            "Accept": "application/vnd.github.v3+json",
            "User-Agent": "SigilDERG-Data-Production/1.3.2",
        }
        if config.github_token:
            self.headers["Authorization"] = f"token {config.github_token}"

        # Simple session without dependency on HTTPClientUtils
        self.session = requests.Session()
        self.session.headers.update(self.headers)
        self.remaining_calls = 5000
        self.reset_time = 0

    @staticmethod
    def parse_repo_url(repo_url: str) -> "Optional[tuple[str, str]]":
        """Extract ``(owner, repo)`` from a GitHub repository URL.

        Only a trailing ``.git`` suffix is removed from the repository
        name, so names that legitimately contain dots (``docs.rs``) are
        preserved.

        Args:
            repo_url: Any string that may contain ``github.com/<owner>/<repo>``.

        Returns:
            The ``(owner, repo)`` pair, or ``None`` when the URL does not
            point at a GitHub repository.
        """
        match = re.search(r"github\.com/([^/\s]+)/([^/\s?#]+)", repo_url or "")
        if not match:
            return None

        owner, repo = match.groups()
        if repo.endswith(".git"):
            repo = repo[: -len(".git")]
        if not repo:
            return None
        return owner, repo

    def check_rate_limit(self) -> None:
        """Refresh ``remaining_calls`` / ``reset_time`` from the GitHub API.

        This is best-effort: any failure is logged and otherwise ignored,
        leaving the previously known values in place.
        """
        try:
            response = self.session.get(
                "https://api.github.com/rate_limit",
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            if response.ok:
                core = response.json()["resources"]["core"]
                self.remaining_calls = core["remaining"]
                self.reset_time = core["reset"]

                if self.remaining_calls < 100:
                    reset_in = self.reset_time - time.time()
                    logging.warning(
                        "GitHub API rate limit low: %s remaining. "
                        "Resets in %.1f minutes",
                        self.remaining_calls,
                        reset_in / 60,
                    )
        except Exception as e:
            # Deliberately non-fatal, but no longer silent.
            logging.warning("Could not check GitHub rate limit: %s", e)

    def get_repo_stats(self, owner: str, repo: str) -> "dict[str, Any]":
        """Fetch the GitHub repository object for ``owner/repo``.

        Args:
            owner: Repository owner (user or organisation).
            repo: Repository name.

        Returns:
            The decoded JSON response, or an empty dict when the request
            fails or GitHub answers with a non-success status.
        """
        try:
            url = f"https://api.github.com/repos/{owner}/{repo}"
            response = self.session.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
            if response.ok:
                return response.json()

            logging.warning(
                "Failed to get repo stats for %s/%s: %s",
                owner,
                repo,
                response.status_code,
            )
            return {}
        except Exception as e:
            logging.error("Error fetching repo stats: %s", e)
            return {}

    def batch_get_repo_stats(
        self, repo_list: "list[str]"
    ) -> "dict[str, dict[str, Any]]":
        """Fetch statistics for several repositories.

        Args:
            repo_list: Repository URLs; entries that are not GitHub
                repository URLs are skipped.

        Returns:
            A mapping from each recognised input URL to the result of
            ``get_repo_stats`` for it (an empty dict on failure).
        """
        self.check_rate_limit()

        results: "dict[str, dict[str, Any]]" = {}
        for repo_url in repo_list:
            parsed = self.parse_repo_url(repo_url)
            if parsed is None:
                continue

            owner, repo = parsed
            results[repo_url] = self.get_repo_stats(owner, repo)

            # Be nice to GitHub API
            time.sleep(0.1)
        return results
|
86
|
+
|
87
|
+
|
88
|
+
class CrateAPIClient:
    """Fetch crate metadata from crates.io, falling back to lib.rs and GitHub.

    Every source yields a dict with the same core keys (``name``,
    ``version``, ``description``, ``repository``, ``keywords``,
    ``categories``, ``readme``, ``downloads``, ``github_stars``,
    ``dependencies``, ``code_snippets``, ``features``, ``readme_sections``).
    The fallback sources additionally set ``source``; the crates.io path may
    add ``librs_downloads``.
    """

    # Seconds to wait on a single HTTP request; without a timeout a stalled
    # connection would block the whole pipeline indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, config: "PipelineConfig") -> None:
        self.config = config
        # Simple session without dependency on HTTPClientUtils
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": "SigilDERG-Data-Production/1.3.2"})

    def fetch_crate_metadata(self, crate_name: str) -> "dict[str, Any] | None":
        """Fetch metadata for ``crate_name`` with retries and backoff.

        Up to ``config.max_retries`` attempts are made, waiting ``2**attempt``
        seconds between failed attempts (no wait after the last one).

        Returns:
            The metadata dict, or ``None`` when every attempt raised or no
            source knew the crate.
        """
        max_retries = self.config.max_retries
        for attempt in range(max_retries):
            try:
                return self._fetch_metadata(crate_name)
            except Exception as e:
                logging.warning(
                    "Attempt %d failed for %s: %s", attempt + 1, crate_name, e
                )
                # Sleeping after the final attempt would only delay the
                # caller; there is nothing left to retry.
                if attempt + 1 < max_retries:
                    time.sleep(2**attempt)
        return None

    def _fetch_metadata(self, crate_name: str) -> "dict[str, Any] | None":
        """Try crates.io first, then lib.rs, then GitHub search.

        Errors on the crates.io path are logged and re-raised so that
        ``fetch_crate_metadata`` can retry. The fallbacks are only consulted
        when crates.io answers with a non-success status; their errors are
        logged and treated as "no result".

        Returns:
            The first metadata dict produced by a source, or ``None``.
        """
        # First try crates.io (primary source)
        try:
            result = self._fetch_from_crates_io(crate_name)
        except Exception as e:
            logging.error("Failed fetching metadata for %s: %s", crate_name, e)
            raise
        if result is not None:
            return result

        fallbacks = (
            ("lib.rs", self._fetch_from_lib_rs),
            ("GitHub search", self._fetch_from_github_search),
        )
        for source_name, fetch in fallbacks:
            try:
                result = fetch(crate_name)
            except Exception as e:
                # Fallbacks stay best-effort, but failures are now visible.
                logging.warning(
                    "%s fallback failed for %s: %s", source_name, crate_name, e
                )
                continue
            if result is not None:
                return result

        # If all sources fail
        return None

    def _get(self, url: str, **kwargs: Any) -> Any:
        """Issue a GET through the shared session with the default timeout."""
        kwargs.setdefault("timeout", self.REQUEST_TIMEOUT)
        return self.session.get(url, **kwargs)

    @staticmethod
    def _split_github_repo(repo_url: str) -> "Optional[tuple[str, str]]":
        """Return ``(owner, repo)`` from a GitHub URL, or ``None``.

        Only a trailing ``.git`` is stripped, so repository names that
        contain dots (``docs.rs``) are kept intact.
        """
        match = re.search(r"github\.com/([^/\s]+)/([^/\s?#]+)", repo_url or "")
        if not match:
            return None

        owner, repo_name = match.groups()
        if repo_name.endswith(".git"):
            repo_name = repo_name[: -len(".git")]
        if not repo_name:
            return None
        return owner, repo_name

    def _fetch_from_crates_io(self, crate_name: str) -> "dict[str, Any] | None":
        """Build the metadata dict from the crates.io API.

        Returns ``None`` when crates.io answers the crate lookup with a
        non-success status; any exception propagates to the caller.
        """
        base_url = f"https://crates.io/api/v1/crates/{crate_name}"
        r = self._get(base_url)
        if not r.ok:
            return None

        crate_data = r.json()["crate"]
        latest = crate_data["newest_version"]

        # Get readme
        # NOTE(review): crates.io appears to serve READMEs per version
        # (".../{version}/readme"); confirm this un-versioned URL ever
        # succeeds. Behaviour is left unchanged here.
        readme_response = self._get(f"{base_url}/readme")
        readme = readme_response.text if readme_response.ok else ""

        # Get dependencies
        deps_response = self._get(f"{base_url}/{latest}/dependencies")
        deps: "list[dict[str, Any]]" = (
            deps_response.json().get("dependencies", [])
            if deps_response.ok
            else []
        )

        # Get features - using the versions endpoint
        features: "list[dict[str, Any]]" = []
        versions_response = self._get(f"{base_url}/{latest}")
        if versions_response.ok:
            version_data = versions_response.json().get("version", {})
            features_dict = version_data.get("features", {})
            features = [
                {"name": k, "dependencies": v} for k, v in features_dict.items()
            ]

        # The API may return an explicit null for these fields; normalise to
        # "" so the substring checks below cannot raise TypeError.
        repo = crate_data.get("repository") or ""
        description = crate_data.get("description") or ""

        gh_stars = self._fetch_github_stars(repo)

        # Check if it's hosted on lib.rs
        lib_rs_data: "dict[str, Any]" = {}
        if "lib.rs" in repo:
            readme, lib_rs_data = self._scrape_lib_rs_extras(crate_name, readme)

        # Simplified - would normally extract these from the readme
        code_snippets: "list[str]" = []
        readme_sections: "dict[str, str]" = {}

        return {
            "name": crate_name,
            "version": latest,
            "description": description,
            "repository": repo,
            "keywords": crate_data.get("keywords", []),
            "categories": crate_data.get("categories", []),
            "readme": readme,
            "downloads": crate_data.get("downloads", 0),
            "github_stars": gh_stars,
            "dependencies": deps,
            "code_snippets": code_snippets,
            "features": features,
            "readme_sections": readme_sections,
            **lib_rs_data,
        }

    def _fetch_github_stars(self, repo: str) -> int:
        """Return the star count for a GitHub ``repo`` URL.

        Returns 0 when no token is configured, the URL is not a GitHub
        repository, or GitHub answers with a non-success status.
        """
        token = self.config.github_token
        if not token or "github.com" not in repo:
            return 0

        parsed = self._split_github_repo(repo)
        if parsed is None:
            return 0

        owner, repo_name = parsed
        gh = self._get(
            f"https://api.github.com/repos/{owner}/{repo_name}",
            headers={"Authorization": f"token {token}"},
        )
        if gh.ok:
            return gh.json().get("stargazers_count", 0)
        return 0

    def _scrape_lib_rs_extras(
        self, crate_name: str, readme: str
    ) -> "tuple[str, dict[str, Any]]":
        """Scrape the crate's lib.rs page for a README and download count.

        Args:
            crate_name: Crate to look up on lib.rs.
            readme: README text already obtained; only replaced when empty.

        Returns:
            ``(readme, extras)`` where ``extras`` may hold ``librs_downloads``.
        """
        extras: "dict[str, Any]" = {}
        lib_rs_response = self._get(f"https://lib.rs/crates/{crate_name}")
        if not lib_rs_response.ok:
            return readme, extras

        soup = BeautifulSoup(lib_rs_response.text, "html.parser")

        # Get README from lib.rs if not already available
        if not readme:
            readme_div = soup.find("div", class_="readme")
            if readme_div:
                readme = readme_div.get_text(strip=True)

        # Get lib.rs specific stats
        stats_div = soup.find("div", class_="crate-stats")
        if isinstance(stats_div, Tag):
            downloads_text = stats_div.find(string=re.compile(r"[\d,]+ downloads"))
            if downloads_text:
                extras["librs_downloads"] = int(
                    re.sub(r"[^\d]", "", str(downloads_text))
                )
        return readme, extras

    def _fetch_from_lib_rs(self, crate_name: str) -> "dict[str, Any] | None":
        """Scrape basic metadata from the crate's lib.rs page.

        Returns ``None`` when lib.rs answers with a non-success status.
        """
        r = self._get(f"https://lib.rs/crates/{crate_name}")
        if not r.ok:
            return None

        soup = BeautifulSoup(r.text, "html.parser")

        # Extract metadata from lib.rs page
        h1 = soup.select_one("h1")
        name = h1.text.strip() if h1 else crate_name

        # Find description
        desc_elem = soup.select_one(".description")
        description = desc_elem.text.strip() if desc_elem else ""

        # Find repository link: the first anchor pointing at GitHub
        repo_link = ""
        for a in soup.select("a"):
            href = a.get("href")
            if href and isinstance(href, str) and "github.com" in href:
                repo_link = href
                break

        # Find keywords
        keywords_elem = soup.select_one(".keywords")
        keywords = (
            [k.text.strip() for k in keywords_elem.find_all("a")]
            if keywords_elem
            else []
        )

        # Basic metadata from lib.rs
        return {
            "name": name,
            "version": "latest",  # lib.rs doesn't easily expose version
            "description": description,
            "repository": repo_link,
            "keywords": keywords,
            "categories": [],
            "readme": "",
            "downloads": 0,
            "github_stars": 0,
            "dependencies": [],
            "code_snippets": [],
            "features": [],
            "readme_sections": {},
            "source": "lib.rs",
        }

    def _fetch_from_github_search(self, crate_name: str) -> "dict[str, Any] | None":
        """Use GitHub repository search as a last-resort metadata source.

        The first Rust repository matching ``crate_name`` is taken as the
        crate's repository. Returns ``None`` when the search fails or has no
        results.
        """
        gh_search_headers: "dict[str, str]" = {}
        if self.config.github_token:
            gh_search_headers["Authorization"] = f"token {self.config.github_token}"

        # Passing the query via ``params`` lets requests URL-encode it, and
        # going through the session keeps the User-Agent consistent.
        r = self._get(
            "https://api.github.com/search/repositories",
            params={"q": f"{crate_name} language:rust"},
            headers=gh_search_headers,
        )
        if not r.ok:
            return None

        results = r.json().get("items", [])
        if not results:
            return None

        repo = results[0]  # Take first match

        # Basic metadata from GitHub
        return {
            "name": crate_name,
            "version": "unknown",
            "description": repo.get("description") or "",
            "repository": repo.get("html_url", ""),
            "keywords": [],
            "categories": [],
            "readme": "",
            "downloads": 0,
            "github_stars": repo.get("stargazers_count", 0),
            "dependencies": [],
            "code_snippets": [],
            "features": [],
            "readme_sections": {},
            "source": "github",
        }
|