rust-crate-pipeline 1.2.5-py3-none-any.whl → 1.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. rust_crate_pipeline/__init__.py +25 -25
  2. rust_crate_pipeline/__main__.py +1 -0
  3. rust_crate_pipeline/ai_processing.py +309 -200
  4. rust_crate_pipeline/analysis.py +304 -368
  5. rust_crate_pipeline/azure_ai_processing.py +453 -0
  6. rust_crate_pipeline/config.py +57 -19
  7. rust_crate_pipeline/core/__init__.py +19 -0
  8. rust_crate_pipeline/core/canon_registry.py +133 -0
  9. rust_crate_pipeline/core/irl_engine.py +256 -0
  10. rust_crate_pipeline/core/sacred_chain.py +117 -0
  11. rust_crate_pipeline/crate_analysis.py +54 -0
  12. rust_crate_pipeline/crate_list.txt +424 -0
  13. rust_crate_pipeline/github_token_checker.py +42 -36
  14. rust_crate_pipeline/main.py +386 -102
  15. rust_crate_pipeline/network.py +153 -133
  16. rust_crate_pipeline/pipeline.py +340 -264
  17. rust_crate_pipeline/production_config.py +35 -32
  18. rust_crate_pipeline/scraping/__init__.py +13 -0
  19. rust_crate_pipeline/scraping/unified_scraper.py +259 -0
  20. rust_crate_pipeline/unified_llm_processor.py +637 -0
  21. rust_crate_pipeline/unified_pipeline.py +548 -0
  22. rust_crate_pipeline/utils/file_utils.py +45 -14
  23. rust_crate_pipeline/utils/logging_utils.py +34 -17
  24. rust_crate_pipeline/version.py +47 -2
  25. rust_crate_pipeline-1.3.0.dist-info/METADATA +331 -0
  26. rust_crate_pipeline-1.3.0.dist-info/RECORD +30 -0
  27. rust_crate_pipeline-1.2.5.dist-info/METADATA +0 -573
  28. rust_crate_pipeline-1.2.5.dist-info/RECORD +0 -19
  29. {rust_crate_pipeline-1.2.5.dist-info → rust_crate_pipeline-1.3.0.dist-info}/WHEEL +0 -0
  30. {rust_crate_pipeline-1.2.5.dist-info → rust_crate_pipeline-1.3.0.dist-info}/entry_points.txt +0 -0
  31. {rust_crate_pipeline-1.2.5.dist-info → rust_crate_pipeline-1.3.0.dist-info}/licenses/LICENSE +0 -0
  32. {rust_crate_pipeline-1.2.5.dist-info → rust_crate_pipeline-1.3.0.dist-info}/top_level.txt +0 -0
@@ -1,44 +1,57 @@
  # network.py
  import os
+ import sys
  import re
  import time
  import logging
  import requests
- from requests_cache import CachedSession
- from bs4 import BeautifulSoup
- from typing import Dict, List, Optional
+ from bs4 import BeautifulSoup, Tag
+ from typing import Any, Union
  from .config import PipelineConfig

+ # Import utilities
+ # Add the parent directory to the path to import utils
+ sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
+
+
  class GitHubBatchClient:
- def __init__(self, config: PipelineConfig):
+ def __init__(self, config: PipelineConfig) -> None:
  self.config = config
- self.headers = {"Accept": "application/vnd.github.v3+json"}
+ # Simple headers without dependency on HTTPClientUtils
+ self.headers = {
+ "Accept": "application/vnd.github.v3+json",
+ "User-Agent": "SigilDERG-Data-Production/1.0",
+ }
  if config.github_token:
  self.headers["Authorization"] = f"token {config.github_token}"
-
- self.session = CachedSession(
- 'github_cache',
- expire_after=config.cache_ttl * 2 # Longer cache for GitHub
- )
+
+ # Simple session without dependency on HTTPClientUtils
+ self.session = requests.Session()
+ self.session.headers.update(self.headers)
  self.remaining_calls = 5000
  self.reset_time = 0

- def check_rate_limit(self):
+ def check_rate_limit(self) -> None:
  """Check and update current rate limit status"""
  try:
- response = self.session.get("https://api.github.com/rate_limit", headers=self.headers)
+ response = self.session.get(
+ "https://api.github.com/rate_limit", headers=self.headers
+ )
  if response.ok:
  data = response.json()
  self.remaining_calls = data["resources"]["core"]["remaining"]
  self.reset_time = data["resources"]["core"]["reset"]
-
+
  if self.remaining_calls < 100:
  reset_in = self.reset_time - time.time()
- logging.warning(f"GitHub API rate limit low: {self.remaining_calls} remaining. Resets in {reset_in/60:.1f} minutes")
+ logging.warning(
+ f"GitHub API rate limit low: {self.remaining_calls} remaining. "
+ f"Resets in {reset_in / 60:.1f} minutes"
+ )
  except Exception:
  pass

- def get_repo_stats(self, owner: str, repo: str) -> Dict:
+ def get_repo_stats(self, owner: str, repo: str) -> dict[str, Any]:
  """Get repository statistics"""
  try:
  url = f"https://api.github.com/repos/{owner}/{repo}"
@@ -46,52 +59,62 @@ class GitHubBatchClient:
  if response.ok:
  return response.json()
  else:
- logging.warning(f"Failed to get repo stats for {owner}/{repo}: {response.status_code}")
+ logging.warning(
+ f"Failed to get repo stats for {owner}/{repo}: "
+ f"{response.status_code}"
+ )
  return {}
  except Exception as e:
  logging.error(f"Error fetching repo stats: {str(e)}")
  return {}

- def batch_get_repo_stats(self, repo_list: List[str]) -> Dict[str, Dict]:
+ def batch_get_repo_stats(self, repo_list: list[str]) -> dict[str, dict[str, Any]]:
  """Get statistics for multiple repositories in a batch"""
  self.check_rate_limit()
-
- results = {}
+
+ results: dict[str, dict[str, Any]] = {}
  for repo_url in repo_list:
  # Extract owner/repo from URL
  match = re.search(r"github\.com/([^/]+)/([^/\.]+)", repo_url)
  if not match:
  continue
-
+
  owner, repo = match.groups()
- repo = repo.split('.')[0] # Remove .git extension if present
-
+ repo = repo.split(".")[0] # Remove .git extension if present
+
  # Get stats
  stats = self.get_repo_stats(owner, repo)
  results[repo_url] = stats
-
+
  # Be nice to GitHub API
  time.sleep(0.1)
-
  return results

+
  class CrateAPIClient:
- def __init__(self, config: PipelineConfig):
+ def __init__(self, config: PipelineConfig) -> None:
  self.config = config
- self.session = CachedSession('crate_cache', expire_after=config.cache_ttl)
-
- def fetch_crate_metadata(self, crate_name: str) -> Optional[Dict]:
+ # Simple session without dependency on HTTPClientUtils
+ self.session = requests.Session()
+ self.session.headers.update({"User-Agent": "SigilDERG-Data-Production/1.0"})
+
+ def fetch_crate_metadata(self, crate_name: str) -> dict[str, Any] | None:
  """Fetch metadata with retry logic"""
  for attempt in range(self.config.max_retries):
  try:
  return self._fetch_metadata(crate_name)
  except Exception as e:
- logging.warning(f"Attempt {attempt+1} failed for {crate_name}: {str(e)}")
- wait = 2 ** attempt
+ logging.warning(
+ f"Attempt {
+ attempt +
+ 1} failed for {crate_name}: {
+ str(e)}"
+ )
+ wait = 2**attempt
  time.sleep(wait)
  return None

- def _fetch_metadata(self, crate_name: str) -> Optional[Dict]:
+ def _fetch_metadata(self, crate_name: str) -> dict[str, Any] | None:
  """Enhanced metadata fetching that tries multiple sources"""
  # First try crates.io (primary source)
  try:
@@ -100,67 +123,92 @@ class CrateAPIClient:
  data = r.json()
  crate_data = data["crate"]
  latest = crate_data["newest_version"]
-
+
  # Get readme
- readme_response = self.session.get(f"https://crates.io/api/v1/crates/{crate_name}/readme")
+ readme_response = self.session.get(
+ f"https://crates.io/api/v1/crates/{crate_name}/readme"
+ )
  readme = readme_response.text if readme_response.ok else ""
-
+
  # Get dependencies
- deps_response = self.session.get(f"https://crates.io/api/v1/crates/{crate_name}/{latest}/dependencies")
- deps = deps_response.json().get("dependencies", []) if deps_response.ok else []
-
+ deps_url = (
+ f"https://crates.io/api/v1/crates/{crate_name}/"
+ f"{latest}/dependencies"
+ )
+ deps_response = self.session.get(deps_url)
+ deps: list[dict[str, Any]] = (
+ deps_response.json().get("dependencies", [])
+ if deps_response.ok
+ else []
+ )
+
  # Get features - using the versions endpoint
  features = []
- versions_response = self.session.get(f"https://crates.io/api/v1/crates/{crate_name}/{latest}")
+ versions_response = self.session.get(
+ f"https://crates.io/api/v1/crates/{crate_name}/{latest}"
+ )
  if versions_response.ok:
  version_data = versions_response.json().get("version", {})
  features_dict = version_data.get("features", {})
- features = [{"name": k, "dependencies": v} for k, v in features_dict.items()]
-
+ features = [
+ {"name": k, "dependencies": v} for k, v in features_dict.items()
+ ]
+
  # Repository info and GitHub stars
  repo = crate_data.get("repository", "")
  gh_stars = 0
-
+
  # Check if it's a GitHub repo
  if "github.com" in repo and self.config.github_token:
  match = re.search(r"github.com/([^/]+)/([^/]+)", repo)
  if match:
  owner, repo_name = match.groups()
- repo_name = repo_name.split('.')[0] # Handle .git extensions
+ repo_name = repo_name.split(".")[0] # Handle .git extensions
  gh_url = f"https://api.github.com/repos/{owner}/{repo_name}"
- gh_headers = {"Authorization": f"token {self.config.github_token}"} if self.config.github_token else {}
+ gh_headers: dict[str, str] = {}
+ if self.config.github_token:
+ gh_headers["Authorization"] = (
+ f"token {self.config.github_token}"
+ )
+
  gh = self.session.get(gh_url, headers=gh_headers)
  if gh.ok:
  gh_data = gh.json()
  gh_stars = gh_data.get("stargazers_count", 0)
-
+
  # Check if it's hosted on lib.rs
  lib_rs_data = {}
  if "lib.rs" in repo:
  lib_rs_url = f"https://lib.rs/crates/{crate_name}"
  lib_rs_response = self.session.get(lib_rs_url)
  if lib_rs_response.ok:
- soup = BeautifulSoup(lib_rs_response.text, 'html.parser')
+ soup = BeautifulSoup(lib_rs_response.text, "html.parser")
  # Get README from lib.rs if not already available
  if not readme:
- readme_div = soup.find('div', class_='readme')
+ readme_div = soup.find("div", class_="readme")
  if readme_div:
- readme = readme_div.get_text(strip=True)
-
- # Get lib.rs specific stats
- stats_div = soup.find('div', class_='crate-stats')
- if stats_div:
- downloads_text = stats_div.find(string=re.compile(r'[\d,]+ downloads'))
+ readme = readme_div.get_text(
+ strip=True
+ ) # Get lib.rs specific stats
+ stats_div = soup.find("div", class_="crate-stats")
+ if isinstance(stats_div, Tag):
+ downloads_text = stats_div.find(
+ string=re.compile(r"[\d,]+ downloads")
+ )
  if downloads_text:
- lib_rs_data["librs_downloads"] = int(re.sub(r'[^\d]', '', downloads_text))
-
- # Extract code snippets from readme
- code_snippets = self.extract_code_snippets(readme)
-
- # Extract sections from readme
- readme_sections = self.extract_readme_sections(readme) if readme else {}
-
- result = {
+ lib_rs_data["librs_downloads"] = int(
+ re.sub(r"[^\d]", "", str(downloads_text))
+ )
+
+ # Extract code snippets and sections (simplified)
+ code_snippets: list[str] = (
+ []
+ ) # Simplified - would normally extract from readme
+ readme_sections: dict[str, str] = (
+ {}
+ ) # Simplified - would normally parse sections
+
+ result: dict[str, Any] = {
  "name": crate_name,
  "version": latest,
  "description": crate_data.get("description", ""),
@@ -174,42 +222,55 @@ class CrateAPIClient:
  "code_snippets": code_snippets,
  "features": features,
  "readme_sections": readme_sections,
- **lib_rs_data
+ **lib_rs_data,
  }
-
+
  return result
-
+
  except Exception as e:
- logging.error(f"Failed fetching metadata for {crate_name}: {str(e)}")
+ logging.error(
+ f"Failed fetching metadata for {crate_name}: {
+ str(e)}"
+ )
  raise
-
+
  # If crates.io fails, try lib.rs
  try:
  r = self.session.get(f"https://lib.rs/crates/{crate_name}")
  if r.ok:
- soup = BeautifulSoup(r.text, 'html.parser')
-
+ soup = BeautifulSoup(r.text, "html.parser")
+
  # Extract metadata from lib.rs page
- name = soup.select_one('h1').text.strip() if soup.select_one('h1') else crate_name
-
+ h1 = soup.select_one("h1")
+ name = h1.text.strip() if h1 else crate_name
+
  # Find description
- desc_elem = soup.select_one('.description')
+ desc_elem = soup.select_one(".description")
  description = desc_elem.text.strip() if desc_elem else ""
-
+
  # Find repository link
- repo_link = None
- for a in soup.select('a'):
- if 'github.com' in a.get('href', ''):
- repo_link = a['href']
+ repo_link: Union[str, None] = None
+ for a in soup.select("a"):
+ href = a.get("href")
+ if href and isinstance(href, str) and "github.com" in href:
+ repo_link = href
  break
-
+
+ # Find keywords
+ keywords_elem = soup.select_one(".keywords")
+ keywords = (
+ [k.text.strip() for k in keywords_elem.find_all("a")]
+ if keywords_elem
+ else []
+ )
+
  # Basic metadata from lib.rs
  return {
  "name": name,
  "version": "latest", # lib.rs doesn't easily expose version
  "description": description,
  "repository": repo_link or "",
- "keywords": [],
+ "keywords": keywords,
  "categories": [],
  "readme": "",
  "downloads": 0,
@@ -222,22 +283,26 @@ class CrateAPIClient:
  }
  except Exception:
  pass
-
+
  # Finally, try GitHub search
  try:
- # This is a simplification - GitHub's search API requires authentication
- headers = {}
+ # This is a simplification - GitHub's search API requires
+ # authentication
+ gh_search_headers: dict[str, str] = {}
  if self.config.github_token:
- headers["Authorization"] = f"token {self.config.github_token}"
-
- search_url = f"https://api.github.com/search/repositories?q={crate_name}+language:rust"
- r = requests.get(search_url, headers=headers)
-
+ gh_search_headers["Authorization"] = f"token {self.config.github_token}"
+
+ search_url = (
+ f"https://api.github.com/search/repositories?"
+ f"q={crate_name}+language:rust"
+ )
+ r = requests.get(search_url, headers=gh_search_headers)
+
  if r.ok:
  results = r.json().get("items", [])
  if results:
  repo = results[0] # Take first match
-
+
  # Basic metadata from GitHub
  return {
  "name": crate_name,
@@ -257,51 +322,6 @@ class CrateAPIClient:
  }
  except Exception:
  pass
-
+
  # If all sources fail
  return None
-
- def extract_code_snippets(self, readme: str) -> List[str]:
- """Extract code snippets from markdown README"""
- snippets = []
- if not readme:
- return snippets
-
- # Find Rust code blocks
- pattern = r"```(?:rust|(?:no_run|ignore|compile_fail|mdbook-runnable)?)\s*([\s\S]*?)```"
- matches = re.findall(pattern, readme)
-
- for code in matches:
- if len(code.strip()) > 10: # Only include non-trivial snippets
- snippets.append(code.strip())
-
- return snippets[:5] # Limit to 5 snippets
-
- def extract_readme_sections(self, readme: str) -> Dict[str, str]:
- """Extract sections from README based on markdown headers"""
- if not readme:
- return {}
-
- sections = {}
- lines = readme.split('\n')
- current_section = ""
- current_content = []
-
- for line in lines:
- if re.match(r'^#+\s+', line): # It's a header
- # Save previous section
- if current_section and current_content:
- sections[current_section] = '\n'.join(current_content).strip()
-
- # Start new section
- current_section = re.sub(r'^#+\s+', '', line).strip()
- current_content = []
- else:
- if current_section: # Only collect content if we have a section
- current_content.append(line)
-
- # Don't forget the last section
- if current_section and current_content:
- sections[current_section] = '\n'.join(current_content).strip()
-
- return sections
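
For readers who want to try the rewritten 1.3.0 client shown in this diff, a minimal usage sketch follows. It only uses names visible above (the PipelineConfig fields github_token and max_retries, CrateAPIClient.fetch_crate_metadata, GitHubBatchClient.batch_get_repo_stats); constructing PipelineConfig with keyword arguments is an assumption, so check rust_crate_pipeline/config.py for the actual constructor.

import os

from rust_crate_pipeline.config import PipelineConfig
from rust_crate_pipeline.network import CrateAPIClient, GitHubBatchClient

# Assumed keyword-argument construction; the real PipelineConfig may differ.
config = PipelineConfig(
    github_token=os.environ.get("GITHUB_TOKEN", ""),  # optional; enables GitHub lookups
    max_retries=3,  # consumed by fetch_crate_metadata's retry loop
)

client = CrateAPIClient(config)
metadata = client.fetch_crate_metadata("serde")  # tries crates.io, then lib.rs, then GitHub search
if metadata:
    print(metadata["name"], metadata["version"])

gh = GitHubBatchClient(config)
stats = gh.batch_get_repo_stats(["https://github.com/serde-rs/serde"])
print(stats)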