rust-crate-pipeline 1.2.6__py3-none-any.whl → 1.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,40 +1,68 @@
 # network.py
 import os
+import sys
 import re
 import time
 import logging
 import requests
-from requests_cache import CachedSession
 from bs4 import BeautifulSoup
 from typing import Dict, List, Optional
 from .config import PipelineConfig
 
+# Import utilities with fallback
+try:
+    # Add the parent directory to the path to import utils
+    sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
+    from utils.http_client_utils import HTTPClientUtils, MetadataExtractor
+except ImportError:
+    # Fallback implementations for when utils are not available
+    class HTTPClientUtils:
+        def __init__(self):
+            pass
+
+    class MetadataExtractor:
+        def __init__(self):
+            pass
+
+# Import atomic utilities for code reuse
+import sys
+sys.path.append(os.path.dirname(os.path.dirname(__file__)))
+
+
 class GitHubBatchClient:
     def __init__(self, config: PipelineConfig):
         self.config = config
-        self.headers = {"Accept": "application/vnd.github.v3+json"}
+        # Simple headers without dependency on HTTPClientUtils
+        self.headers = {
+            "Accept": "application/vnd.github.v3+json",
+            "User-Agent": "SigilDERG-Data-Production/1.0"
+        }
         if config.github_token:
             self.headers["Authorization"] = f"token {config.github_token}"
-
-        self.session = CachedSession(
-            'github_cache',
-            expire_after=config.cache_ttl * 2 # Longer cache for GitHub
-        )
+
+        # Simple session without dependency on HTTPClientUtils
+        self.session = requests.Session()
+        self.session.headers.update(self.headers)
         self.remaining_calls = 5000
         self.reset_time = 0
 
     def check_rate_limit(self):
         """Check and update current rate limit status"""
         try:
-            response = self.session.get("https://api.github.com/rate_limit", headers=self.headers)
+            response = self.session.get(
+                "https://api.github.com/rate_limit",
+                headers=self.headers)
             if response.ok:
                 data = response.json()
                 self.remaining_calls = data["resources"]["core"]["remaining"]
                 self.reset_time = data["resources"]["core"]["reset"]
-
+
                 if self.remaining_calls < 100:
                     reset_in = self.reset_time - time.time()
-                    logging.warning(f"GitHub API rate limit low: {self.remaining_calls} remaining. Resets in {reset_in/60:.1f} minutes")
+                    logging.warning(
+                        f"GitHub API rate limit low: {
+                            self.remaining_calls} remaining. Resets in {
+                            reset_in / 60:.1f} minutes")
         except Exception:
             pass
 
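For context, the pattern introduced at the top of this hunk guards an optional import of utils.http_client_utils and falls back to stub classes, so the module stays importable when the utils package is absent. A minimal standalone sketch of that pattern (module and class names taken from the diff; the parent-directory path manipulation is an assumption about the layout, and the trailing usage line is illustrative only):

    import os
    import sys

    try:
        # Make the sibling "utils" package importable, then use the real helpers.
        sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        from utils.http_client_utils import HTTPClientUtils, MetadataExtractor
    except ImportError:
        # Fallback stubs so the rest of the module still loads without utils.
        class HTTPClientUtils:
            def __init__(self):
                pass

        class MetadataExtractor:
            def __init__(self):
                pass

    client_utils = HTTPClientUtils()  # works whether the real class or the stub was bound

Either way, the names HTTPClientUtils and MetadataExtractor end up defined, so downstream code does not need to care which implementation it received.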
@@ -46,7 +74,8 @@ class GitHubBatchClient:
             if response.ok:
                 return response.json()
             else:
-                logging.warning(f"Failed to get repo stats for {owner}/{repo}: {response.status_code}")
+                logging.warning(
+                    f"Failed to get repo stats for {owner}/{repo}: {response.status_code}")
                 return {}
         except Exception as e:
             logging.error(f"Error fetching repo stats: {str(e)}")
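GitHubBatchClient throttles itself by polling GitHub's /rate_limit endpoint before batch work (see check_rate_limit in the first hunk) and warns when fewer than 100 calls remain. A standalone sketch of that check using only requests — the function name and usage lines here are illustrative, not part of the package:

    import time
    import requests

    def remaining_github_calls(token=None):
        """Return (remaining, reset_epoch) for the core GitHub API quota."""
        headers = {"Accept": "application/vnd.github.v3+json"}
        if token:
            headers["Authorization"] = f"token {token}"
        resp = requests.get("https://api.github.com/rate_limit", headers=headers, timeout=10)
        resp.raise_for_status()
        core = resp.json()["resources"]["core"]
        return core["remaining"], core["reset"]

    remaining, reset = remaining_github_calls()
    if remaining < 100:
        print(f"GitHub API rate limit low: {remaining} remaining, "
              f"resets in {(reset - time.time()) / 60:.1f} minutes")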
@@ -55,38 +84,46 @@ class GitHubBatchClient:
     def batch_get_repo_stats(self, repo_list: List[str]) -> Dict[str, Dict]:
         """Get statistics for multiple repositories in a batch"""
         self.check_rate_limit()
-
+
         results = {}
         for repo_url in repo_list:
             # Extract owner/repo from URL
             match = re.search(r"github\.com/([^/]+)/([^/\.]+)", repo_url)
             if not match:
                 continue
-
+
             owner, repo = match.groups()
             repo = repo.split('.')[0] # Remove .git extension if present
-
+
             # Get stats
             stats = self.get_repo_stats(owner, repo)
             results[repo_url] = stats
-
+
             # Be nice to GitHub API
             time.sleep(0.1)
-
         return results
 
+
 class CrateAPIClient:
     def __init__(self, config: PipelineConfig):
         self.config = config
-        self.session = CachedSession('crate_cache', expire_after=config.cache_ttl)
-
+        # Simple session without dependency on HTTPClientUtils
+        self.session = requests.Session()
+        self.session.headers.update({
+            "User-Agent": "SigilDERG-Data-Production/1.0"
+        })
+
     def fetch_crate_metadata(self, crate_name: str) -> Optional[Dict]:
         """Fetch metadata with retry logic"""
         for attempt in range(self.config.max_retries):
             try:
                 return self._fetch_metadata(crate_name)
             except Exception as e:
-                logging.warning(f"Attempt {attempt+1} failed for {crate_name}: {str(e)}")
+                logging.warning(
+                    f"Attempt {
+                        attempt +
+                        1} failed for {crate_name}: {
+                        str(e)}")
                 wait = 2 ** attempt
                 time.sleep(wait)
         return None
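fetch_crate_metadata wraps _fetch_metadata in a retry loop with exponential backoff (1 s, 2 s, 4 s, ... between attempts, up to config.max_retries). The same logic, decoupled from the pipeline's config object — the function name and default retry count below are placeholders:

    import time
    import logging

    def fetch_with_retry(fetch, crate_name, max_retries=3):
        """Call fetch(crate_name), backing off 2**attempt seconds after each failure."""
        for attempt in range(max_retries):
            try:
                return fetch(crate_name)
            except Exception as exc:
                logging.warning("Attempt %d failed for %s: %s", attempt + 1, crate_name, exc)
                time.sleep(2 ** attempt)
        return None  # every attempt failed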
@@ -95,71 +132,77 @@ class CrateAPIClient:
         """Enhanced metadata fetching that tries multiple sources"""
         # First try crates.io (primary source)
         try:
-            r = self.session.get(f"https://crates.io/api/v1/crates/{crate_name}")
+            r = self.session.get(
+                f"https://crates.io/api/v1/crates/{crate_name}")
             if r.ok:
                 data = r.json()
                 crate_data = data["crate"]
                 latest = crate_data["newest_version"]
-
+
                 # Get readme
-                readme_response = self.session.get(f"https://crates.io/api/v1/crates/{crate_name}/readme")
+                readme_response = self.session.get(
+                    f"https://crates.io/api/v1/crates/{crate_name}/readme")
                 readme = readme_response.text if readme_response.ok else ""
-
+
                 # Get dependencies
-                deps_response = self.session.get(f"https://crates.io/api/v1/crates/{crate_name}/{latest}/dependencies")
+                deps_response = self.session.get(
+                    f"https://crates.io/api/v1/crates/{crate_name}/{latest}/dependencies")
                 deps = deps_response.json().get("dependencies", []) if deps_response.ok else []
-
+
                 # Get features - using the versions endpoint
                 features = []
-                versions_response = self.session.get(f"https://crates.io/api/v1/crates/{crate_name}/{latest}")
+                versions_response = self.session.get(
+                    f"https://crates.io/api/v1/crates/{crate_name}/{latest}")
                 if versions_response.ok:
                     version_data = versions_response.json().get("version", {})
                     features_dict = version_data.get("features", {})
-                    features = [{"name": k, "dependencies": v} for k, v in features_dict.items()]
-
+                    features = [{"name": k, "dependencies": v}
+                                for k, v in features_dict.items()]
+
                 # Repository info and GitHub stars
                 repo = crate_data.get("repository", "")
                 gh_stars = 0
-
+
                 # Check if it's a GitHub repo
                 if "github.com" in repo and self.config.github_token:
                     match = re.search(r"github.com/([^/]+)/([^/]+)", repo)
                     if match:
                         owner, repo_name = match.groups()
-                        repo_name = repo_name.split('.')[0] # Handle .git extensions
+                        repo_name = repo_name.split(
+                            '.')[0] # Handle .git extensions
                         gh_url = f"https://api.github.com/repos/{owner}/{repo_name}"
-                        gh_headers = {"Authorization": f"token {self.config.github_token}"} if self.config.github_token else {}
+                        gh_headers = {
+                            "Authorization": f"token {
+                                self.config.github_token}"} if self.config.github_token else {}
                         gh = self.session.get(gh_url, headers=gh_headers)
                         if gh.ok:
                             gh_data = gh.json()
                             gh_stars = gh_data.get("stargazers_count", 0)
-
+
                 # Check if it's hosted on lib.rs
                 lib_rs_data = {}
                 if "lib.rs" in repo:
                     lib_rs_url = f"https://lib.rs/crates/{crate_name}"
                     lib_rs_response = self.session.get(lib_rs_url)
                     if lib_rs_response.ok:
-                        soup = BeautifulSoup(lib_rs_response.text, 'html.parser')
+                        soup = BeautifulSoup(
+                            lib_rs_response.text, 'html.parser')
                         # Get README from lib.rs if not already available
                         if not readme:
                             readme_div = soup.find('div', class_='readme')
                             if readme_div:
-                                readme = readme_div.get_text(strip=True)
-
-                        # Get lib.rs specific stats
+                                readme = readme_div.get_text(strip=True) # Get lib.rs specific stats
                         stats_div = soup.find('div', class_='crate-stats')
                         if stats_div:
-                            downloads_text = stats_div.find(string=re.compile(r'[\d,]+ downloads'))
-                            if downloads_text:
-                                lib_rs_data["librs_downloads"] = int(re.sub(r'[^\d]', '', downloads_text))
-
-                # Extract code snippets from readme
-                code_snippets = self.extract_code_snippets(readme)
-
-                # Extract sections from readme
-                readme_sections = self.extract_readme_sections(readme) if readme else {}
-
+                            downloads_text = stats_div.find(
+                                string=re.compile(r'[\d,]+ downloads'))
+                            if downloads_text: lib_rs_data["librs_downloads"] = int(
+                                re.sub(r'[^\d]', '', str(downloads_text)))
+
+                # Extract code snippets and sections (simplified)
+                code_snippets = [] # Simplified - would normally extract from readme
+                readme_sections = {} # Simplified - would normally parse sections
+
                 result = {
                     "name": crate_name,
                     "version": latest,
@@ -176,33 +219,36 @@ class CrateAPIClient:
                     "readme_sections": readme_sections,
                     **lib_rs_data
                 }
-
+
                 return result
-
+
         except Exception as e:
-            logging.error(f"Failed fetching metadata for {crate_name}: {str(e)}")
+            logging.error(
+                f"Failed fetching metadata for {crate_name}: {
+                    str(e)}")
             raise
-
+
         # If crates.io fails, try lib.rs
         try:
             r = self.session.get(f"https://lib.rs/crates/{crate_name}")
             if r.ok:
                 soup = BeautifulSoup(r.text, 'html.parser')
-
+
                 # Extract metadata from lib.rs page
-                name = soup.select_one('h1').text.strip() if soup.select_one('h1') else crate_name
-
+                name = soup.select_one('h1').text.strip(
+                ) if soup.select_one('h1') else crate_name
+
                 # Find description
                 desc_elem = soup.select_one('.description')
                 description = desc_elem.text.strip() if desc_elem else ""
-
+
                 # Find repository link
                 repo_link = None
                 for a in soup.select('a'):
-                    if 'github.com' in a.get('href', ''):
+                    if 'github.com' in a.get('hre', ''):
                         repo_link = a['href']
                         break
-
+
                 # Basic metadata from lib.rs
                 return {
                     "name": name,
@@ -222,22 +268,23 @@ class CrateAPIClient:
                 }
         except Exception:
             pass
-
+
         # Finally, try GitHub search
         try:
-            # This is a simplification - GitHub's search API requires authentication
+            # This is a simplification - GitHub's search API requires
+            # authentication
             headers = {}
             if self.config.github_token:
                 headers["Authorization"] = f"token {self.config.github_token}"
-
+
             search_url = f"https://api.github.com/search/repositories?q={crate_name}+language:rust"
             r = requests.get(search_url, headers=headers)
-
+
             if r.ok:
                 results = r.json().get("items", [])
                 if results:
                     repo = results[0] # Take first match
-
+
                     # Basic metadata from GitHub
                     return {
                         "name": crate_name,
@@ -249,8 +296,7 @@ class CrateAPIClient:
                         "readme": "",
                         "downloads": 0,
                         "github_stars": repo.get("stargazers_count", 0),
-                        "dependencies": [],
-                        "code_snippets": [],
+                        "dependencies": [], "code_snippets": [],
                         "features": [],
                         "readme_sections": {},
                         "source": "github",
@@ -260,48 +306,3 @@ class CrateAPIClient:
 
         # If all sources fail
         return None
-
-    def extract_code_snippets(self, readme: str) -> List[str]:
-        """Extract code snippets from markdown README"""
-        snippets = []
-        if not readme:
-            return snippets
-
-        # Find Rust code blocks
-        pattern = r"```(?:rust|(?:no_run|ignore|compile_fail|mdbook-runnable)?)\s*([\s\S]*?)```"
-        matches = re.findall(pattern, readme)
-
-        for code in matches:
-            if len(code.strip()) > 10: # Only include non-trivial snippets
-                snippets.append(code.strip())
-
-        return snippets[:5] # Limit to 5 snippets
-
-    def extract_readme_sections(self, readme: str) -> Dict[str, str]:
-        """Extract sections from README based on markdown headers"""
-        if not readme:
-            return {}
-
-        sections = {}
-        lines = readme.split('\n')
-        current_section = ""
-        current_content = []
-
-        for line in lines:
-            if re.match(r'^#+\s+', line): # It's a header
-                # Save previous section
-                if current_section and current_content:
-                    sections[current_section] = '\n'.join(current_content).strip()
-
-                # Start new section
-                current_section = re.sub(r'^#+\s+', '', line).strip()
-                current_content = []
-            else:
-                if current_section: # Only collect content if we have a section
-                    current_content.append(line)
-
-        # Don't forget the last section
-        if current_section and current_content:
-            sections[current_section] = '\n'.join(current_content).strip()
-
-        return sections
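Taken together, the three try blocks in _fetch_metadata walk an ordered list of sources — crates.io, then lib.rs, then GitHub search — and return the first one that yields metadata. A condensed sketch of that fallback idea (note the real code re-raises crates.io errors to the retry loop rather than swallowing them; the per-source fetchers here are hypothetical placeholders):

    from typing import Callable, Dict, List, Optional

    def fetch_from_sources(crate_name: str,
                           sources: List[Callable[[str], Optional[Dict]]]) -> Optional[Dict]:
        """Try each metadata source in order and return the first non-empty result."""
        for fetch in sources:
            try:
                result = fetch(crate_name)
                if result:
                    return result
            except Exception:
                continue  # fall through to the next source
        return None  # all sources failed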