rust-crate-pipeline 1.4.0-py3-none-any.whl → 1.4.1-py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (33)
  1. rust_crate_pipeline/__init__.py +18 -27
  2. rust_crate_pipeline/__main__.py +1 -0
  3. rust_crate_pipeline/ai_processing.py +718 -596
  4. rust_crate_pipeline/analysis.py +330 -363
  5. rust_crate_pipeline/azure_ai_processing.py +462 -0
  6. rust_crate_pipeline/config.py +46 -28
  7. rust_crate_pipeline/core/__init__.py +19 -0
  8. rust_crate_pipeline/core/canon_registry.py +133 -0
  9. rust_crate_pipeline/core/irl_engine.py +256 -0
  10. rust_crate_pipeline/core/sacred_chain.py +117 -0
  11. rust_crate_pipeline/crate_analysis.py +54 -0
  12. rust_crate_pipeline/crate_list.txt +424 -0
  13. rust_crate_pipeline/github_token_checker.py +108 -112
  14. rust_crate_pipeline/main.py +329 -109
  15. rust_crate_pipeline/network.py +317 -308
  16. rust_crate_pipeline/pipeline.py +300 -375
  17. rust_crate_pipeline/production_config.py +24 -27
  18. rust_crate_pipeline/progress_monitor.py +334 -0
  19. rust_crate_pipeline/scraping/__init__.py +13 -0
  20. rust_crate_pipeline/scraping/unified_scraper.py +259 -0
  21. rust_crate_pipeline/unified_llm_processor.py +637 -0
  22. rust_crate_pipeline/unified_pipeline.py +548 -0
  23. rust_crate_pipeline/utils/file_utils.py +32 -5
  24. rust_crate_pipeline/utils/logging_utils.py +21 -16
  25. rust_crate_pipeline/version.py +76 -47
  26. rust_crate_pipeline-1.4.1.dist-info/METADATA +515 -0
  27. rust_crate_pipeline-1.4.1.dist-info/RECORD +31 -0
  28. rust_crate_pipeline-1.4.0.dist-info/METADATA +0 -585
  29. rust_crate_pipeline-1.4.0.dist-info/RECORD +0 -19
  30. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/WHEEL +0 -0
  31. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/entry_points.txt +0 -0
  32. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/licenses/LICENSE +0 -0
  33. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/top_level.txt +0 -0
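Of these, only rust_crate_pipeline/network.py (entry 15 above, +317 -308) is shown in full below. A wheel-to-wheel comparison like this can be reproduced locally with the Python standard library; the following is a minimal sketch, assuming both wheels have already been fetched into the working directory (for example with pip download --no-deps rust-crate-pipeline==1.4.0 and ==1.4.1):

    import difflib
    import zipfile

    # Hypothetical local filenames, as produced by `pip download --no-deps ...`
    OLD_WHEEL = "rust_crate_pipeline-1.4.0-py3-none-any.whl"
    NEW_WHEEL = "rust_crate_pipeline-1.4.1-py3-none-any.whl"
    MEMBER = "rust_crate_pipeline/network.py"  # the file diffed below

    def read_member(wheel_path: str, member: str) -> "list[str]":
        # A wheel is a plain zip archive, so members can be read directly
        with zipfile.ZipFile(wheel_path) as wheel:
            return wheel.read(member).decode("utf-8").splitlines(keepends=True)

    diff = difflib.unified_diff(
        read_member(OLD_WHEEL, MEMBER),
        read_member(NEW_WHEEL, MEMBER),
        fromfile=f"{OLD_WHEEL}/{MEMBER}",
        tofile=f"{NEW_WHEEL}/{MEMBER}",
    )
    print("".join(diff))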
@@ -1,308 +1,317 @@
- # network.py
- import os
- import sys
- import re
- import time
- import logging
- import requests
- from bs4 import BeautifulSoup
- from typing import Dict, List, Optional
- from .config import PipelineConfig
-
- # Import utilities with fallback
- try:
-     # Add the parent directory to the path to import utils
-     sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
-     from utils.http_client_utils import HTTPClientUtils, MetadataExtractor
- except ImportError:
-     # Fallback implementations for when utils are not available
-     class HTTPClientUtils:
-         def __init__(self):
-             pass
-
-     class MetadataExtractor:
-         def __init__(self):
-             pass
-
- # Import atomic utilities for code reuse
- import sys
- sys.path.append(os.path.dirname(os.path.dirname(__file__)))
-
-
- class GitHubBatchClient:
-     def __init__(self, config: PipelineConfig):
-         self.config = config
-         # Simple headers without dependency on HTTPClientUtils
-         self.headers = {
-             "Accept": "application/vnd.github.v3+json",
-             "User-Agent": "SigilDERG-Data-Production/1.0"
-         }
-         if config.github_token:
-             self.headers["Authorization"] = f"token {config.github_token}"
-
-         # Simple session without dependency on HTTPClientUtils
-         self.session = requests.Session()
-         self.session.headers.update(self.headers)
-         self.remaining_calls = 5000
-         self.reset_time = 0
-
-     def check_rate_limit(self):
-         """Check and update current rate limit status"""
-         try:
-             response = self.session.get(
-                 "https://api.github.com/rate_limit",
-                 headers=self.headers)
-             if response.ok:
-                 data = response.json()
-                 self.remaining_calls = data["resources"]["core"]["remaining"]
-                 self.reset_time = data["resources"]["core"]["reset"]
-
-                 if self.remaining_calls < 100:
-                     reset_in = self.reset_time - time.time()
-                     logging.warning(
-                         f"GitHub API rate limit low: {
-                             self.remaining_calls} remaining. Resets in {
-                             reset_in / 60:.1f} minutes")
-         except Exception:
-             pass
-
-     def get_repo_stats(self, owner: str, repo: str) -> Dict:
-         """Get repository statistics"""
-         try:
-             url = f"https://api.github.com/repos/{owner}/{repo}"
-             response = self.session.get(url, headers=self.headers)
-             if response.ok:
-                 return response.json()
-             else:
-                 logging.warning(
-                     f"Failed to get repo stats for {owner}/{repo}: {response.status_code}")
-                 return {}
-         except Exception as e:
-             logging.error(f"Error fetching repo stats: {str(e)}")
-             return {}
-
-     def batch_get_repo_stats(self, repo_list: List[str]) -> Dict[str, Dict]:
-         """Get statistics for multiple repositories in a batch"""
-         self.check_rate_limit()
-
-         results = {}
-         for repo_url in repo_list:
-             # Extract owner/repo from URL
-             match = re.search(r"github\.com/([^/]+)/([^/\.]+)", repo_url)
-             if not match:
-                 continue
-
-             owner, repo = match.groups()
-             repo = repo.split('.')[0]  # Remove .git extension if present
-
-             # Get stats
-             stats = self.get_repo_stats(owner, repo)
-             results[repo_url] = stats
-
-             # Be nice to GitHub API
-             time.sleep(0.1)
-         return results
-
-
- class CrateAPIClient:
-     def __init__(self, config: PipelineConfig):
-         self.config = config
-         # Simple session without dependency on HTTPClientUtils
-         self.session = requests.Session()
-         self.session.headers.update({
-             "User-Agent": "SigilDERG-Data-Production/1.0"
-         })
-
-     def fetch_crate_metadata(self, crate_name: str) -> Optional[Dict]:
-         """Fetch metadata with retry logic"""
-         for attempt in range(self.config.max_retries):
-             try:
-                 return self._fetch_metadata(crate_name)
-             except Exception as e:
-                 logging.warning(
-                     f"Attempt {
-                         attempt +
-                         1} failed for {crate_name}: {
-                         str(e)}")
-                 wait = 2 ** attempt
-                 time.sleep(wait)
-         return None
-
-     def _fetch_metadata(self, crate_name: str) -> Optional[Dict]:
-         """Enhanced metadata fetching that tries multiple sources"""
-         # First try crates.io (primary source)
-         try:
-             r = self.session.get(
-                 f"https://crates.io/api/v1/crates/{crate_name}")
-             if r.ok:
-                 data = r.json()
-                 crate_data = data["crate"]
-                 latest = crate_data["newest_version"]
-
-                 # Get readme
-                 readme_response = self.session.get(
-                     f"https://crates.io/api/v1/crates/{crate_name}/readme")
-                 readme = readme_response.text if readme_response.ok else ""
-
-                 # Get dependencies
-                 deps_response = self.session.get(
-                     f"https://crates.io/api/v1/crates/{crate_name}/{latest}/dependencies")
-                 deps = deps_response.json().get("dependencies", []) if deps_response.ok else []
-
-                 # Get features - using the versions endpoint
-                 features = []
-                 versions_response = self.session.get(
-                     f"https://crates.io/api/v1/crates/{crate_name}/{latest}")
-                 if versions_response.ok:
-                     version_data = versions_response.json().get("version", {})
-                     features_dict = version_data.get("features", {})
-                     features = [{"name": k, "dependencies": v}
-                                 for k, v in features_dict.items()]
-
-                 # Repository info and GitHub stars
-                 repo = crate_data.get("repository", "")
-                 gh_stars = 0
-
-                 # Check if it's a GitHub repo
-                 if "github.com" in repo and self.config.github_token:
-                     match = re.search(r"github.com/([^/]+)/([^/]+)", repo)
-                     if match:
-                         owner, repo_name = match.groups()
-                         repo_name = repo_name.split(
-                             '.')[0]  # Handle .git extensions
-                         gh_url = f"https://api.github.com/repos/{owner}/{repo_name}"
-                         gh_headers = {
-                             "Authorization": f"token {
-                                 self.config.github_token}"} if self.config.github_token else {}
-                         gh = self.session.get(gh_url, headers=gh_headers)
-                         if gh.ok:
-                             gh_data = gh.json()
-                             gh_stars = gh_data.get("stargazers_count", 0)
-
-                 # Check if it's hosted on lib.rs
-                 lib_rs_data = {}
-                 if "lib.rs" in repo:
-                     lib_rs_url = f"https://lib.rs/crates/{crate_name}"
-                     lib_rs_response = self.session.get(lib_rs_url)
-                     if lib_rs_response.ok:
-                         soup = BeautifulSoup(
-                             lib_rs_response.text, 'html.parser')
-                         # Get README from lib.rs if not already available
-                         if not readme:
-                             readme_div = soup.find('div', class_='readme')
-                             if readme_div:
-                                 readme = readme_div.get_text(strip=True)  # Get lib.rs specific stats
-                         stats_div = soup.find('div', class_='crate-stats')
-                         if stats_div:
-                             downloads_text = stats_div.find(
-                                 string=re.compile(r'[\d,]+ downloads'))
-                             if downloads_text: lib_rs_data["librs_downloads"] = int(
-                                 re.sub(r'[^\d]', '', str(downloads_text)))
-
-                 # Extract code snippets and sections (simplified)
-                 code_snippets = []  # Simplified - would normally extract from readme
-                 readme_sections = {}  # Simplified - would normally parse sections
-
-                 result = {
-                     "name": crate_name,
-                     "version": latest,
-                     "description": crate_data.get("description", ""),
-                     "repository": repo,
-                     "keywords": crate_data.get("keywords", []),
-                     "categories": crate_data.get("categories", []),
-                     "readme": readme,
-                     "downloads": crate_data.get("downloads", 0),
-                     "github_stars": gh_stars,
-                     "dependencies": deps,
-                     "code_snippets": code_snippets,
-                     "features": features,
-                     "readme_sections": readme_sections,
-                     **lib_rs_data
-                 }
-
-                 return result
-
-         except Exception as e:
-             logging.error(
-                 f"Failed fetching metadata for {crate_name}: {
-                     str(e)}")
-             raise
-
-         # If crates.io fails, try lib.rs
-         try:
-             r = self.session.get(f"https://lib.rs/crates/{crate_name}")
-             if r.ok:
-                 soup = BeautifulSoup(r.text, 'html.parser')
-
-                 # Extract metadata from lib.rs page
-                 name = soup.select_one('h1').text.strip(
-                 ) if soup.select_one('h1') else crate_name
-
-                 # Find description
-                 desc_elem = soup.select_one('.description')
-                 description = desc_elem.text.strip() if desc_elem else ""
-
-                 # Find repository link
-                 repo_link = None
-                 for a in soup.select('a'):
-                     if 'github.com' in a.get('hre', ''):
-                         repo_link = a['href']
-                         break
-
-                 # Basic metadata from lib.rs
-                 return {
-                     "name": name,
-                     "version": "latest",  # lib.rs doesn't easily expose version
-                     "description": description,
-                     "repository": repo_link or "",
-                     "keywords": [],
-                     "categories": [],
-                     "readme": "",
-                     "downloads": 0,
-                     "github_stars": 0,
-                     "dependencies": [],
-                     "code_snippets": [],
-                     "features": [],
-                     "readme_sections": {},
-                     "source": "lib.rs",
-                 }
-         except Exception:
-             pass
-
-         # Finally, try GitHub search
-         try:
-             # This is a simplification - GitHub's search API requires
-             # authentication
-             headers = {}
-             if self.config.github_token:
-                 headers["Authorization"] = f"token {self.config.github_token}"
-
-             search_url = f"https://api.github.com/search/repositories?q={crate_name}+language:rust"
-             r = requests.get(search_url, headers=headers)
-
-             if r.ok:
-                 results = r.json().get("items", [])
-                 if results:
-                     repo = results[0]  # Take first match
-
-                     # Basic metadata from GitHub
-                     return {
-                         "name": crate_name,
-                         "version": "unknown",
-                         "description": repo.get("description", ""),
-                         "repository": repo.get("html_url", ""),
-                         "keywords": [],
-                         "categories": [],
-                         "readme": "",
-                         "downloads": 0,
-                         "github_stars": repo.get("stargazers_count", 0),
-                         "dependencies": [], "code_snippets": [],
-                         "features": [],
-                         "readme_sections": {},
-                         "source": "github",
-                     }
-         except Exception:
-             pass
-
-         # If all sources fail
-         return None
+ # network.py
+ import os
+ import re
+ import sys
+ import time
+ import logging
+ import requests
+ from typing import Any, Dict, List, Optional, Union
+ from bs4 import BeautifulSoup, Tag
+ from .config import PipelineConfig
+
+
+ class GitHubBatchClient:
+     def __init__(self, config: PipelineConfig) -> None:
+         self.config = config
+         # Simple headers without dependency on HTTPClientUtils
+         self.headers = {
+             "Accept": "application/vnd.github.v3+json",
+             "User-Agent": "SigilDERG-Data-Production/1.3.2",
+         }
+         if config.github_token:
+             self.headers["Authorization"] = f"token {config.github_token}"
+
+         # Simple session without dependency on HTTPClientUtils
+         self.session = requests.Session()
+         self.session.headers.update(self.headers)
+         self.remaining_calls = 5000
+         self.reset_time = 0
+
+     def check_rate_limit(self) -> None:
+         """Check and update current rate limit status"""
+         try:
+             response = self.session.get(
+                 "https://api.github.com/rate_limit", headers=self.headers
+             )
+             if response.ok:
+                 data = response.json()
+                 self.remaining_calls = data["resources"]["core"]["remaining"]
+                 self.reset_time = data["resources"]["core"]["reset"]
+
+                 if self.remaining_calls < 100:
+                     reset_in = self.reset_time - time.time()
+                     logging.warning(
+                         f"GitHub API rate limit low: {self.remaining_calls} remaining. Resets in {reset_in / 60:.1f} minutes"
+                     )
+         except Exception:
+             pass
+
+     def get_repo_stats(self, owner: str, repo: str) -> "dict[str, Any]":
+         """Get repository statistics"""
+         try:
+             url = f"https://api.github.com/repos/{owner}/{repo}"
+             response = self.session.get(url, headers=self.headers)
+             if response.ok:
+                 return response.json()
+             else:
+                 logging.warning(
+                     f"Failed to get repo stats for {owner}/{repo}: {response.status_code}"
+                 )
+                 return {}
+         except Exception as e:
+             logging.error(f"Error fetching repo stats: {str(e)}")
+             return {}
+
+     def batch_get_repo_stats(self, repo_list: "list[str]") -> "dict[str, dict[str, Any]]":
+         """Get statistics for multiple repositories in a batch"""
+         self.check_rate_limit()
+
+         results: "dict[str, dict[str, Any]]" = {}
+         for repo_url in repo_list:
+             # Extract owner/repo from URL
+             match = re.search(r"github\.com/([^/]+)/([^/\.]+)", repo_url)
+             if not match:
+                 continue
+
+             owner, repo = match.groups()
+             repo = repo.split(".")[0]  # Remove .git extension if present
+
+             # Get stats
+             stats = self.get_repo_stats(owner, repo)
+             results[repo_url] = stats
+
+             # Be nice to GitHub API
+             time.sleep(0.1)
+         return results
+
+
+ class CrateAPIClient:
+     def __init__(self, config: PipelineConfig) -> None:
+         self.config = config
+         # Simple session without dependency on HTTPClientUtils
+         self.session = requests.Session()
+         self.session.headers.update({"User-Agent": "SigilDERG-Data-Production/1.3.2"})
+
+     def fetch_crate_metadata(self, crate_name: str) -> "dict[str, Any] | None":
+         """Fetch metadata with retry logic"""
+         for attempt in range(self.config.max_retries):
+             try:
+                 return self._fetch_metadata(crate_name)
+             except Exception as e:
+                 logging.warning(
+                     f"Attempt {attempt + 1} failed for {crate_name}: {str(e)}"
+                 )
+                 wait = 2**attempt
+                 time.sleep(wait)
+         return None
+
+     def _fetch_metadata(self, crate_name: str) -> "dict[str, Any] | None":
+         """Enhanced metadata fetching that tries multiple sources"""
+         # First try crates.io (primary source)
+         try:
+             r = self.session.get(f"https://crates.io/api/v1/crates/{crate_name}")
+             if r.ok:
+                 data = r.json()
+                 crate_data = data["crate"]
+                 latest = crate_data["newest_version"]
+
+                 # Get readme
+                 readme_response = self.session.get(
+                     f"https://crates.io/api/v1/crates/{crate_name}/readme"
+                 )
+                 readme = readme_response.text if readme_response.ok else ""
+
+                 # Get dependencies
+                 deps_url = (
+                     f"https://crates.io/api/v1/crates/{crate_name}/"
+                     f"{latest}/dependencies"
+                 )
+                 deps_response = self.session.get(deps_url)
+                 deps: list[dict[str, Any]] = (
+                     deps_response.json().get("dependencies", [])
+                     if deps_response.ok
+                     else []
+                 )
+
+                 # Get features - using the versions endpoint
+                 features = []
+                 versions_response = self.session.get(
+                     f"https://crates.io/api/v1/crates/{crate_name}/{latest}"
+                 )
+                 if versions_response.ok:
+                     version_data = versions_response.json().get("version", {})
+                     features_dict = version_data.get("features", {})
+                     features = [
+                         {"name": k, "dependencies": v} for k, v in features_dict.items()
+                     ]
+
+                 # Repository info and GitHub stars
+                 repo = crate_data.get("repository", "")
+                 gh_stars = 0
+
+                 # Check if it's a GitHub repo
+                 if "github.com" in repo and self.config.github_token:
+                     match = re.search(r"github.com/([^/]+)/([^/]+)", repo)
+                     if match:
+                         owner, repo_name = match.groups()
+                         repo_name = repo_name.split(".")[0]  # Handle .git extensions
+                         gh_url = f"https://api.github.com/repos/{owner}/{repo_name}"
+                         gh_headers: dict[str, str] = {}
+                         if self.config.github_token:
+                             gh_headers["Authorization"] = (
+                                 f"token {self.config.github_token}"
+                             )
+
+                         gh = self.session.get(gh_url, headers=gh_headers)
+                         if gh.ok:
+                             gh_data = gh.json()
+                             gh_stars = gh_data.get("stargazers_count", 0)
+
+                 # Check if it's hosted on lib.rs
+                 lib_rs_data = {}
+                 if "lib.rs" in repo:
+                     lib_rs_url = f"https://lib.rs/crates/{crate_name}"
+                     lib_rs_response = self.session.get(lib_rs_url)
+                     if lib_rs_response.ok:
+                         soup = BeautifulSoup(lib_rs_response.text, "html.parser")
+                         # Get README from lib.rs if not already available
+                         if not readme:
+                             readme_div = soup.find("div", class_="readme")
+                             if readme_div:
+                                 readme = readme_div.get_text(
+                                     strip=True
+                                 )  # Get lib.rs specific stats
+                         stats_div = soup.find("div", class_="crate-stats")
+                         if isinstance(stats_div, Tag):
+                             downloads_text = stats_div.find(
+                                 string=re.compile(r"[\d,]+ downloads")
+                             )
+                             if downloads_text:
+                                 lib_rs_data["librs_downloads"] = int(
+                                     re.sub(r"[^\d]", "", str(downloads_text))
+                                 )
+
+                 # Extract code snippets and sections (simplified)
+                 code_snippets: list[str] = (
+                     []
+                 )  # Simplified - would normally extract from readme
+                 readme_sections: dict[str, str] = (
+                     {}
+                 )  # Simplified - would normally parse sections
+
+                 result: dict[str, Any] = {
+                     "name": crate_name,
+                     "version": latest,
+                     "description": crate_data.get("description", ""),
+                     "repository": repo,
+                     "keywords": crate_data.get("keywords", []),
+                     "categories": crate_data.get("categories", []),
+                     "readme": readme,
+                     "downloads": crate_data.get("downloads", 0),
+                     "github_stars": gh_stars,
+                     "dependencies": deps,
+                     "code_snippets": code_snippets,
+                     "features": features,
+                     "readme_sections": readme_sections,
+                     **lib_rs_data,
+                 }
+
+                 return result
+
+         except Exception as e:
+             logging.error(
+                 f"Failed fetching metadata for {crate_name}: {str(e)}"
+             )
+             raise
+
+         # If crates.io fails, try lib.rs
+         try:
+             r = self.session.get(f"https://lib.rs/crates/{crate_name}")
+             if r.ok:
+                 soup = BeautifulSoup(r.text, "html.parser")
+
+                 # Extract metadata from lib.rs page
+                 h1 = soup.select_one("h1")
+                 name = h1.text.strip() if h1 else crate_name
+
+                 # Find description
+                 desc_elem = soup.select_one(".description")
+                 description = desc_elem.text.strip() if desc_elem else ""
+
+                 # Find repository link
+                 repo_link: Union[str, None] = None
+                 for a in soup.select("a"):
+                     href = a.get("href")
+                     if href and isinstance(href, str) and "github.com" in href:
+                         repo_link = href
+                         break
+
+                 # Find keywords
+                 keywords_elem = soup.select_one(".keywords")
+                 keywords = (
+                     [k.text.strip() for k in keywords_elem.find_all("a")]
+                     if keywords_elem
+                     else []
+                 )
+
+                 # Basic metadata from lib.rs
+                 return {
+                     "name": name,
+                     "version": "latest",  # lib.rs doesn't easily expose version
+                     "description": description,
+                     "repository": repo_link or "",
+                     "keywords": keywords,
+                     "categories": [],
+                     "readme": "",
+                     "downloads": 0,
+                     "github_stars": 0,
+                     "dependencies": [],
+                     "code_snippets": [],
+                     "features": [],
+                     "readme_sections": {},
+                     "source": "lib.rs",
+                 }
+         except Exception:
+             pass
+
+         # Finally, try GitHub search
+         try:
+             # This is a simplification - GitHub's search API requires
+             # authentication
+             gh_search_headers: dict[str, str] = {}
+             if self.config.github_token:
+                 gh_search_headers["Authorization"] = f"token {self.config.github_token}"
+
+             search_url = (
+                 f"https://api.github.com/search/repositories?"
+                 f"q={crate_name}+language:rust"
+             )
+             r = requests.get(search_url, headers=gh_search_headers)
+
+             if r.ok:
+                 results = r.json().get("items", [])
+                 if results:
+                     repo = results[0]  # Take first match
+
+                     # Basic metadata from GitHub
+                     return {
+                         "name": crate_name,
+                         "version": "unknown",
+                         "description": repo.get("description", ""),
+                         "repository": repo.get("html_url", ""),
+                         "keywords": [],
+                         "categories": [],
+                         "readme": "",
+                         "downloads": 0,
+                         "github_stars": repo.get("stargazers_count", 0),
+                         "dependencies": [],
+                         "code_snippets": [],
+                         "features": [],
+                         "readme_sections": {},
+                         "source": "github",
+                     }
+         except Exception:
+             pass
+
+         # If all sources fail
+         return None
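
The retry loop in fetch_crate_metadata backs off exponentially (wait = 2**attempt, i.e. 1 s, 2 s, 4 s, ...) before giving up and returning None, and _fetch_metadata falls back from crates.io to lib.rs to GitHub search. A minimal usage sketch of the two clients in the new version follows; the PipelineConfig constructor is not part of this hunk, so its keyword arguments here are an assumption:

    import os

    from rust_crate_pipeline.config import PipelineConfig
    from rust_crate_pipeline.network import CrateAPIClient, GitHubBatchClient

    # Hypothetical construction: only the github_token and max_retries fields
    # are visible in the hunk above, not the constructor signature.
    config = PipelineConfig(
        github_token=os.environ.get("GITHUB_TOKEN", ""),
        max_retries=3,
    )

    # Metadata with retry/backoff and the crates.io -> lib.rs -> GitHub fallback
    client = CrateAPIClient(config)
    metadata = client.fetch_crate_metadata("serde")
    if metadata:
        print(metadata["name"], metadata["version"], metadata["github_stars"])

    # Batched repository stats, rate-limit aware
    gh = GitHubBatchClient(config)
    stats = gh.batch_get_repo_stats(["https://github.com/serde-rs/serde"])

Passing a GitHub token raises the unauthenticated API rate limit and enables the stargazer lookups shown in the diff.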