skillnet-ai 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
skillnet_ai/creator.py ADDED
@@ -0,0 +1,1026 @@
import os
import json
import re
import ast
import time
import logging
from typing import List, Optional, Dict, Any

import requests
from openai import OpenAI
from skillnet_ai.prompts import (
    CANDIDATE_METADATA_SYSTEM_PROMPT,
    CANDIDATE_METADATA_USER_PROMPT_TEMPLATE,
    SKILL_CONTENT_SYSTEM_PROMPT,
    SKILL_CONTENT_USER_PROMPT_TEMPLATE,
    GITHUB_SKILL_SYSTEM_PROMPT,
    GITHUB_SKILL_USER_PROMPT_TEMPLATE,
    OFFICE_SKILL_SYSTEM_PROMPT,
    OFFICE_SKILL_USER_PROMPT_TEMPLATE,
    PROMPT_SKILL_SYSTEM_PROMPT,
    PROMPT_SKILL_USER_PROMPT_TEMPLATE,
)

logger = logging.getLogger(__name__)

class SkillCreator:
    """
    Creates Skill packages from execution trajectories using OpenAI-compatible LLMs.
    """

    def __init__(self, api_key: Optional[str] = None, base_url: Optional[str] = None, model: str = "gpt-4o"):
        self.api_key = api_key or os.getenv("API_KEY")
        self.base_url = base_url or os.getenv("BASE_URL") or "https://api.openai.com/v1"
        self.model = model

        if not self.api_key:
            raise ValueError("API key is missing. Please provide it in __init__ or set the API_KEY environment variable.")

        self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)

    def _get_llm_response(self, messages: List[dict], max_tokens: int = 4096) -> str:
        """Helper to call the LLM and return its text content."""
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=messages,
                max_tokens=max_tokens
            )
            # message.content can be None (e.g. an empty completion); normalize to "".
            return response.choices[0].message.content or ""
        except Exception as e:
            logger.error(f"LLM call failed: {e}")
            raise

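    # Usage sketch (editorial, not part of the package source): constructing the
    # creator against an OpenAI-compatible endpoint. The endpoint and model below
    # are illustrative assumptions.
    #
    #   creator = SkillCreator(
    #       api_key=os.environ["API_KEY"],
    #       base_url="https://api.openai.com/v1",
    #       model="gpt-4o",
    #   )
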
    def create_from_trajectory(self, trajectory: str, output_dir: str = ".") -> List[str]:
        """
        Main entry point: Analyze trajectory and create skill files.

        Args:
            trajectory: The string content of the user's action log/trajectory.
            output_dir: The directory where skills should be saved.

        Returns:
            List of paths to the created skill directories.
        """
        logger.info("Step 1: Analyzing trajectory to identify skills...")

        # 1. Create Metadata
        meta_messages = [
            {"role": "system", "content": CANDIDATE_METADATA_SYSTEM_PROMPT},
            {"role": "user", "content": CANDIDATE_METADATA_USER_PROMPT_TEMPLATE.format(trajectory=trajectory)}
        ]

        raw_meta_response = self._get_llm_response(meta_messages)
        candidates = self._parse_candidate_metadata(raw_meta_response)

        if not candidates:
            logger.warning("No skills identified in the trajectory.")
            return []

        created_paths = []

        # 2. Create Content for each candidate
        for cand in candidates:
            name = cand.get("name")
            description = cand.get("description")
            logger.info(f"Creating content for skill: {name}...")

            content_messages = [
                {"role": "system", "content": SKILL_CONTENT_SYSTEM_PROMPT},
                {"role": "user", "content": SKILL_CONTENT_USER_PROMPT_TEMPLATE.format(
                    trajectory=trajectory, name=name, description=description
                )}
            ]

            raw_content_response = self._get_llm_response(content_messages)

            # 3. Parse and Save Files
            self._save_skill_files(raw_content_response, output_dir)
            created_paths.append(os.path.join(output_dir, name))

        return created_paths

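    # Usage sketch (editorial): turning a saved action log into skill packages.
    # The log file name below is a stand-in for a real trajectory.
    #
    #   with open("trajectory.log", "r", encoding="utf-8") as f:
    #       paths = creator.create_from_trajectory(f.read(), output_dir="./skills")
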
    def _parse_candidate_metadata(self, llm_output: str) -> List[dict]:
        """Extract JSON from the LLM output tags."""
        try:
            # Look for content between <Skill_Candidate_Metadata> tags
            if "<Skill_Candidate_Metadata>" in llm_output:
                json_str = llm_output.split("<Skill_Candidate_Metadata>")[1].split("</Skill_Candidate_Metadata>")[0]
            else:
                # Fallback: treat the entire output as the JSON payload
                json_str = llm_output

            # Clean markdown code fences if present
            json_str = json_str.replace("```json", "").replace("```", "").strip()
            return json.loads(json_str)
        except Exception as e:
            logger.error(f"Failed to parse metadata JSON: {e}")
            return []

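    # Example (editorial) of the payload this parser expects; the tag name comes
    # from the code above, and the field names from create_from_trajectory
    # (cand.get("name") / cand.get("description")). The values are illustrative.
    #
    #   <Skill_Candidate_Metadata>
    #   [{"name": "resize-images", "description": "Batch-resize images from a folder"}]
    #   </Skill_Candidate_Metadata>
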
    def _save_skill_files(self, llm_output: str, output_base_dir: str) -> List[str]:
        """Parse the FILE blocks and write them to disk."""
        # Regex to find: ## FILE: path \n ```lang \n content \n ```
        pattern = re.compile(r'##\s*FILE:\s*(.+?)\s*\n```(?:\w*)\n(.*?)```', re.DOTALL)
        matches = pattern.findall(llm_output)

        created_files = []

        if not matches:
            logger.warning("No file blocks found in LLM output.")
            return created_files

        for file_path, content in matches:
            file_path = file_path.strip()
            full_path = os.path.join(output_base_dir, file_path)

            # Create directory if missing
            os.makedirs(os.path.dirname(full_path), exist_ok=True)

            try:
                with open(full_path, 'w', encoding='utf-8') as f:
                    f.write(content)
                logger.info(f"Saved: {full_path}")
                created_files.append(full_path)
            except IOError as e:
                logger.error(f"Failed to write {full_path}: {e}")

        return created_files

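    # Example (editorial) of a FILE block the regex above matches. The skill
    # name and frontmatter are illustrative; the "name:" field mirrors what
    # _validate_skill_content later checks for.
    #
    #   ## FILE: my-skill/SKILL.md
    #   ```markdown
    #   ---
    #   name: my-skill
    #   ---
    #   ```
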
    def create_from_office(
        self,
        file_path: str,
        output_dir: str = "./generated_skills"
    ) -> List[str]:
        """
        Create a skill package from an Office document (PDF, PPT, Word).

        Args:
            file_path: Path to the office document
            output_dir: Directory where new skills will be saved

        Returns:
            List of paths to created skill directories
        """
        logger.info(f"Creating skill from office document: {file_path}")

        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        if not _OfficeReader.is_supported(file_path):
            raise ValueError(
                f"Unsupported file type. Supported: {_OfficeReader.SUPPORTED_EXTENSIONS}"
            )

        # Extract text content
        document_content = _OfficeReader.extract_text(file_path)
        if not document_content.strip():
            logger.warning("No text content extracted from document")
            return []

        filename = os.path.basename(file_path)
        file_type = _OfficeReader.get_file_type(file_path)

        # Generate skill using LLM
        user_prompt = OFFICE_SKILL_USER_PROMPT_TEMPLATE.format(
            filename=filename,
            file_type=file_type,
            document_content=document_content
        )

        messages = [
            {"role": "system", "content": OFFICE_SKILL_SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt}
        ]

        try:
            response = self._get_llm_response(messages, max_tokens=8192)
            created_files = self._save_github_skill_files(response, output_dir)

            # Extract unique skill directories
            skill_dirs = set()
            for created_file in created_files:
                rel_path = os.path.relpath(created_file, output_dir)
                skill_dir = rel_path.split(os.sep)[0]
                skill_dirs.add(os.path.join(output_dir, skill_dir))

            logger.info(f"Skill created from office document: {file_path}")
            return list(skill_dirs)

        except Exception as e:
            logger.error(f"Failed to create skill from office document: {e}")
            return []

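    # Usage sketch (editorial): the document path is illustrative.
    #
    #   creator.create_from_office("quarterly-report.pptx", output_dir="./generated_skills")
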
    def create_from_prompt(
        self,
        user_input: str,
        output_dir: str = "./generated_skills"
    ) -> List[str]:
        """
        Create a skill package from a user's direct description.

        Args:
            user_input: User's description of the skill to create
            output_dir: Directory where new skills will be saved

        Returns:
            List of paths to created skill directories
        """
        logger.info("Creating skill from user prompt")

        if not user_input or not user_input.strip():
            raise ValueError("User input cannot be empty")

        # Generate skill using LLM
        user_prompt = PROMPT_SKILL_USER_PROMPT_TEMPLATE.format(
            user_input=user_input
        )

        messages = [
            {"role": "system", "content": PROMPT_SKILL_SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt}
        ]

        try:
            response = self._get_llm_response(messages, max_tokens=8192)
            created_files = self._save_github_skill_files(response, output_dir)

            # Extract unique skill directories
            skill_dirs = set()
            for created_file in created_files:
                rel_path = os.path.relpath(created_file, output_dir)
                skill_dir = rel_path.split(os.sep)[0]
                skill_dirs.add(os.path.join(output_dir, skill_dir))

            logger.info("Skill created from user input")
            return list(skill_dirs)

        except Exception as e:
            logger.error(f"Failed to create skill from user input: {e}")
            return []

    def create_from_github(
        self,
        github_url: str,
        output_dir: str = "./generated_skills",
        api_token: Optional[str] = None,
        max_files: int = 20
    ) -> List[str]:
        """Create a skill package from a GitHub repository."""
        logger.info(f"Creating skill from GitHub: {github_url}")

        try:
            fetcher = _GitHubFetcher(api_token=api_token)
            owner, repo, branch, _ = fetcher.parse_github_url(github_url)
            logger.info(f"Parsed: {owner}/{repo} @ {branch}")

            # Fetch repository data
            repo_data = self._fetch_github_repo_data(fetcher, owner, repo, branch, max_files)

            if not repo_data:
                logger.error("Failed to fetch repository data")
                return []

            # Generate skill content
            skill_content = self._generate_github_skill_content(repo_data)
            if not skill_content:
                logger.error("Failed to generate skill content")
                return []

            # Save skill package
            skill_name = repo_data["metadata"]["name"].lower().replace(" ", "-").replace("_", "-")
            created_files = self._save_github_skill_files(skill_content, output_dir)

            # Extract unique skill directories from created files
            skill_dirs = set()
            for file_path in created_files:
                rel_path = os.path.relpath(file_path, output_dir)
                skill_dir = rel_path.split(os.sep)[0]
                skill_dirs.add(os.path.join(output_dir, skill_dir))

            logger.info(f"Skill created successfully from GitHub: {github_url}")
            return list(skill_dirs) if skill_dirs else [os.path.join(output_dir, skill_name)]

        except Exception as e:
            logger.error(f"Failed to create skill from GitHub: {e}")
            return []

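    # Usage sketch (editorial): the repository URL is illustrative; given a
    # SkillCreator instance `creator`:
    #
    #   creator.create_from_github(
    #       "https://github.com/example-org/example-repo",
    #       output_dir="./generated_skills",
    #       api_token=os.getenv("GITHUB_TOKEN"),
    #   )
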
    def _fetch_github_repo_data(
        self,
        fetcher: "_GitHubFetcher",
        owner: str,
        repo: str,
        branch: str,
        max_files: int
    ) -> Optional[Dict[str, Any]]:
        """Fetch repository metadata, README, file tree, and code analysis."""
        logger.info("Fetching repository data...")

        metadata = fetcher.fetch_repo_metadata(owner, repo)
        branch = metadata.get("default_branch", branch)
        readme = fetcher.fetch_readme(owner, repo, branch)
        file_tree = fetcher.fetch_file_tree(owner, repo, branch)
        languages = fetcher.fetch_languages(owner, repo)
        code_analysis = self._analyze_github_code_files(
            fetcher, owner, repo, branch, file_tree, max_files
        )

        return {
            "metadata": metadata,
            "readme": readme,
            "file_tree": file_tree,
            "languages": languages,
            "code_analysis": code_analysis,
            "github_url": f"https://github.com/{owner}/{repo}"
        }

    def _analyze_github_code_files(
        self,
        fetcher: "_GitHubFetcher",
        owner: str,
        repo: str,
        branch: str,
        file_tree: List[Dict],
        max_files: int
    ) -> Dict[str, Any]:
        """Analyze Python files to extract class and function signatures."""
        logger.info("Analyzing Python files...")

        # Filter Python files
        py_files = [
            f for f in file_tree
            if f.get("type") == "file" and f.get("path", "").endswith(".py")
        ][:max_files]

        analyzed = []
        for file_info in py_files:
            file_path = file_info.get("path", "")
            content = fetcher.fetch_file_content(owner, repo, file_path, branch)

            if content:
                analysis = _PythonCodeAnalyzer.analyze(content, file_path)
                if analysis.get("classes") or analysis.get("functions"):
                    analyzed.append({"file": file_path, **analysis})
                    logger.debug(
                        f"Analyzed {file_path}: {len(analysis.get('classes', []))} classes, "
                        f"{len(analysis.get('functions', []))} functions"
                    )

        total_classes = sum(len(f.get("classes", [])) for f in analyzed)
        total_functions = sum(len(f.get("functions", [])) for f in analyzed)

        logger.info(
            f"Code analysis complete: {len(analyzed)} files, "
            f"{total_classes} classes, {total_functions} functions"
        )

        return {
            "files_analyzed": len(analyzed),
            "total_classes": total_classes,
            "total_functions": total_functions,
            "files": analyzed
        }

    def _generate_github_skill_content(
        self,
        repo_data: Dict[str, Any],
        max_retries: int = 2
    ) -> Optional[str]:
        """Generate skill content from repository data using LLM."""
        logger.info("Generating skill content with LLM...")

        metadata = repo_data["metadata"]
        code_summary = self._build_code_summary(repo_data.get("code_analysis", {}))
        file_tree_str = self._format_file_tree(repo_data.get("file_tree", [])[:100])

        languages = repo_data.get("languages", {})
        lang_str = ", ".join([f"{lang}: {pct}%" for lang, pct in languages.items()][:5])

        readme_content = repo_data.get("readme") or "No README available"
        readme_truncated = readme_content[:15000]

        user_prompt = GITHUB_SKILL_USER_PROMPT_TEMPLATE.format(
            repo_name=metadata.get("full_name", metadata.get("name", "unknown")),
            repo_url=repo_data.get("github_url", ""),
            repo_description=metadata.get("description") or "No description available",
            language=metadata.get("language") or "Unknown",
            languages_breakdown=lang_str or "N/A",
            stars=metadata.get("stars", 0),
            topics=", ".join(metadata.get("topics", [])) if metadata.get("topics") else "None",
            readme_content=readme_truncated,
            file_tree=file_tree_str,
            code_summary=code_summary
        )

        messages = [
            {"role": "system", "content": GITHUB_SKILL_SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt}
        ]

        # Retry mechanism with content validation
        for attempt in range(max_retries + 1):
            try:
                # Use higher max_tokens for complete response
                response = self._get_llm_response(messages, max_tokens=16384)

                # Validate response has required content
                if self._validate_skill_content(response):
                    return response

                if attempt < max_retries:
                    logger.warning(f"Generated content incomplete, retrying ({attempt + 1}/{max_retries})...")
                else:
                    logger.warning("Generated content may be incomplete, using best result.")
                    return response

            except Exception as e:
                logger.error(f"LLM call failed (attempt {attempt + 1}): {e}")
                if attempt == max_retries:
                    return None

        return None

    def _validate_skill_content(self, content: str) -> bool:
        """Check if generated content has required SKILL.md structure."""
        if not content:
            return False

        has_skill_md = "SKILL.md" in content
        has_file_block = "## FILE:" in content
        has_frontmatter = "---" in content and "name:" in content
        min_length = len(content) >= 1000

        return has_skill_md and has_file_block and has_frontmatter and min_length

    def _build_code_summary(self, code_analysis: Dict[str, Any]) -> str:
        """Build code analysis summary for LLM prompt."""
        if not code_analysis or not code_analysis.get("files"):
            return "No Python code analysis available."

        lines = [
            f"Analyzed {code_analysis.get('files_analyzed', 0)} Python files:",
            f"- Total Classes: {code_analysis.get('total_classes', 0)}",
            f"- Total Functions: {code_analysis.get('total_functions', 0)}",
            "",
            "Key components found:"
        ]

        # Add top classes and functions
        for file_data in code_analysis.get("files", [])[:5]:
            file_path = file_data.get("file", "")
            classes = file_data.get("classes", [])
            functions = file_data.get("functions", [])

            for cls in classes[:3]:
                docstring = (cls.get("docstring") or "")[:100]
                lines.append(f"- Class `{cls['name']}` in {file_path}")
                if docstring:
                    lines.append(f" {docstring}...")

            for func in functions[:3]:
                params = func.get("parameters", [])[:3]
                sig = f"{func['name']}({', '.join(params)})"
                lines.append(f"- Function `{sig}` in {file_path}")

        return "\n".join(lines[:30])

    def _format_file_tree(self, file_tree: List[Dict]) -> str:
        """Format file tree for LLM prompt."""
        if not file_tree:
            return "No file tree available."

        lines = []
        for item in file_tree[:50]:
            path = item.get("path", "")
            item_type = item.get("type", "file")
            icon = "📁" if item_type == "dir" else "📄"
            lines.append(f"{icon} {path}")

        if len(file_tree) > 50:
            lines.append(f"... and {len(file_tree) - 50} more files")

        return "\n".join(lines)

    def _save_github_skill_files(self, llm_output: str, output_base_dir: str) -> List[str]:
        """Parse FILE blocks and write to disk, handling nested code blocks."""
        created_files = []
        parts = re.split(r'##\s*FILE:\s*', llm_output)

        if len(parts) < 2:
            logger.warning("No file blocks found in LLM output.")
            return created_files

        for part in parts[1:]:
            lines = part.split('\n', 1)
            if len(lines) < 2:
                continue

            file_path = lines[0].strip()
            rest = lines[1]

            match = re.match(r'```(?:\w*)\n', rest)
            if not match:
                continue

            content_start = match.end()
            content = rest[content_start:]

            # Find closing ``` by tracking nested code blocks
            in_nested_block = False
            end_pos = -1
            i = 0

            while i < len(content):
                # Check for ``` at start of line
                if content[i:i+3] == '```' and (i == 0 or content[i-1] == '\n'):
                    if not in_nested_block:
                        after = content[i+3:i+50].split('\n')[0].strip()
                        if after == '':
                            end_pos = i
                            break
                        else:
                            in_nested_block = True
                    else:
                        in_nested_block = False
                i += 1

            if end_pos == -1:
                end_pos = content.rfind('\n```')
                if end_pos == -1:
                    end_pos = len(content)

            file_content = content[:end_pos]
            full_path = os.path.join(output_base_dir, file_path.strip())

            os.makedirs(os.path.dirname(full_path), exist_ok=True)

            try:
                with open(full_path, 'w', encoding='utf-8') as f:
                    f.write(file_content)
                logger.info(f"Saved: {full_path}")
                created_files.append(full_path)
            except IOError as e:
                logger.error(f"Failed to write {full_path}: {e}")

        return created_files

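# Example (editorial): _save_github_skill_files handles FILE blocks whose body
# itself contains a fenced code block. A fence followed by a language tag opens
# a nested block; the scanner only stops at a bare closing fence:
#
#   ## FILE: my-skill/SKILL.md
#   ```markdown
#   Run the helper like this:
#   ```python
#   print("hello")
#   ```
#   ```

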
class _GitHubFetcher:
    """Fetches content from GitHub repositories via API."""

    EXCLUDED_DIRS = {
        "node_modules", "__pycache__", ".git", ".venv", "venv",
        "env", ".env", "build", "dist", ".pytest_cache",
        ".mypy_cache", "htmlcov", ".tox", ".eggs"
    }

    def __init__(self, api_token: Optional[str] = None):
        self.api_token = api_token or os.getenv("GITHUB_TOKEN")
        self.session = requests.Session()
        self.session.headers.update({
            "Accept": "application/vnd.github.v3+json",
            "User-Agent": "SkillNet-AI/1.0"
        })
        if self.api_token:
            self.session.headers.update({"Authorization": f"token {self.api_token}"})

    def _request_with_retry(
        self,
        url: str,
        timeout: int = 10,
        max_retries: int = 3,
        base_delay: float = 1.0
    ) -> Optional[requests.Response]:
        """HTTP GET with exponential backoff and rate limit handling."""
        for attempt in range(1, max_retries + 1):
            try:
                response = self.session.get(url, timeout=timeout)

                if response.status_code == 403:
                    remaining = response.headers.get("X-RateLimit-Remaining", "?")
                    if remaining == "0":
                        reset_time = int(response.headers.get("X-RateLimit-Reset", 0))
                        wait_seconds = max(0, reset_time - int(time.time()))
                        logger.warning(f"GitHub rate limit exceeded. Resets in {wait_seconds}s")
                        if wait_seconds < 60:
                            time.sleep(wait_seconds + 1)
                            continue

                return response

            except requests.exceptions.Timeout:
                if attempt < max_retries:
                    delay = base_delay * (2 ** (attempt - 1))
                    logger.warning(f"Timeout (attempt {attempt}/{max_retries}), retry in {delay:.1f}s")
                    time.sleep(delay)
                else:
                    logger.error(f"Request failed after {max_retries} attempts: {url}")
                    return None

            except requests.exceptions.ConnectionError:
                if attempt < max_retries:
                    delay = base_delay * (2 ** (attempt - 1))
                    logger.warning(f"Connection error (attempt {attempt}/{max_retries}), retry in {delay:.1f}s")
                    time.sleep(delay)
                else:
                    logger.error(f"Connection failed after {max_retries} attempts: {url}")
                    return None

            except requests.exceptions.RequestException as e:
                logger.error(f"Request failed: {e}")
                return None

        return None

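    # Worked example (editorial): with the defaults above (base_delay=1.0,
    # max_retries=3), timeout/connection retries sleep 1.0s then 2.0s
    # (base_delay * 2 ** (attempt - 1)); a third failure returns None.
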
    def parse_github_url(self, url: str) -> tuple:
        """Parse GitHub URL to extract owner, repo, branch, and optional path."""
        url = url.rstrip("/")
        if url.endswith(".git"):
            url = url[:-4]

        if "github.com/" in url:
            parts = url.split("github.com/")[-1].split("/")
            if len(parts) < 2:
                raise ValueError(f"Invalid GitHub URL format: {url}")

            owner, repo = parts[0], parts[1]
            branch = "main"
            path = ""

            if len(parts) > 3 and parts[2] in ("tree", "blob"):
                branch = parts[3]
                path = "/".join(parts[4:]) if len(parts) > 4 else ""

            return owner, repo, branch, path

        raise ValueError(f"Invalid GitHub URL: {url}")

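    # Example (editorial): parsing a hypothetical "tree" URL.
    #
    #   _GitHubFetcher().parse_github_url(
    #       "https://github.com/example-org/example-repo/tree/dev/src/utils"
    #   )
    #   # -> ("example-org", "example-repo", "dev", "src/utils")
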
    def fetch_repo_metadata(self, owner: str, repo: str) -> Dict[str, Any]:
        """Fetch repository metadata from GitHub API."""
        url = f"https://api.github.com/repos/{owner}/{repo}"

        response = self._request_with_retry(url, timeout=10)
        if response is None:
            logger.warning("Failed to fetch repo metadata: request failed")
            return {"name": repo, "full_name": f"{owner}/{repo}"}

        try:
            response.raise_for_status()
            data = response.json()

            return {
                "name": data.get("name", repo),
                "full_name": data.get("full_name", f"{owner}/{repo}"),
                "description": data.get("description"),
                "url": data.get("html_url"),
                "homepage": data.get("homepage"),
                "stars": data.get("stargazers_count", 0),
                "forks": data.get("forks_count", 0),
                "language": data.get("language"),
                "topics": data.get("topics", []),
                "license_name": data.get("license", {}).get("name") if data.get("license") else None,
                "default_branch": data.get("default_branch", "main")
            }
        except requests.RequestException as e:
            logger.warning(f"Failed to fetch repo metadata: {e}")
            return {"name": repo, "full_name": f"{owner}/{repo}"}

    def fetch_readme(self, owner: str, repo: str, branch: str = "main") -> Optional[str]:
        """Fetch README content from repository."""
        readme_names = ["README.md", "README.rst", "README.txt", "README"]

        for readme_name in readme_names:
            url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{readme_name}"
            response = self._request_with_retry(url, timeout=10)
            if response and response.status_code == 200:
                logger.info(f"Found README: {readme_name}")
                return response.text

        logger.warning("No README found in repository")
        return None

    def fetch_file_tree(self, owner: str, repo: str, branch: str = "main") -> List[Dict]:
        """Fetch repository file tree structure."""
        url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/{branch}?recursive=1"

        response = self._request_with_retry(url, timeout=15)
        if response is None:
            logger.warning("Failed to fetch file tree: request failed")
            return []

        try:
            response.raise_for_status()
            data = response.json()

            file_tree = []
            for item in data.get("tree", []):
                path = item.get("path", "")

                # Skip excluded directories (match whole path components so that,
                # e.g., "env" does not also exclude "environment/")
                if any(part in self.EXCLUDED_DIRS for part in path.split("/")):
                    continue

                file_tree.append({
                    "path": path,
                    "type": "dir" if item.get("type") == "tree" else "file",
                    "size": item.get("size")
                })

            logger.info(f"Fetched file tree: {len(file_tree)} items")
            return file_tree

        except requests.RequestException as e:
            logger.warning(f"Failed to fetch file tree: {e}")
            return []

    def fetch_languages(self, owner: str, repo: str) -> Dict[str, float]:
        """Fetch language breakdown from GitHub API."""
        url = f"https://api.github.com/repos/{owner}/{repo}/languages"

        response = self._request_with_retry(url, timeout=10)
        if response is None:
            logger.warning("Failed to fetch languages: request failed")
            return {}

        try:
            response.raise_for_status()
            data = response.json()

            if not data:
                return {}

            total_bytes = sum(data.values())
            return {
                lang: round((bytes_count / total_bytes) * 100, 2)
                for lang, bytes_count in data.items()
            }

        except requests.RequestException as e:
            logger.warning(f"Failed to fetch languages: {e}")
            return {}

    def fetch_file_content(
        self, owner: str, repo: str, file_path: str, branch: str = "main"
    ) -> Optional[str]:
        """Fetch content of a specific file."""
        url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{file_path}"

        response = self._request_with_retry(url, timeout=10, max_retries=2)
        if response and response.status_code == 200:
            return response.text

        return None


class _PythonCodeAnalyzer:
    """Internal class for analyzing Python code using AST."""

    @staticmethod
    def analyze(content: str, file_path: str) -> Dict[str, Any]:
        """Analyze Python file to extract classes and functions."""
        try:
            tree = ast.parse(content)
        except SyntaxError as e:
            logger.debug(f"Syntax error in {file_path}: {e}")
            return {"classes": [], "functions": []}

        classes = []
        functions = []

        for node in ast.iter_child_nodes(tree):
            if isinstance(node, ast.ClassDef):
                classes.append(_PythonCodeAnalyzer._extract_class(node))
            elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                functions.append(_PythonCodeAnalyzer._extract_function(node))

        return {"classes": classes, "functions": functions}

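    # Example (editorial): analyzing a small snippet with the AST-based
    # analyzer above.
    #
    #   src = "def greet(name: str) -> str:\n    return f'hi {name}'\n"
    #   _PythonCodeAnalyzer.analyze(src, "greeter.py")
    #   # -> {"classes": [], "functions": [{"name": "greet",
    #   #      "parameters": ["name: str"], "return_type": "str", ...}]}
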
    @staticmethod
    def _extract_class(node: ast.ClassDef) -> Dict[str, Any]:
        """Extract class signature from AST node."""
        base_classes = []
        for base in node.bases:
            if isinstance(base, ast.Name):
                base_classes.append(base.id)
            elif isinstance(base, ast.Attribute):
                if isinstance(base.value, ast.Name):
                    base_classes.append(f"{base.value.id}.{base.attr}")
                else:
                    base_classes.append(base.attr)

        methods = []
        for item in node.body:
            if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
                methods.append(_PythonCodeAnalyzer._extract_function(item))

        docstring = ast.get_docstring(node)
        return {
            "name": node.name,
            "base_classes": base_classes,
            "docstring": docstring[:200] if docstring else None,
            "methods": methods
        }

    @staticmethod
    def _extract_function(node) -> Dict[str, Any]:
        """Extract function signature from AST node."""
        params = []
        for arg in node.args.args:
            param_name = arg.arg
            if arg.annotation:
                try:
                    param_name += f": {ast.unparse(arg.annotation)}"
                except Exception:
                    pass
            params.append(param_name)

        return_type = None
        if node.returns:
            try:
                return_type = ast.unparse(node.returns)
            except Exception:
                pass

        decorators = []
        for decorator in node.decorator_list:
            try:
                decorators.append(ast.unparse(decorator))
            except Exception:
                if isinstance(decorator, ast.Name):
                    decorators.append(decorator.id)

        docstring = ast.get_docstring(node)
        return {
            "name": node.name,
            "parameters": params,
            "return_type": return_type,
            "docstring": docstring[:200] if docstring else None,
            "is_async": isinstance(node, ast.AsyncFunctionDef),
            "decorators": decorators
        }


class _OfficeReader:
    """Extract text content from Office documents (PDF, PPT, Word)."""

    SUPPORTED_EXTENSIONS = {'.pdf', '.docx', '.doc', '.pptx', '.ppt'}

    @staticmethod
    def is_supported(file_path: str) -> bool:
        """Check if file type is supported."""
        ext = os.path.splitext(file_path)[1].lower()
        return ext in _OfficeReader.SUPPORTED_EXTENSIONS

    @staticmethod
    def get_file_type(file_path: str) -> str:
        """Get human-readable file type."""
        ext = os.path.splitext(file_path)[1].lower()
        type_map = {
            '.pdf': 'PDF Document',
            '.docx': 'Word Document',
            '.doc': 'Word Document (Legacy)',
            '.pptx': 'PowerPoint Presentation',
            '.ppt': 'PowerPoint Presentation (Legacy)'
        }
        return type_map.get(ext, 'Unknown')

    @staticmethod
    def extract_text(file_path: str, max_chars: int = 50000) -> str:
        """
        Extract text content from supported office documents.

        Args:
            file_path: Path to the office document
            max_chars: Maximum characters to extract

        Returns:
            Extracted text content

        Raises:
            ValueError: If file type not supported
            ImportError: If required library not installed
        """
        ext = os.path.splitext(file_path)[1].lower()

        if ext == '.pdf':
            return _OfficeReader._extract_pdf(file_path, max_chars)
        elif ext in ('.docx', '.doc'):
            return _OfficeReader._extract_word(file_path, max_chars)
        elif ext in ('.pptx', '.ppt'):
            return _OfficeReader._extract_ppt(file_path, max_chars)
        else:
            raise ValueError(f"Unsupported file type: {ext}")

    @staticmethod
    def _extract_pdf(file_path: str, max_chars: int) -> str:
        """Extract text from PDF file."""
        try:
            from PyPDF2 import PdfReader
        except ImportError:
            raise ImportError(
                "PyPDF2 is required for PDF extraction. "
                "Install with: pip install PyPDF2"
            )

        try:
            reader = PdfReader(file_path)
            text_parts = []
            total_chars = 0

            for page in reader.pages:
                page_text = page.extract_text() or ""
                if total_chars + len(page_text) > max_chars:
                    remaining = max_chars - total_chars
                    text_parts.append(page_text[:remaining])
                    break
                text_parts.append(page_text)
                total_chars += len(page_text)

            return "\n\n".join(text_parts)
        except Exception as e:
            logger.error(f"Failed to extract PDF text: {e}")
            raise

    @staticmethod
    def _extract_word(file_path: str, max_chars: int) -> str:
        """Extract text from Word document."""
        try:
            from docx import Document
        except ImportError:
            raise ImportError(
                "python-docx is required for Word extraction. "
                "Install with: pip install python-docx"
            )

        try:
            doc = Document(file_path)
            text_parts = []
            total_chars = 0

            for para in doc.paragraphs:
                para_text = para.text.strip()
                if not para_text:
                    continue
                if total_chars + len(para_text) > max_chars:
                    remaining = max_chars - total_chars
                    text_parts.append(para_text[:remaining])
                    break
                text_parts.append(para_text)
                total_chars += len(para_text)

            # Also extract text from tables
            for table in doc.tables:
                if total_chars >= max_chars:
                    break
                for row in table.rows:
                    row_text = " | ".join(cell.text.strip() for cell in row.cells)
                    if total_chars + len(row_text) > max_chars:
                        break
                    text_parts.append(row_text)
                    total_chars += len(row_text)

            return "\n\n".join(text_parts)
        except Exception as e:
            logger.error(f"Failed to extract Word text: {e}")
            raise

    @staticmethod
    def _extract_ppt(file_path: str, max_chars: int) -> str:
        """Extract text from PowerPoint presentation."""
        try:
            from pptx import Presentation
        except ImportError:
            raise ImportError(
                "python-pptx is required for PowerPoint extraction. "
                "Install with: pip install python-pptx"
            )

        try:
            prs = Presentation(file_path)
            text_parts = []
            total_chars = 0

            for slide_num, slide in enumerate(prs.slides, 1):
                slide_texts = [f"--- Slide {slide_num} ---"]

                for shape in slide.shapes:
                    if not shape.has_text_frame:
                        continue
                    for paragraph in shape.text_frame.paragraphs:
                        para_text = paragraph.text.strip()
                        if para_text:
                            slide_texts.append(para_text)

                slide_content = "\n".join(slide_texts)
                if total_chars + len(slide_content) > max_chars:
                    remaining = max_chars - total_chars
                    text_parts.append(slide_content[:remaining])
                    break
                text_parts.append(slide_content)
                total_chars += len(slide_content)

            return "\n\n".join(text_parts)
        except Exception as e:
            logger.error(f"Failed to extract PowerPoint text: {e}")
            raise