skillnet-ai 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- skillnet_ai/__init__.py +23 -0
- skillnet_ai/analyzer.py +222 -0
- skillnet_ai/cli.py +577 -0
- skillnet_ai/client.py +316 -0
- skillnet_ai/creator.py +1026 -0
- skillnet_ai/downloader.py +156 -0
- skillnet_ai/evaluator.py +1006 -0
- skillnet_ai/models.py +41 -0
- skillnet_ai/prompts.py +885 -0
- skillnet_ai/searcher.py +100 -0
- skillnet_ai-0.0.3.dist-info/METADATA +369 -0
- skillnet_ai-0.0.3.dist-info/RECORD +16 -0
- {skillnet_ai-0.0.1.dist-info → skillnet_ai-0.0.3.dist-info}/WHEEL +1 -1
- skillnet_ai-0.0.3.dist-info/entry_points.txt +2 -0
- skillnet_ai-0.0.3.dist-info/licenses/LICENSE +21 -0
- skillnet_ai-0.0.1.dist-info/METADATA +0 -20
- skillnet_ai-0.0.1.dist-info/RECORD +0 -5
- {skillnet_ai-0.0.1.dist-info → skillnet_ai-0.0.3.dist-info}/top_level.txt +0 -0
skillnet_ai/creator.py
ADDED
@@ -0,0 +1,1026 @@
import os
import json
import re
import ast
import logging
from typing import List, Optional, Dict, Any

import requests
from openai import OpenAI
from skillnet_ai.prompts import (
    CANDIDATE_METADATA_SYSTEM_PROMPT,
    CANDIDATE_METADATA_USER_PROMPT_TEMPLATE,
    SKILL_CONTENT_SYSTEM_PROMPT,
    SKILL_CONTENT_USER_PROMPT_TEMPLATE,
    GITHUB_SKILL_SYSTEM_PROMPT,
    GITHUB_SKILL_USER_PROMPT_TEMPLATE,
    OFFICE_SKILL_SYSTEM_PROMPT,
    OFFICE_SKILL_USER_PROMPT_TEMPLATE,
    PROMPT_SKILL_SYSTEM_PROMPT,
    PROMPT_SKILL_USER_PROMPT_TEMPLATE
)

logger = logging.getLogger(__name__)

class SkillCreator:
    """
    Creates Skill packages from execution trajectories using OpenAI-compatible LLMs.
    """

    def __init__(self, api_key: Optional[str] = None, base_url: Optional[str] = None, model: str = "gpt-4o"):
        self.api_key = api_key or os.getenv("API_KEY")
        self.base_url = base_url or os.getenv("BASE_URL") or "https://api.openai.com/v1"
        self.model = model

        if not self.api_key:
            raise ValueError("API Key is missing. Please provide it in init or set API_KEY environment variable.")

        self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)

    def _get_llm_response(self, messages: List[dict], max_tokens: int = 4096) -> str:
        """Helper to call LLM and get string content."""
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=messages,
                max_tokens=max_tokens
            )
            return response.choices[0].message.content
        except Exception as e:
            logger.error(f"LLM Call Failed: {e}")
            raise

    def create_from_trajectory(self, trajectory: str, output_dir: str = ".") -> List[str]:
        """
        Main entry point: Analyze trajectory and create skill files.

        Args:
            trajectory: The string content of the user's action log/trajectory.
            output_dir: The directory where skills should be saved.

        Returns:
            List of paths to the created skill directories.
        """
        logger.info("Step 1: Analyzing trajectory to identify skills...")

        # 1. Create Metadata
        meta_messages = [
            {"role": "system", "content": CANDIDATE_METADATA_SYSTEM_PROMPT},
            {"role": "user", "content": CANDIDATE_METADATA_USER_PROMPT_TEMPLATE.format(trajectory=trajectory)}
        ]

        raw_meta_response = self._get_llm_response(meta_messages)
        candidates = self._parse_candidate_metadata(raw_meta_response)

        if not candidates:
            logger.warning("No skills identified in the trajectory.")
            return []

        created_paths = []

        # 2. Create Content for each candidate
        for cand in candidates:
            name = cand.get("name")
            description = cand.get("description")
            logger.info(f"Creating content for skill: {name}...")

            content_messages = [
                {"role": "system", "content": SKILL_CONTENT_SYSTEM_PROMPT},
                {"role": "user", "content": SKILL_CONTENT_USER_PROMPT_TEMPLATE.format(
                    trajectory=trajectory, name=name, description=description
                )}
            ]

            raw_content_response = self._get_llm_response(content_messages)

            # 3. Parse and Save Files
            self._save_skill_files(raw_content_response, output_dir)
            created_paths.append(os.path.join(output_dir, name))

        return created_paths

    def _parse_candidate_metadata(self, llm_output: str) -> List[dict]:
        """Extract JSON from the LLM output tags."""
        try:
            # Look for content between <Skill_Candidate_Metadata> tags
            if "<Skill_Candidate_Metadata>" in llm_output:
                json_str = llm_output.split("<Skill_Candidate_Metadata>")[1].split("</Skill_Candidate_Metadata>")[0]
            else:
                # Fallback: try to find the first JSON list block
                json_str = llm_output

            # clean markdown code blocks if present
            json_str = json_str.replace("```json", "").replace("```", "").strip()
            return json.loads(json_str)
        except Exception as e:
            logger.error(f"Failed to parse metadata JSON: {e}")
            return []

    def _save_skill_files(self, llm_output: str, output_base_dir: str) -> List[str]:
        """Parse the FILE blocks and write them to disk."""
        # Regex to find: ## FILE: path \n ```lang \n content \n ```
        pattern = re.compile(r'##\s*FILE:\s*(.+?)\s*\n```(?:\w*)\n(.*?)```', re.DOTALL)
        matches = pattern.findall(llm_output)

        created_files = []

        if not matches:
            logger.warning("No file blocks found in LLM output.")
            return created_files

        for file_path, content in matches:
            file_path = file_path.strip()
            full_path = os.path.join(output_base_dir, file_path)

            # Create directory if missing
            os.makedirs(os.path.dirname(full_path), exist_ok=True)

            try:
                with open(full_path, 'w', encoding='utf-8') as f:
                    f.write(content)
                logger.info(f"Saved: {full_path}")
                created_files.append(full_path)
            except IOError as e:
                logger.error(f"Failed to write {full_path}: {e}")

        return created_files
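    # NOTE (illustrative, not part of the original file): given the regex
    # above, _save_skill_files expects LLM output shaped like the block below,
    # with the path after "## FILE:" taken relative to output_base_dir.
    # The skill name "my-skill" and the frontmatter fields are hypothetical:
    #
    #   ## FILE: my-skill/SKILL.md
    #   ```markdown
    #   ---
    #   name: my-skill
    #   description: One-line summary of the skill
    #   ---
    #   Instructions for the skill...
    #   ```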
    def create_from_office(
        self,
        file_path: str,
        output_dir: str = "./generated_skills"
    ) -> List[str]:
        """
        Create a skill package from an Office document (PDF, PPT, Word).

        Args:
            file_path: Path to the office document
            output_dir: Directory where new skills will be saved

        Returns:
            List of paths to created skill directories
        """
        logger.info(f"Creating skill from office document: {file_path}")

        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        if not _OfficeReader.is_supported(file_path):
            raise ValueError(
                f"Unsupported file type. Supported: {_OfficeReader.SUPPORTED_EXTENSIONS}"
            )

        # Extract text content
        document_content = _OfficeReader.extract_text(file_path)
        if not document_content.strip():
            logger.warning("No text content extracted from document")
            return []

        filename = os.path.basename(file_path)
        file_type = _OfficeReader.get_file_type(file_path)

        # Generate skill using LLM
        user_prompt = OFFICE_SKILL_USER_PROMPT_TEMPLATE.format(
            filename=filename,
            file_type=file_type,
            document_content=document_content
        )

        messages = [
            {"role": "system", "content": OFFICE_SKILL_SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt}
        ]

        try:
            response = self._get_llm_response(messages, max_tokens=8192)
            created_files = self._save_github_skill_files(response, output_dir)

            # Extract unique skill directories
            skill_dirs = set()
            for created_file in created_files:
                rel_path = os.path.relpath(created_file, output_dir)
                skill_dir = rel_path.split(os.sep)[0]
                skill_dirs.add(os.path.join(output_dir, skill_dir))

            logger.info(f"Skill created from office document: {file_path}")
            return list(skill_dirs)

        except Exception as e:
            logger.error(f"Failed to create skill from office document: {e}")
            return []

    def create_from_prompt(
        self,
        user_input: str,
        output_dir: str = "./generated_skills"
    ) -> List[str]:
        """
        Create a skill package from user's direct description.

        Args:
            user_input: User's description of the skill to create
            output_dir: Directory where new skills will be saved

        Returns:
            List of paths to created skill directories
        """
        logger.info("Creating skill from user prompt")

        if not user_input or not user_input.strip():
            raise ValueError("User input cannot be empty")

        # Generate skill using LLM
        user_prompt = PROMPT_SKILL_USER_PROMPT_TEMPLATE.format(
            user_input=user_input
        )

        messages = [
            {"role": "system", "content": PROMPT_SKILL_SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt}
        ]

        try:
            response = self._get_llm_response(messages, max_tokens=8192)
            created_files = self._save_github_skill_files(response, output_dir)

            # Extract unique skill directories
            skill_dirs = set()
            for created_file in created_files:
                rel_path = os.path.relpath(created_file, output_dir)
                skill_dir = rel_path.split(os.sep)[0]
                skill_dirs.add(os.path.join(output_dir, skill_dir))

            logger.info("Skill created from user input")
            return list(skill_dirs)

        except Exception as e:
            logger.error(f"Failed to create skill from user input: {e}")
            return []

    def create_from_github(
        self,
        github_url: str,
        output_dir: str = "./generated_skills",
        api_token: Optional[str] = None,
        max_files: int = 20
    ) -> List[str]:
        """Create a skill package from a GitHub repository."""
        logger.info(f"Creating skill from GitHub: {github_url}")

        try:
            fetcher = _GitHubFetcher(api_token=api_token)
            owner, repo, branch, _ = fetcher.parse_github_url(github_url)
            logger.info(f"Parsed: {owner}/{repo} @ {branch}")

            # Fetch repository data
            repo_data = self._fetch_github_repo_data(fetcher, owner, repo, branch, max_files)

            if not repo_data:
                logger.error("Failed to fetch repository data")
                return []

            # Generate skill content
            skill_content = self._generate_github_skill_content(repo_data)
            if not skill_content:
                logger.error("Failed to generate skill content")
                return []

            # Save skill package
            skill_name = repo_data["metadata"]["name"].lower().replace(" ", "-").replace("_", "-")
            created_files = self._save_github_skill_files(skill_content, output_dir)

            # Extract unique skill directories from created files
            skill_dirs = set()
            for file_path in created_files:
                rel_path = os.path.relpath(file_path, output_dir)
                skill_dir = rel_path.split(os.sep)[0]
                skill_dirs.add(os.path.join(output_dir, skill_dir))

            logger.info(f"Skill created successfully from GitHub: {github_url}")
            return list(skill_dirs) if skill_dirs else [os.path.join(output_dir, skill_name)]

        except Exception as e:
            logger.error(f"Failed to create skill from GitHub: {e}")
            return []

    def _fetch_github_repo_data(
        self,
        fetcher: "_GitHubFetcher",
        owner: str,
        repo: str,
        branch: str,
        max_files: int
    ) -> Optional[Dict[str, Any]]:
        """Fetch repository metadata, README, file tree, and code analysis."""
        logger.info("Fetching repository data...")

        metadata = fetcher.fetch_repo_metadata(owner, repo)
        branch = metadata.get("default_branch", branch)
        readme = fetcher.fetch_readme(owner, repo, branch)
        file_tree = fetcher.fetch_file_tree(owner, repo, branch)
        languages = fetcher.fetch_languages(owner, repo)
        code_analysis = self._analyze_github_code_files(
            fetcher, owner, repo, branch, file_tree, max_files
        )

        return {
            "metadata": metadata,
            "readme": readme,
            "file_tree": file_tree,
            "languages": languages,
            "code_analysis": code_analysis,
            "github_url": f"https://github.com/{owner}/{repo}"
        }

    def _analyze_github_code_files(
        self,
        fetcher: "_GitHubFetcher",
        owner: str,
        repo: str,
        branch: str,
        file_tree: List[Dict],
        max_files: int
    ) -> Dict[str, Any]:
        """Analyze Python files to extract class and function signatures."""
        logger.info("Analyzing Python files...")

        # Filter Python files
        py_files = [
            f for f in file_tree
            if f.get("type") == "file" and f.get("path", "").endswith(".py")
        ][:max_files]

        analyzed = []
        for file_info in py_files:
            file_path = file_info.get("path", "")
            content = fetcher.fetch_file_content(owner, repo, file_path, branch)

            if content:
                analysis = _PythonCodeAnalyzer.analyze(content, file_path)
                if analysis.get("classes") or analysis.get("functions"):
                    analyzed.append({"file": file_path, **analysis})
                    logger.debug(
                        f"Analyzed {file_path}: {len(analysis.get('classes', []))} classes, "
                        f"{len(analysis.get('functions', []))} functions"
                    )

        total_classes = sum(len(f.get("classes", [])) for f in analyzed)
        total_functions = sum(len(f.get("functions", [])) for f in analyzed)

        logger.info(
            f"Code analysis complete: {len(analyzed)} files, "
            f"{total_classes} classes, {total_functions} functions"
        )

        return {
            "files_analyzed": len(analyzed),
            "total_classes": total_classes,
            "total_functions": total_functions,
            "files": analyzed
        }

    def _generate_github_skill_content(
        self,
        repo_data: Dict[str, Any],
        max_retries: int = 2
    ) -> Optional[str]:
        """Generate skill content from repository data using LLM."""
        logger.info("Generating skill content with LLM...")

        metadata = repo_data["metadata"]
        code_summary = self._build_code_summary(repo_data.get("code_analysis", {}))
        file_tree_str = self._format_file_tree(repo_data.get("file_tree", [])[:100])

        languages = repo_data.get("languages", {})
        lang_str = ", ".join([f"{lang}: {pct}%" for lang, pct in languages.items()][:5])

        readme_content = repo_data.get("readme") or "No README available"
        readme_truncated = readme_content[:15000]

        user_prompt = GITHUB_SKILL_USER_PROMPT_TEMPLATE.format(
            repo_name=metadata.get("full_name", metadata.get("name", "unknown")),
            repo_url=repo_data.get("github_url", ""),
            repo_description=metadata.get("description") or "No description available",
            language=metadata.get("language") or "Unknown",
            languages_breakdown=lang_str or "N/A",
            stars=metadata.get("stars", 0),
            topics=", ".join(metadata.get("topics", [])) if metadata.get("topics") else "None",
            readme_content=readme_truncated,
            file_tree=file_tree_str,
            code_summary=code_summary
        )

        messages = [
            {"role": "system", "content": GITHUB_SKILL_SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt}
        ]

        # Retry mechanism with content validation
        for attempt in range(max_retries + 1):
            try:
                # Use higher max_tokens for complete response
                response = self._get_llm_response(messages, max_tokens=16384)

                # Validate response has required content
                if self._validate_skill_content(response):
                    return response

                if attempt < max_retries:
                    logger.warning(f"Generated content incomplete, retrying ({attempt + 1}/{max_retries})...")
                else:
                    logger.warning("Generated content may be incomplete, using best result.")
                    return response

            except Exception as e:
                logger.error(f"LLM call failed (attempt {attempt + 1}): {e}")
                if attempt == max_retries:
                    return None

        return None

    def _validate_skill_content(self, content: str) -> bool:
        """Check if generated content has required SKILL.md structure."""
        if not content:
            return False

        has_skill_md = "SKILL.md" in content
        has_file_block = "## FILE:" in content
        has_frontmatter = "---" in content and "name:" in content
        min_length = len(content) >= 1000

        return has_skill_md and has_file_block and has_frontmatter and min_length

    def _build_code_summary(self, code_analysis: Dict[str, Any]) -> str:
        """Build code analysis summary for LLM prompt."""
        if not code_analysis or not code_analysis.get("files"):
            return "No Python code analysis available."

        lines = [
            f"Analyzed {code_analysis.get('files_analyzed', 0)} Python files:",
            f"- Total Classes: {code_analysis.get('total_classes', 0)}",
            f"- Total Functions: {code_analysis.get('total_functions', 0)}",
            "",
            "Key components found:"
        ]

        # Add top classes and functions
        for file_data in code_analysis.get("files", [])[:5]:
            file_path = file_data.get("file", "")
            classes = file_data.get("classes", [])
            functions = file_data.get("functions", [])

            for cls in classes[:3]:
                docstring = (cls.get("docstring") or "")[:100]
                lines.append(f"- Class `{cls['name']}` in {file_path}")
                if docstring:
                    lines.append(f"  {docstring}...")

            for func in functions[:3]:
                params = func.get("parameters", [])[:3]
                sig = f"{func['name']}({', '.join(params)})"
                lines.append(f"- Function `{sig}` in {file_path}")

        return "\n".join(lines[:30])

    def _format_file_tree(self, file_tree: List[Dict]) -> str:
        """Format file tree for LLM prompt."""
        if not file_tree:
            return "No file tree available."

        lines = []
        for item in file_tree[:50]:
            path = item.get("path", "")
            item_type = item.get("type", "file")
            icon = "📁" if item_type == "dir" else "📄"
            lines.append(f"{icon} {path}")

        if len(file_tree) > 50:
            lines.append(f"... and {len(file_tree) - 50} more files")

        return "\n".join(lines)

    def _save_github_skill_files(self, llm_output: str, output_base_dir: str) -> List[str]:
        """Parse FILE blocks and write to disk, handling nested code blocks."""
        created_files = []
        parts = re.split(r'##\s*FILE:\s*', llm_output)

        if len(parts) < 2:
            logger.warning("No file blocks found in LLM output.")
            return created_files

        for part in parts[1:]:
            lines = part.split('\n', 1)
            if len(lines) < 2:
                continue

            file_path = lines[0].strip()
            rest = lines[1]

            match = re.match(r'```(?:\w*)\n', rest)
            if not match:
                continue

            content_start = match.end()
            content = rest[content_start:]

            # Find closing ``` by tracking nested code blocks
            in_nested_block = False
            end_pos = -1
            i = 0

            while i < len(content):
                # Check for ``` at start of line
                if content[i:i+3] == '```' and (i == 0 or content[i-1] == '\n'):
                    if not in_nested_block:
                        after = content[i+3:i+50].split('\n')[0].strip()
                        if after == '':
                            end_pos = i
                            break
                        else:
                            in_nested_block = True
                    else:
                        in_nested_block = False
                i += 1

            if end_pos == -1:
                end_pos = content.rfind('\n```')
                if end_pos == -1:
                    end_pos = len(content)

            file_content = content[:end_pos]
            full_path = os.path.join(output_base_dir, file_path.strip())

            os.makedirs(os.path.dirname(full_path), exist_ok=True)

            try:
                with open(full_path, 'w', encoding='utf-8') as f:
                    f.write(file_content)
                logger.info(f"Saved: {full_path}")
                created_files.append(full_path)
            except IOError as e:
                logger.error(f"Failed to write {full_path}: {e}")

        return created_files
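
# NOTE (illustrative sketch, not part of the original file): assuming a valid
# API_KEY and an existing action log, SkillCreator can be driven like this;
# the file name "trajectory.log" and the model name are hypothetical:
#
#   creator = SkillCreator(model="gpt-4o")
#   with open("trajectory.log", encoding="utf-8") as f:
#       skill_dirs = creator.create_from_trajectory(f.read(), output_dir="./skills")
#   # skill_dirs -> ["./skills/<skill-name>", ...] on success, [] otherwise
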
class _GitHubFetcher:
    """Fetches content from GitHub repositories via API."""

    EXCLUDED_DIRS = {
        "node_modules", "__pycache__", ".git", ".venv", "venv",
        "env", ".env", "build", "dist", ".pytest_cache",
        ".mypy_cache", "htmlcov", ".tox", ".eggs"
    }

    def __init__(self, api_token: Optional[str] = None):
        self.api_token = api_token or os.getenv("GITHUB_TOKEN")
        self.session = requests.Session()
        self.session.headers.update({
            "Accept": "application/vnd.github.v3+json",
            "User-Agent": "SkillNet-AI/1.0"
        })
        if self.api_token:
            self.session.headers.update({"Authorization": f"token {self.api_token}"})

    def _request_with_retry(
        self,
        url: str,
        timeout: int = 10,
        max_retries: int = 3,
        base_delay: float = 1.0
    ) -> Optional[requests.Response]:
        """HTTP GET with exponential backoff and rate limit handling."""
        import time

        for attempt in range(1, max_retries + 1):
            try:
                response = self.session.get(url, timeout=timeout)

                if response.status_code == 403:
                    remaining = response.headers.get("X-RateLimit-Remaining", "?")
                    if remaining == "0":
                        reset_time = int(response.headers.get("X-RateLimit-Reset", 0))
                        wait_seconds = max(0, reset_time - int(time.time()))
                        logger.warning(f"GitHub rate limit exceeded. Resets in {wait_seconds}s")
                        if wait_seconds < 60:
                            time.sleep(wait_seconds + 1)
                            continue

                return response

            except requests.exceptions.Timeout:
                if attempt < max_retries:
                    delay = base_delay * (2 ** (attempt - 1))
                    logger.warning(f"Timeout (attempt {attempt}/{max_retries}), retry in {delay:.1f}s")
                    time.sleep(delay)
                else:
                    logger.error(f"Request failed after {max_retries} attempts: {url}")
                    return None

            except requests.exceptions.ConnectionError:
                if attempt < max_retries:
                    delay = base_delay * (2 ** (attempt - 1))
                    logger.warning(f"Connection error (attempt {attempt}/{max_retries}), retry in {delay:.1f}s")
                    time.sleep(delay)
                else:
                    logger.error(f"Connection failed after {max_retries} attempts: {url}")
                    return None

            except requests.exceptions.RequestException as e:
                logger.error(f"Request failed: {e}")
                return None

        return None

    def parse_github_url(self, url: str) -> tuple:
        """Parse GitHub URL to extract owner, repo, branch, and optional path."""
        url = url.rstrip("/")
        if url.endswith(".git"):
            url = url[:-4]

        if "github.com/" in url:
            parts = url.split("github.com/")[-1].split("/")
            if len(parts) < 2:
                raise ValueError(f"Invalid GitHub URL format: {url}")

            owner, repo = parts[0], parts[1]
            branch = "main"
            path = ""

            if len(parts) > 3 and parts[2] in ("tree", "blob"):
                branch = parts[3]
                path = "/".join(parts[4:]) if len(parts) > 4 else ""

            return owner, repo, branch, path

        raise ValueError(f"Invalid GitHub URL: {url}")

    def fetch_repo_metadata(self, owner: str, repo: str) -> Dict[str, Any]:
        """Fetch repository metadata from GitHub API."""
        url = f"https://api.github.com/repos/{owner}/{repo}"

        response = self._request_with_retry(url, timeout=10)
        if response is None:
            logger.warning("Failed to fetch repo metadata: request failed")
            return {"name": repo, "full_name": f"{owner}/{repo}"}

        try:
            response.raise_for_status()
            data = response.json()

            return {
                "name": data.get("name", repo),
                "full_name": data.get("full_name", f"{owner}/{repo}"),
                "description": data.get("description"),
                "url": data.get("html_url"),
                "homepage": data.get("homepage"),
                "stars": data.get("stargazers_count", 0),
                "forks": data.get("forks_count", 0),
                "language": data.get("language"),
                "topics": data.get("topics", []),
                "license_name": data.get("license", {}).get("name") if data.get("license") else None,
                "default_branch": data.get("default_branch", "main")
            }
        except requests.RequestException as e:
            logger.warning(f"Failed to fetch repo metadata: {e}")
            return {"name": repo, "full_name": f"{owner}/{repo}"}

    def fetch_readme(self, owner: str, repo: str, branch: str = "main") -> Optional[str]:
        """Fetch README content from repository."""
        readme_names = ["README.md", "README.rst", "README.txt", "README"]

        for readme_name in readme_names:
            url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{readme_name}"
            response = self._request_with_retry(url, timeout=10)
            if response and response.status_code == 200:
                logger.info(f"Found README: {readme_name}")
                return response.text

        logger.warning("No README found in repository")
        return None

    def fetch_file_tree(self, owner: str, repo: str, branch: str = "main") -> List[Dict]:
        """Fetch repository file tree structure."""
        url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/{branch}?recursive=1"

        response = self._request_with_retry(url, timeout=15)
        if response is None:
            logger.warning("Failed to fetch file tree: request failed")
            return []

        try:
            response.raise_for_status()
            data = response.json()

            file_tree = []
            for item in data.get("tree", []):
                path = item.get("path", "")

                # Skip excluded directories
                if any(excluded in path for excluded in self.EXCLUDED_DIRS):
                    continue

                file_tree.append({
                    "path": path,
                    "type": "dir" if item.get("type") == "tree" else "file",
                    "size": item.get("size")
                })

            logger.info(f"Fetched file tree: {len(file_tree)} items")
            return file_tree

        except requests.RequestException as e:
            logger.warning(f"Failed to fetch file tree: {e}")
            return []

    def fetch_languages(self, owner: str, repo: str) -> Dict[str, float]:
        """Fetch language breakdown from GitHub API."""
        url = f"https://api.github.com/repos/{owner}/{repo}/languages"

        response = self._request_with_retry(url, timeout=10)
        if response is None:
            logger.warning("Failed to fetch languages: request failed")
            return {}

        try:
            response.raise_for_status()
            data = response.json()

            if not data:
                return {}

            total_bytes = sum(data.values())
            return {
                lang: round((bytes_count / total_bytes) * 100, 2)
                for lang, bytes_count in data.items()
            }

        except requests.RequestException as e:
            logger.warning(f"Failed to fetch languages: {e}")
            return {}

    def fetch_file_content(
        self, owner: str, repo: str, file_path: str, branch: str = "main"
    ) -> Optional[str]:
        """Fetch content of a specific file."""
        url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{file_path}"

        response = self._request_with_retry(url, timeout=10, max_retries=2)
        if response and response.status_code == 200:
            return response.text

        return None
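
# NOTE (illustrative, not part of the original file): given the parsing rules
# above, parse_github_url behaves like this (URLs are hypothetical examples):
#
#   fetcher = _GitHubFetcher()
#   fetcher.parse_github_url("https://github.com/octocat/hello-world")
#   # -> ("octocat", "hello-world", "main", "")
#   fetcher.parse_github_url("https://github.com/octocat/hello-world/tree/dev/src")
#   # -> ("octocat", "hello-world", "dev", "src")
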
class _PythonCodeAnalyzer:
    """Internal class for analyzing Python code using AST."""

    @staticmethod
    def analyze(content: str, file_path: str) -> Dict[str, Any]:
        """Analyze Python file to extract classes and functions."""
        try:
            tree = ast.parse(content)
        except SyntaxError as e:
            logger.debug(f"Syntax error in {file_path}: {e}")
            return {"classes": [], "functions": []}

        classes = []
        functions = []

        for node in ast.iter_child_nodes(tree):
            if isinstance(node, ast.ClassDef):
                classes.append(_PythonCodeAnalyzer._extract_class(node))
            elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                functions.append(_PythonCodeAnalyzer._extract_function(node))

        return {"classes": classes, "functions": functions}

    @staticmethod
    def _extract_class(node: ast.ClassDef) -> Dict[str, Any]:
        """Extract class signature from AST node."""
        base_classes = []
        for base in node.bases:
            if isinstance(base, ast.Name):
                base_classes.append(base.id)
            elif isinstance(base, ast.Attribute):
                if isinstance(base.value, ast.Name):
                    base_classes.append(f"{base.value.id}.{base.attr}")
                else:
                    base_classes.append(base.attr)

        methods = []
        for item in node.body:
            if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
                methods.append(_PythonCodeAnalyzer._extract_function(item))

        docstring = ast.get_docstring(node)
        return {
            "name": node.name,
            "base_classes": base_classes,
            "docstring": docstring[:200] if docstring else None,
            "methods": methods
        }

    @staticmethod
    def _extract_function(node) -> Dict[str, Any]:
        """Extract function signature from AST node."""
        params = []
        for arg in node.args.args:
            param_name = arg.arg
            if arg.annotation:
                try:
                    param_name += f": {ast.unparse(arg.annotation)}"
                except Exception:
                    pass
            params.append(param_name)

        return_type = None
        if node.returns:
            try:
                return_type = ast.unparse(node.returns)
            except Exception:
                pass

        decorators = []
        for decorator in node.decorator_list:
            try:
                decorators.append(ast.unparse(decorator))
            except Exception:
                if isinstance(decorator, ast.Name):
                    decorators.append(decorator.id)

        docstring = ast.get_docstring(node)
        return {
            "name": node.name,
            "parameters": params,
            "return_type": return_type,
            "docstring": docstring[:200] if docstring else None,
            "is_async": isinstance(node, ast.AsyncFunctionDef),
            "decorators": decorators
        }
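
# NOTE (illustrative, not part of the original file): on a minimal module the
# analyzer returns plain dicts; "math_utils.py" is a hypothetical file name:
#
#   src = "def add(a: int, b: int) -> int:\n    return a + b\n"
#   _PythonCodeAnalyzer.analyze(src, "math_utils.py")
#   # -> {"classes": [], "functions": [{"name": "add",
#   #      "parameters": ["a: int", "b: int"], "return_type": "int",
#   #      "docstring": None, "is_async": False, "decorators": []}]}
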
class _OfficeReader:
    """Extract text content from Office documents (PDF, PPT, Word)."""

    SUPPORTED_EXTENSIONS = {'.pdf', '.docx', '.doc', '.pptx', '.ppt'}

    @staticmethod
    def is_supported(file_path: str) -> bool:
        """Check if file type is supported."""
        ext = os.path.splitext(file_path)[1].lower()
        return ext in _OfficeReader.SUPPORTED_EXTENSIONS

    @staticmethod
    def get_file_type(file_path: str) -> str:
        """Get human-readable file type."""
        ext = os.path.splitext(file_path)[1].lower()
        type_map = {
            '.pdf': 'PDF Document',
            '.docx': 'Word Document',
            '.doc': 'Word Document (Legacy)',
            '.pptx': 'PowerPoint Presentation',
            '.ppt': 'PowerPoint Presentation (Legacy)'
        }
        return type_map.get(ext, 'Unknown')

    @staticmethod
    def extract_text(file_path: str, max_chars: int = 50000) -> str:
        """
        Extract text content from supported office documents.

        Args:
            file_path: Path to the office document
            max_chars: Maximum characters to extract

        Returns:
            Extracted text content

        Raises:
            ValueError: If file type not supported
            ImportError: If required library not installed
        """
        ext = os.path.splitext(file_path)[1].lower()

        if ext == '.pdf':
            return _OfficeReader._extract_pdf(file_path, max_chars)
        elif ext in ('.docx', '.doc'):
            return _OfficeReader._extract_word(file_path, max_chars)
        elif ext in ('.pptx', '.ppt'):
            return _OfficeReader._extract_ppt(file_path, max_chars)
        else:
            raise ValueError(f"Unsupported file type: {ext}")

    @staticmethod
    def _extract_pdf(file_path: str, max_chars: int) -> str:
        """Extract text from PDF file."""
        try:
            from PyPDF2 import PdfReader
        except ImportError:
            raise ImportError(
                "PyPDF2 is required for PDF extraction. "
                "Install with: pip install PyPDF2"
            )

        try:
            reader = PdfReader(file_path)
            text_parts = []
            total_chars = 0

            for page in reader.pages:
                page_text = page.extract_text() or ""
                if total_chars + len(page_text) > max_chars:
                    remaining = max_chars - total_chars
                    text_parts.append(page_text[:remaining])
                    break
                text_parts.append(page_text)
                total_chars += len(page_text)

            return "\n\n".join(text_parts)
        except Exception as e:
            logger.error(f"Failed to extract PDF text: {e}")
            raise

    @staticmethod
    def _extract_word(file_path: str, max_chars: int) -> str:
        """Extract text from Word document."""
        try:
            from docx import Document
        except ImportError:
            raise ImportError(
                "python-docx is required for Word extraction. "
                "Install with: pip install python-docx"
            )

        try:
            doc = Document(file_path)
            text_parts = []
            total_chars = 0

            for para in doc.paragraphs:
                para_text = para.text.strip()
                if not para_text:
                    continue
                if total_chars + len(para_text) > max_chars:
                    remaining = max_chars - total_chars
                    text_parts.append(para_text[:remaining])
                    break
                text_parts.append(para_text)
                total_chars += len(para_text)

            # Also extract text from tables
            for table in doc.tables:
                if total_chars >= max_chars:
                    break
                for row in table.rows:
                    row_text = " | ".join(cell.text.strip() for cell in row.cells)
                    if total_chars + len(row_text) > max_chars:
                        break
                    text_parts.append(row_text)
                    total_chars += len(row_text)

            return "\n\n".join(text_parts)
        except Exception as e:
            logger.error(f"Failed to extract Word text: {e}")
            raise

    @staticmethod
    def _extract_ppt(file_path: str, max_chars: int) -> str:
        """Extract text from PowerPoint presentation."""
        try:
            from pptx import Presentation
        except ImportError:
            raise ImportError(
                "python-pptx is required for PowerPoint extraction. "
                "Install with: pip install python-pptx"
            )

        try:
            prs = Presentation(file_path)
            text_parts = []
            total_chars = 0

            for slide_num, slide in enumerate(prs.slides, 1):
                slide_texts = [f"--- Slide {slide_num} ---"]

                for shape in slide.shapes:
                    if not shape.has_text_frame:
                        continue
                    for paragraph in shape.text_frame.paragraphs:
                        para_text = paragraph.text.strip()
                        if para_text:
                            slide_texts.append(para_text)

                slide_content = "\n".join(slide_texts)
                if total_chars + len(slide_content) > max_chars:
                    remaining = max_chars - total_chars
                    text_parts.append(slide_content[:remaining])
                    break
                text_parts.append(slide_content)
                total_chars += len(slide_content)

            return "\n\n".join(text_parts)
        except Exception as e:
            logger.error(f"Failed to extract PowerPoint text: {e}")
            raise
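
As a rough end-to-end sketch (not part of the diff; the model name, file path,
GitHub URL, and credential are assumptions), the new module can be exercised
like this:

    import os
    from skillnet_ai.creator import SkillCreator

    os.environ.setdefault("API_KEY", "<your-key>")  # hypothetical credential
    creator = SkillCreator(model="gpt-4o")

    # From a GitHub repository (hypothetical URL; GITHUB_TOKEN raises rate limits)
    skill_dirs = creator.create_from_github("https://github.com/octocat/hello-world")

    # From an Office document (hypothetical path; PDF support needs PyPDF2)
    skill_dirs += creator.create_from_office("./docs/handbook.pdf")

    print(skill_dirs)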